From 72dc048f4fe409d66917da74db6a6712d0389244 Mon Sep 17 00:00:00 2001
From: caoqianming <caoqianming@foxmail.com>
Date: Fri, 7 Nov 2025 11:07:50 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E9=87=87=E7=94=A8catdoc=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81doc=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cp_app.py | 59 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/cp_app.py b/cp_app.py
index 758b140..39eb42d 100644
--- a/cp_app.py
+++ b/cp_app.py
@@ -9,8 +9,22 @@ import os
 import re
 from flask_jwt_extended import JWTManager, create_access_token, jwt_required, get_jwt_identity
 from datetime import timedelta
+import uuid
+import subprocess
+
+class ParseError(Exception):
+    """Error carrying a client-facing message.
+
+    ``msg`` is the text a handler can return to the caller as ``err_msg``.
+    """
+
+    def __init__(self, msg="请求错误"):
+        # Pass msg up so str(e), logs and tracebacks are not empty.
+        super().__init__(msg)
+        self.msg = msg
 
 app = Flask(__name__, static_folder='dist/assets', static_url_path='/assets')
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 CORS(app)
 VUE_DIST_DIR = os.path.join(os.path.dirname(__file__), 'dist')
 app.config.update(
@@ -45,6 +52,11 @@ def get_system_info():
 def get_s():
     return get_standard()
 
+@app.route("/api/check_token/", methods=["GET"])
+@jwt_required()
+def check_token():
+    return jsonify(), 200  # empty 200 body; @jwt_required() has already vetted the token by this point
+
 @app.route('/')
 def index():
     return send_from_directory(VUE_DIST_DIR, 'index.html')
@@ -187,7 +199,10 @@ def cal():
         e_content = content
         e_err_msg = err_msg
         if content:
-            res = ask(f'以下内容为用户报告:  {content}', "tec")
+            try:
+                res = ask(f'以下内容为用户报告:  {content}', "tec")
+            except ParseError as e:
+                return jsonify({"err_msg": e.msg}), 400
             if res == "是":
                 for item in data:
                     if item["firstLevel"] == "二、技术路径（35 分）":
@@ -273,49 +288,43 @@ def ask(input:str, p_name:str, stream=False):
                 "chat_template_kwargs": {"enable_thinking": False}
                 }
     response = requests.post(LLM_URL, headers=HEADERS, json=payload, stream=stream, timeout=(60, 240))
-    print(response.json())
     if not stream:
+        if response.json().get("detail") == "Internal server error":
+            raise ParseError("模型处理错误超过最大token限制")
         return response.json()["choices"][0]["message"]["content"]
-    
+
+
 def parse_file(file_content, file_type):
     try:
         if file_type == "pdf":
             # 将文件内容转换为字节流
-            pdf_bytes = file_content.read()  
+            pdf_bytes = file_content.read()
             pdf_stream = io.BytesIO(pdf_bytes)
             doc = fitz.open(stream=pdf_stream, filetype="pdf")
             text_content = ""
-            
+
             # 首先尝试直接提取文本
             for page_num in range(len(doc)):
                 page = doc[page_num]
                 text_content += page.get_text() + "\n"
-            
+
             t_plain = text_content.strip()
+            doc.close()
             if t_plain:
-                doc.close()
                 return t_plain, None
             else:
-                # 直接转发字节流
-                # resp = requests.post(OCR_URL,
-                #                     files={"pdf": (file_content.filename,
-                #                                     pdf_stream,
-                #                                     "application/pdf")},
-                #                     timeout=120)          # 大文件酌情加长
-                # resp.raise_for_status()
-                # return resp.json()["full_text"], None
                 return None, "无法直接提取文本，请使用OCR处理"
-            
+
         elif file_type == "docx":
             # 将文件内容转换为字节流
             doc_stream = io.BytesIO(file_content.read())
             doc = Document(doc_stream)
-            
+
             # 提取所有段落的文本
             text_content = ""
             for paragraph in doc.paragraphs:
                 text_content += paragraph.text + "\n"
-                
+
             # 提取表格中的文本
             for table in doc.tables:
                 for row in table.rows:
@@ -323,8 +332,23 @@ def parse_file(file_content, file_type):
                         text_content += cell.text + " "
                     text_content += "\n"
             return text_content, None
-        
-        # 如果需要支持其他文件类型，可以在这里添加处理逻辑
+
+        elif file_type == "doc":
+            # Legacy .doc is converted by the external `catdoc` CLI, which is given
+            # a path on disk, so the upload is spooled to a uniquely named file first.
+            file_name = f'{uuid.uuid4()}.doc'
+            file_path = os.path.join(CURRENT_DIR, file_name)
+            try:
+                file_content.save(file_path)
+                completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True)
+            finally:
+                # Always drop the spool file, even if save() or catdoc raised.
+                if os.path.exists(file_path):
+                    os.remove(file_path)
+            if completed.returncode != 0:
+                return None, completed.stderr or f"文件解析错误: catdoc exited with code {completed.returncode}"
+            return completed.stdout, None
+
         elif file_type == "txt":
             text_content = file_content.read().decode("utf-8")
             return text_content, None
@@ -332,6 +350,7 @@ def parse_file(file_content, file_type):
     except Exception as e:
         return None, f"文件解析错误: {str(e)}"
 
+
 if __name__ == "__main__":
     # get_ocr_engine()
     app.run(debug=True, port=3401)