From 72dc048f4fe409d66917da74db6a6712d0389244 Mon Sep 17 00:00:00 2001 From: caoqianming Date: Fri, 7 Nov 2025 11:07:50 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E9=87=87=E7=94=A8catdoc=E6=94=AF?= =?UTF-8?q?=E6=8C=81doc=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cp_app.py | 59 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/cp_app.py b/cp_app.py index 758b140..39eb42d 100644 --- a/cp_app.py +++ b/cp_app.py @@ -9,8 +9,15 @@ import os import re from flask_jwt_extended import JWTManager, create_access_token, jwt_required, get_jwt_identity from datetime import timedelta +import uuid +import subprocess + +class ParseError(Exception): + def __init__(self, msg="请求错误"): + self.msg = msg app = Flask(__name__, static_folder='dist/assets', static_url_path='/assets') +CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) CORS(app) VUE_DIST_DIR = os.path.join(os.path.dirname(__file__), 'dist') app.config.update( @@ -45,6 +52,11 @@ def get_system_info(): def get_s(): return get_standard() +@app.route("/api/check_token/", methods=["GET"]) +@jwt_required() +def check_token(): + return jsonify(), 200 + @app.route('/') def index(): return send_from_directory(VUE_DIST_DIR, 'index.html') @@ -187,7 +199,10 @@ def cal(): e_content = content e_err_msg = err_msg if content: - res = ask(f'以下内容为用户报告: {content}', "tec") + try: + res = ask(f'以下内容为用户报告: {content}', "tec") + except ParseError as e: + return jsonify({"err_msg": e.msg}), 400 if res == "是": for item in data: if item["firstLevel"] == "二、技术路径(35 分)": @@ -273,49 +288,43 @@ def ask(input:str, p_name:str, stream=False): "chat_template_kwargs": {"enable_thinking": False} } response = requests.post(LLM_URL, headers=HEADERS, json=payload, stream=stream, timeout=(60, 240)) - print(response.json()) if not stream: + if response.json().get("detail") == "Internal server error": + raise ParseError("模型处理错误超过最大token限制") return response.json()["choices"][0]["message"]["content"] - + + def parse_file(file_content, file_type): try: if file_type == "pdf": # 将文件内容转换为字节流 - pdf_bytes = file_content.read() + pdf_bytes = file_content.read() pdf_stream = io.BytesIO(pdf_bytes) doc = fitz.open(stream=pdf_stream, filetype="pdf") text_content = "" - + # 首先尝试直接提取文本 for page_num in range(len(doc)): page = doc[page_num] text_content += page.get_text() + "\n" - + t_plain = text_content.strip() + doc.close() if t_plain: - doc.close() return t_plain, None else: - # 直接转发字节流 - # resp = requests.post(OCR_URL, - # files={"pdf": (file_content.filename, - # pdf_stream, - # "application/pdf")}, - # timeout=120) # 大文件酌情加长 - # resp.raise_for_status() - # return resp.json()["full_text"], None return None, "无法直接提取文本,请使用OCR处理" - + elif file_type == "docx": # 将文件内容转换为字节流 doc_stream = io.BytesIO(file_content.read()) doc = Document(doc_stream) - + # 提取所有段落的文本 text_content = "" for paragraph in doc.paragraphs: text_content += paragraph.text + "\n" - + # 提取表格中的文本 for table in doc.tables: for row in table.rows: @@ -323,8 +332,17 @@ def parse_file(file_content, file_type): text_content += cell.text + " " text_content += "\n" return text_content, None - - # 如果需要支持其他文件类型,可以在这里添加处理逻辑 + + elif file_type == "doc": + file_name = f'{uuid.uuid4()}.doc' + file_path = os.path.join(CURRENT_DIR, file_name) + file_content.save(file_path) + completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True) + os.remove(file_path) + if completed.returncode != 0: + return None, completed.stderr + return completed.stdout, None + elif file_type == "txt": text_content = file_content.read().decode("utf-8") return text_content, None @@ -332,6 +350,7 @@ def parse_file(file_content, file_type): except Exception as e: return None, f"文件解析错误: {str(e)}" + if __name__ == "__main__": # get_ocr_engine() app.run(debug=True, port=3401)