feat: 采用catdoc支持doc文件

This commit is contained in:
caoqianming 2025-11-07 11:07:50 +08:00
parent 61b3b1f3e5
commit 72dc048f4f
1 changed files with 39 additions and 20 deletions

View File

@ -9,8 +9,15 @@ import os
import re
from flask_jwt_extended import JWTManager, create_access_token, jwt_required, get_jwt_identity
from datetime import timedelta
import uuid
import subprocess
class ParseError(Exception):
    """Error carrying a user-facing message in its ``msg`` attribute.

    Args:
        msg: Human-readable description of the failure. It is stored on
            ``self.msg`` and also forwarded to ``Exception`` so that
            ``str(error)`` and tracebacks show the same text, including
            when the default message is used.
    """

    def __init__(self, msg="请求错误"):
        # Forward the message to the base class; without this,
        # ``str(ParseError())`` is empty when the default is used.
        super().__init__(msg)
        self.msg = msg
app = Flask(__name__, static_folder='dist/assets', static_url_path='/assets')
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
CORS(app)
VUE_DIST_DIR = os.path.join(os.path.dirname(__file__), 'dist')
app.config.update(
@ -45,6 +52,11 @@ def get_system_info():
def get_s():
return get_standard()
@app.route("/api/check_token/", methods=["GET"])
@jwt_required()
def check_token():
    """Respond with an empty JSON body and HTTP 200.

    The route is wrapped in ``jwt_required()``, so the handler body only
    runs once that decorator has let the request through.
    """
    empty_body = jsonify()
    return empty_body, 200
@app.route('/')
def index():
    """Serve ``index.html`` from the ``VUE_DIST_DIR`` directory."""
    entry_page = 'index.html'
    return send_from_directory(VUE_DIST_DIR, entry_page)
@ -187,7 +199,10 @@ def cal():
e_content = content
e_err_msg = err_msg
if content:
res = ask(f'以下内容为用户报告: {content}', "tec")
try:
res = ask(f'以下内容为用户报告: {content}', "tec")
except ParseError as e:
return jsonify({"err_msg": e.msg}), 400
if res == "":
for item in data:
if item["firstLevel"] == "二、技术路径35 分)":
@ -273,10 +288,12 @@ def ask(input:str, p_name:str, stream=False):
"chat_template_kwargs": {"enable_thinking": False}
}
response = requests.post(LLM_URL, headers=HEADERS, json=payload, stream=stream, timeout=(60, 240))
print(response.json())
if not stream:
if response.json().get("detail") == "Internal server error":
raise ParseError("模型处理错误超过最大token限制")
return response.json()["choices"][0]["message"]["content"]
def parse_file(file_content, file_type):
try:
if file_type == "pdf":
@ -292,18 +309,10 @@ def parse_file(file_content, file_type):
text_content += page.get_text() + "\n"
t_plain = text_content.strip()
doc.close()
if t_plain:
doc.close()
return t_plain, None
else:
# 直接转发字节流
# resp = requests.post(OCR_URL,
# files={"pdf": (file_content.filename,
# pdf_stream,
# "application/pdf")},
# timeout=120) # 大文件酌情加长
# resp.raise_for_status()
# return resp.json()["full_text"], None
return None, "无法直接提取文本请使用OCR处理"
elif file_type == "docx":
@ -324,7 +333,16 @@ def parse_file(file_content, file_type):
text_content += "\n"
return text_content, None
# 如果需要支持其他文件类型,可以在这里添加处理逻辑
elif file_type == "doc":
file_name = f'{uuid.uuid4()}.doc'
file_path = os.path.join(CURRENT_DIR, file_name)
file_content.save(file_path)
completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True)
os.remove(file_path)
if completed.returncode != 0:
return None, completed.stderr
return completed.stdout, None
elif file_type == "txt":
text_content = file_content.read().decode("utf-8")
return text_content, None
@ -332,6 +350,7 @@ def parse_file(file_content, file_type):
except Exception as e:
return None, f"文件解析错误: {str(e)}"
if __name__ == "__main__":
    # The OCR engine warm-up call (get_ocr_engine) is intentionally disabled.
    # NOTE(review): debug=True turns on Flask's debug mode; confirm this
    # entry point is only used for local development.
    run_options = {"debug": True, "port": 3401}
    app.run(**run_options)