feat: 采用catdoc支持doc文件
This commit is contained in:
parent
61b3b1f3e5
commit
72dc048f4f
59
cp_app.py
59
cp_app.py
|
|
@ -9,8 +9,15 @@ import os
|
||||||
import re
|
import re
|
||||||
from flask_jwt_extended import JWTManager, create_access_token, jwt_required, get_jwt_identity
|
from flask_jwt_extended import JWTManager, create_access_token, jwt_required, get_jwt_identity
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
import uuid
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
class ParseError(Exception):
    """Error carrying a user-facing message for a request that cannot be processed.

    Args:
        msg: Human-readable description of the failure. Defaults to
            "请求错误". It is exposed on the ``msg`` attribute.
    """

    def __init__(self, msg="请求错误"):
        # Forward the message to Exception so that str(exc) and exc.args
        # carry it even when the default message is used; without this,
        # a bare ParseError() stringifies to "".
        super().__init__(msg)
        self.msg = msg
|
||||||
|
|
||||||
app = Flask(__name__, static_folder='dist/assets', static_url_path='/assets')
|
app = Flask(__name__, static_folder='dist/assets', static_url_path='/assets')
|
||||||
|
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
CORS(app)
|
CORS(app)
|
||||||
VUE_DIST_DIR = os.path.join(os.path.dirname(__file__), 'dist')
|
VUE_DIST_DIR = os.path.join(os.path.dirname(__file__), 'dist')
|
||||||
app.config.update(
|
app.config.update(
|
||||||
|
|
@ -45,6 +52,11 @@ def get_system_info():
|
||||||
def get_s():
|
def get_s():
|
||||||
return get_standard()
|
return get_standard()
|
||||||
|
|
||||||
|
@app.route("/api/check_token/", methods=["GET"])
@jwt_required()
def check_token():
    """Respond with an empty JSON body and HTTP 200.

    The view body performs no checks of its own; it is wrapped by the
    ``jwt_required`` decorator and only builds the response.
    """
    empty_payload = jsonify()
    return empty_payload, 200
|
||||||
|
|
||||||
@app.route('/')
|
@app.route('/')
|
||||||
def index():
|
def index():
|
||||||
return send_from_directory(VUE_DIST_DIR, 'index.html')
|
return send_from_directory(VUE_DIST_DIR, 'index.html')
|
||||||
|
|
@ -187,7 +199,10 @@ def cal():
|
||||||
e_content = content
|
e_content = content
|
||||||
e_err_msg = err_msg
|
e_err_msg = err_msg
|
||||||
if content:
|
if content:
|
||||||
res = ask(f'以下内容为用户报告: {content}', "tec")
|
try:
|
||||||
|
res = ask(f'以下内容为用户报告: {content}', "tec")
|
||||||
|
except ParseError as e:
|
||||||
|
return jsonify({"err_msg": e.msg}), 400
|
||||||
if res == "是":
|
if res == "是":
|
||||||
for item in data:
|
for item in data:
|
||||||
if item["firstLevel"] == "二、技术路径(35 分)":
|
if item["firstLevel"] == "二、技术路径(35 分)":
|
||||||
|
|
@ -273,49 +288,43 @@ def ask(input:str, p_name:str, stream=False):
|
||||||
"chat_template_kwargs": {"enable_thinking": False}
|
"chat_template_kwargs": {"enable_thinking": False}
|
||||||
}
|
}
|
||||||
response = requests.post(LLM_URL, headers=HEADERS, json=payload, stream=stream, timeout=(60, 240))
|
response = requests.post(LLM_URL, headers=HEADERS, json=payload, stream=stream, timeout=(60, 240))
|
||||||
print(response.json())
|
|
||||||
if not stream:
|
if not stream:
|
||||||
|
if response.json().get("detail") == "Internal server error":
|
||||||
|
raise ParseError("模型处理错误超过最大token限制")
|
||||||
return response.json()["choices"][0]["message"]["content"]
|
return response.json()["choices"][0]["message"]["content"]
|
||||||
|
|
||||||
|
|
||||||
def parse_file(file_content, file_type):
|
def parse_file(file_content, file_type):
|
||||||
try:
|
try:
|
||||||
if file_type == "pdf":
|
if file_type == "pdf":
|
||||||
# 将文件内容转换为字节流
|
# 将文件内容转换为字节流
|
||||||
pdf_bytes = file_content.read()
|
pdf_bytes = file_content.read()
|
||||||
pdf_stream = io.BytesIO(pdf_bytes)
|
pdf_stream = io.BytesIO(pdf_bytes)
|
||||||
doc = fitz.open(stream=pdf_stream, filetype="pdf")
|
doc = fitz.open(stream=pdf_stream, filetype="pdf")
|
||||||
text_content = ""
|
text_content = ""
|
||||||
|
|
||||||
# 首先尝试直接提取文本
|
# 首先尝试直接提取文本
|
||||||
for page_num in range(len(doc)):
|
for page_num in range(len(doc)):
|
||||||
page = doc[page_num]
|
page = doc[page_num]
|
||||||
text_content += page.get_text() + "\n"
|
text_content += page.get_text() + "\n"
|
||||||
|
|
||||||
t_plain = text_content.strip()
|
t_plain = text_content.strip()
|
||||||
|
doc.close()
|
||||||
if t_plain:
|
if t_plain:
|
||||||
doc.close()
|
|
||||||
return t_plain, None
|
return t_plain, None
|
||||||
else:
|
else:
|
||||||
# 直接转发字节流
|
|
||||||
# resp = requests.post(OCR_URL,
|
|
||||||
# files={"pdf": (file_content.filename,
|
|
||||||
# pdf_stream,
|
|
||||||
# "application/pdf")},
|
|
||||||
# timeout=120) # 大文件酌情加长
|
|
||||||
# resp.raise_for_status()
|
|
||||||
# return resp.json()["full_text"], None
|
|
||||||
return None, "无法直接提取文本,请使用OCR处理"
|
return None, "无法直接提取文本,请使用OCR处理"
|
||||||
|
|
||||||
elif file_type == "docx":
|
elif file_type == "docx":
|
||||||
# 将文件内容转换为字节流
|
# 将文件内容转换为字节流
|
||||||
doc_stream = io.BytesIO(file_content.read())
|
doc_stream = io.BytesIO(file_content.read())
|
||||||
doc = Document(doc_stream)
|
doc = Document(doc_stream)
|
||||||
|
|
||||||
# 提取所有段落的文本
|
# 提取所有段落的文本
|
||||||
text_content = ""
|
text_content = ""
|
||||||
for paragraph in doc.paragraphs:
|
for paragraph in doc.paragraphs:
|
||||||
text_content += paragraph.text + "\n"
|
text_content += paragraph.text + "\n"
|
||||||
|
|
||||||
# 提取表格中的文本
|
# 提取表格中的文本
|
||||||
for table in doc.tables:
|
for table in doc.tables:
|
||||||
for row in table.rows:
|
for row in table.rows:
|
||||||
|
|
@ -323,8 +332,17 @@ def parse_file(file_content, file_type):
|
||||||
text_content += cell.text + " "
|
text_content += cell.text + " "
|
||||||
text_content += "\n"
|
text_content += "\n"
|
||||||
return text_content, None
|
return text_content, None
|
||||||
|
|
||||||
# 如果需要支持其他文件类型,可以在这里添加处理逻辑
|
elif file_type == "doc":
|
||||||
|
file_name = f'{uuid.uuid4()}.doc'
|
||||||
|
file_path = os.path.join(CURRENT_DIR, file_name)
|
||||||
|
file_content.save(file_path)
|
||||||
|
completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True)
|
||||||
|
os.remove(file_path)
|
||||||
|
if completed.returncode != 0:
|
||||||
|
return None, completed.stderr
|
||||||
|
return completed.stdout, None
|
||||||
|
|
||||||
elif file_type == "txt":
|
elif file_type == "txt":
|
||||||
text_content = file_content.read().decode("utf-8")
|
text_content = file_content.read().decode("utf-8")
|
||||||
return text_content, None
|
return text_content, None
|
||||||
|
|
@ -332,6 +350,7 @@ def parse_file(file_content, file_type):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return None, f"文件解析错误: {str(e)}"
|
return None, f"文件解析错误: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# get_ocr_engine()
|
# get_ocr_engine()
|
||||||
app.run(debug=True, port=3401)
|
app.run(debug=True, port=3401)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue