From 43a7739c35aee0aa27040223b01b58385ff901a4 Mon Sep 17 00:00:00 2001 From: caoqianming Date: Fri, 28 Nov 2025 10:45:17 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20parse=5Ffile=E9=81=BF=E5=85=8D=E5=86=85?= =?UTF-8?q?=E5=AD=98=E6=B3=84=E6=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cp_app.py | 79 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/cp_app.py b/cp_app.py index 2935d6f..ee6d658 100644 --- a/cp_app.py +++ b/cp_app.py @@ -411,55 +411,62 @@ def ask(input:str, p_name:str, stream=False): def parse_file(file_content, file_type): try: if file_type == "pdf": - # 将文件内容转换为字节流 pdf_bytes = file_content.read() pdf_stream = io.BytesIO(pdf_bytes) - doc = fitz.open(stream=pdf_stream, filetype="pdf") - text_content = "" - - # 首先尝试直接提取文本 - for page_num in range(len(doc)): - page = doc[page_num] - text_content += page.get_text() + "\n" - - t_plain = text_content.strip() - doc.close() - if t_plain: - return t_plain, None - else: - return None, "无法直接提取文本,请使用OCR处理" + doc = None + try: + doc = fitz.open(stream=pdf_stream, filetype="pdf") + text_content = "" + + for page_num in range(len(doc)): + page = doc[page_num] + text_content += page.get_text() + "\n" + + t_plain = text_content.strip() + if t_plain: + return t_plain, None + else: + return None, "无法直接提取文本,请使用 OCR 处理" + finally: + if doc: + doc.close() + pdf_stream.close() # 显式关闭流 elif file_type == "docx": - # 将文件内容转换为字节流 doc_stream = io.BytesIO(file_content.read()) - doc = Document(doc_stream) - - # 提取所有段落的文本 - text_content = "" - for paragraph in doc.paragraphs: - text_content += paragraph.text + "\n" - - # 提取表格中的文本 - for table in doc.tables: - for row in table.rows: - for cell in row.cells: - text_content += cell.text + " " - text_content += "\n" - return text_content, None + try: + doc = Document(doc_stream) + text_content = "" + + for paragraph in doc.paragraphs: + text_content += paragraph.text + "\n" + + for table in doc.tables: + for row in table.rows: + for cell in row.cells: + text_content += cell.text + " " + text_content += "\n" + return text_content, None + finally: + doc_stream.close() # 确保流被关闭 elif file_type == "doc": file_name = f'{uuid.uuid4()}.doc' file_path = os.path.join(CURRENT_DIR, file_name) - file_content.save(file_path) - completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True) - os.remove(file_path) - if completed.returncode != 0: - return None, completed.stderr - return completed.stdout, None + try: + file_content.save(file_path) + completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True) + if completed.returncode != 0: + return None, completed.stderr + return completed.stdout, None + finally: + if os.path.exists(file_path): + os.remove(file_path) # 确保临时文件被删除 elif file_type == "txt": text_content = file_content.read().decode("utf-8") return text_content, None + return None, "不支持的文件类型" except Exception as e: return None, f"文件解析错误: {str(e)}"