From 43a7739c35aee0aa27040223b01b58385ff901a4 Mon Sep 17 00:00:00 2001
From: caoqianming <caoqianming@foxmail.com>
Date: Fri, 28 Nov 2025 10:45:17 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20parse=5Ffile=E9=81=BF=E5=85=8D=E5=86=85?=
 =?UTF-8?q?=E5=AD=98=E6=B3=84=E6=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cp_app.py | 79 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 43 insertions(+), 36 deletions(-)

diff --git a/cp_app.py b/cp_app.py
index 2935d6f..ee6d658 100644
--- a/cp_app.py
+++ b/cp_app.py
@@ -411,55 +411,62 @@ def ask(input:str, p_name:str, stream=False):
 def parse_file(file_content, file_type):
     try:
         if file_type == "pdf":
-            # 将文件内容转换为字节流
             pdf_bytes = file_content.read()
             pdf_stream = io.BytesIO(pdf_bytes)
-            doc = fitz.open(stream=pdf_stream, filetype="pdf")
-            text_content = ""
-
-            # 首先尝试直接提取文本
-            for page_num in range(len(doc)):
-                page = doc[page_num]
-                text_content += page.get_text() + "\n"
-
-            t_plain = text_content.strip()
-            doc.close()
-            if t_plain:
-                return t_plain, None
-            else:
-                return None, "无法直接提取文本，请使用OCR处理"
+            doc = None
+            try:
+                doc = fitz.open(stream=pdf_stream, filetype="pdf")
+                text_content = ""
+                # Try direct text extraction first, appending a newline after each page.
+                for page_num in range(len(doc)):
+                    page = doc[page_num]
+                    text_content += page.get_text() + "\n"
+                # Whitespace-only output is treated as "no extractable text" below.
+                t_plain = text_content.strip()
+                if t_plain:
+                    return t_plain, None
+                else:
+                    return None, "无法直接提取文本，请使用 OCR 处理"
+            finally:
+                if doc is not None:  # not `if doc:` -- truthiness uses len(doc), so a zero-page document would never be closed
+                    doc.close()
+                pdf_stream.close()  # explicitly release the in-memory stream
 
         elif file_type == "docx":
-            # 将文件内容转换为字节流
             doc_stream = io.BytesIO(file_content.read())
-            doc = Document(doc_stream)
-
-            # 提取所有段落的文本
-            text_content = ""
-            for paragraph in doc.paragraphs:
-                text_content += paragraph.text + "\n"
-
-            # 提取表格中的文本
-            for table in doc.tables:
-                for row in table.rows:
-                    for cell in row.cells:
-                        text_content += cell.text + " "
-                    text_content += "\n"
-            return text_content, None
+            try:
+                doc = Document(doc_stream)
+                text_content = ""
+                # Collect the text of every paragraph, one paragraph per line.
+                for paragraph in doc.paragraphs:
+                    text_content += paragraph.text + "\n"
+                # Append table text: a space after each cell, a newline after each row.
+                for table in doc.tables:
+                    for row in table.rows:
+                        for cell in row.cells:
+                            text_content += cell.text + " "
+                        text_content += "\n"
+                return text_content, None
+            finally:
+                doc_stream.close()  # make sure the stream is closed
 
         elif file_type == "doc":
             file_name = f'{uuid.uuid4()}.doc'
             file_path = os.path.join(CURRENT_DIR, file_name)
-            file_content.save(file_path)
-            completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True)
-            os.remove(file_path)
-            if completed.returncode != 0:
-                return None, completed.stderr
-            return completed.stdout, None
+            try:
+                file_content.save(file_path)  # catdoc is given a path, so the upload is written to disk first
+                completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True)
+                if completed.returncode != 0:
+                    return None, completed.stderr  # non-zero exit: report catdoc's stderr as the error
+                return completed.stdout, None
+            finally:
+                if os.path.exists(file_path):
+                    os.remove(file_path)  # make sure the temporary file is deleted
 
         elif file_type == "txt":
             text_content = file_content.read().decode("utf-8")
             return text_content, None
+
         return None, "不支持的文件类型"
     except Exception as e:
         return None, f"文件解析错误: {str(e)}"