fix: parse_file避免内存泄漏

2025-11-28 10:45:17 +08:00 · 2025-11-28 10:45:17 +08:00 · 43a7739c35
parent 1bcf01dd47
commit 43a7739c35
1 changed files with 43 additions and 36 deletions
--- a/cp_app.py
+++ b/cp_app.py
@@ -411,55 +411,62 @@ def ask(input:str, p_name:str, stream=False):
 def parse_file(file_content, file_type):
     """Extract plain text from an uploaded file.

     Every resource opened while parsing (in-memory streams, the PDF
     document handle, the temporary ``.doc`` file) is released in a
     ``finally`` clause, so nothing is left open when parsing fails midway.

     Args:
         file_content: Upload object. ``read()`` is called on it for
             "pdf", "docx" and "txt"; ``save(path)`` is called on it for
             "doc". NOTE(review): presumably a web-framework upload object;
             confirm against the caller.
         file_type: File kind: "pdf", "docx", "doc" or "txt".

     Returns:
         A ``(text, error)`` tuple. On success ``error`` is ``None``. On
         failure ``text`` is ``None`` and ``error`` is a message string.
         No exception escapes: any ``Exception`` raised while parsing is
         converted into the error message.
     """
     try:
         if file_type == "pdf":
             # Wrap the uploaded bytes in an in-memory stream for the PDF reader.
             pdf_stream = io.BytesIO(file_content.read())
             doc = None
             try:
                 doc = fitz.open(stream=pdf_stream, filetype="pdf")
                 # Only direct text extraction is attempted here; a PDF with
                 # no extractable text is reported so the caller can use OCR.
                 t_plain = "".join(page.get_text() + "\n" for page in doc).strip()
                 if t_plain:
                     return t_plain, None
                 return None, "无法直接提取文本，请使用 OCR 处理"
             finally:
                 # Compare against None instead of truthiness: if the document
                 # object's truth value follows its page count, an empty
                 # document would otherwise never be closed.
                 if doc is not None:
                     doc.close()
                 pdf_stream.close()  # release the byte buffer explicitly
         elif file_type == "docx":
             # Wrap the uploaded bytes in an in-memory stream for the DOCX reader.
             doc_stream = io.BytesIO(file_content.read())
             try:
                 document = Document(doc_stream)
                 # Paragraph text first, one paragraph per line.
                 parts = [paragraph.text + "\n" for paragraph in document.paragraphs]
                 # Then table text: cells separated by spaces, one row per line.
                 for table in document.tables:
                     for row in table.rows:
                         parts.extend(cell.text + " " for cell in row.cells)
                         parts.append("\n")
                 return "".join(parts), None
             finally:
                 doc_stream.close()  # make sure the stream is always closed
         elif file_type == "doc":
             # catdoc reads from disk, so the upload is saved under a unique
             # temporary name first.
             file_path = os.path.join(CURRENT_DIR, f"{uuid.uuid4()}.doc")
             try:
                 file_content.save(file_path)
                 completed = subprocess.run(
                     ["catdoc", file_path], capture_output=True, text=True
                 )
                 if completed.returncode != 0:
                     return None, completed.stderr
                 return completed.stdout, None
             finally:
                 # Remove the temporary file on every path, including when
                 # saving or running catdoc raised.
                 if os.path.exists(file_path):
                     os.remove(file_path)
         elif file_type == "txt":
             return file_content.read().decode("utf-8"), None
         return None, "不支持的文件类型"
     except Exception as e:
         # Boundary handler: callers receive an error message, not an exception.
         return None, f"文件解析错误: {str(e)}"