fix: parse_file避免内存泄漏
This commit is contained in:
parent
1bcf01dd47
commit
43a7739c35
23
cp_app.py
23
cp_app.py
|
|
@@ -411,55 +411,62 @@ def ask(input:str, p_name:str, stream=False):
|
||||||
def parse_file(file_content, file_type):
|
def parse_file(file_content, file_type):
|
||||||
try:
|
try:
|
||||||
if file_type == "pdf":
|
if file_type == "pdf":
|
||||||
# 将文件内容转换为字节流
|
|
||||||
pdf_bytes = file_content.read()
|
pdf_bytes = file_content.read()
|
||||||
pdf_stream = io.BytesIO(pdf_bytes)
|
pdf_stream = io.BytesIO(pdf_bytes)
|
||||||
|
doc = None
|
||||||
|
try:
|
||||||
doc = fitz.open(stream=pdf_stream, filetype="pdf")
|
doc = fitz.open(stream=pdf_stream, filetype="pdf")
|
||||||
text_content = ""
|
text_content = ""
|
||||||
|
|
||||||
# 首先尝试直接提取文本
|
|
||||||
for page_num in range(len(doc)):
|
for page_num in range(len(doc)):
|
||||||
page = doc[page_num]
|
page = doc[page_num]
|
||||||
text_content += page.get_text() + "\n"
|
text_content += page.get_text() + "\n"
|
||||||
|
|
||||||
t_plain = text_content.strip()
|
t_plain = text_content.strip()
|
||||||
doc.close()
|
|
||||||
if t_plain:
|
if t_plain:
|
||||||
return t_plain, None
|
return t_plain, None
|
||||||
else:
|
else:
|
||||||
return None, "无法直接提取文本,请使用 OCR 处理"
|
return None, "无法直接提取文本,请使用 OCR 处理"
|
||||||
|
finally:
|
||||||
|
if doc:
|
||||||
|
doc.close()
|
||||||
|
pdf_stream.close() # 显式关闭流
|
||||||
|
|
||||||
elif file_type == "docx":
|
elif file_type == "docx":
|
||||||
# 将文件内容转换为字节流
|
|
||||||
doc_stream = io.BytesIO(file_content.read())
|
doc_stream = io.BytesIO(file_content.read())
|
||||||
|
try:
|
||||||
doc = Document(doc_stream)
|
doc = Document(doc_stream)
|
||||||
|
|
||||||
# 提取所有段落的文本
|
|
||||||
text_content = ""
|
text_content = ""
|
||||||
|
|
||||||
for paragraph in doc.paragraphs:
|
for paragraph in doc.paragraphs:
|
||||||
text_content += paragraph.text + "\n"
|
text_content += paragraph.text + "\n"
|
||||||
|
|
||||||
# 提取表格中的文本
|
|
||||||
for table in doc.tables:
|
for table in doc.tables:
|
||||||
for row in table.rows:
|
for row in table.rows:
|
||||||
for cell in row.cells:
|
for cell in row.cells:
|
||||||
text_content += cell.text + " "
|
text_content += cell.text + " "
|
||||||
text_content += "\n"
|
text_content += "\n"
|
||||||
return text_content, None
|
return text_content, None
|
||||||
|
finally:
|
||||||
|
doc_stream.close() # 确保流被关闭
|
||||||
|
|
||||||
elif file_type == "doc":
|
elif file_type == "doc":
|
||||||
file_name = f'{uuid.uuid4()}.doc'
|
file_name = f'{uuid.uuid4()}.doc'
|
||||||
file_path = os.path.join(CURRENT_DIR, file_name)
|
file_path = os.path.join(CURRENT_DIR, file_name)
|
||||||
|
try:
|
||||||
file_content.save(file_path)
|
file_content.save(file_path)
|
||||||
completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True)
|
completed = subprocess.run(['catdoc', file_path], capture_output=True, text=True)
|
||||||
os.remove(file_path)
|
|
||||||
if completed.returncode != 0:
|
if completed.returncode != 0:
|
||||||
return None, completed.stderr
|
return None, completed.stderr
|
||||||
return completed.stdout, None
|
return completed.stdout, None
|
||||||
|
finally:
|
||||||
|
if os.path.exists(file_path):
|
||||||
|
os.remove(file_path) # 确保临时文件被删除
|
||||||
|
|
||||||
elif file_type == "txt":
|
elif file_type == "txt":
|
||||||
text_content = file_content.read().decode("utf-8")
|
text_content = file_content.read().decode("utf-8")
|
||||||
return text_content, None
|
return text_content, None
|
||||||
|
|
||||||
return None, "不支持的文件类型"
|
return None, "不支持的文件类型"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return None, f"文件解析错误: {str(e)}"
|
return None, f"文件解析错误: {str(e)}"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue