carbo_server/apps/carbon/service.py

74 lines
2.5 KiB
Python

import io
from docx import Document
import fitz
import subprocess
from rest_framework.exceptions import ParseError
from simhash import Simhash
def parse_file(file_path: str):
    """Extract the plain text of a document, dispatching on its extension.

    The extension is whatever follows the last ``.`` in ``file_path``,
    compared case-insensitively. Supported values are ``pdf``, ``docx``,
    ``doc`` and ``txt``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. PDF text is stripped of surrounding whitespace;
        DOCX text is the non-blank paragraphs followed by the non-blank
        table rows, one per line; DOC text is the standard output of the
        external ``catdoc`` command; TXT content is returned as read
        (decoded as UTF-8).

    Raises:
        ParseError: On any failure. This covers an unsupported extension, a
            PDF with no extractable text, a non-zero ``catdoc`` exit status,
            and any I/O or library error. Every failure is re-raised with
            the ``文件解析错误: `` prefix.
    """
    file_type = file_path.split('.')[-1].lower()
    try:
        if file_type == "pdf":
            with open(file_path, "rb") as f:
                pdf_stream = io.BytesIO(f.read())
            doc = fitz.open(stream=pdf_stream, filetype="pdf")
            try:
                text_content = "\n".join(page.get_text() for page in doc)
            finally:
                doc.close()
            t_plain = text_content.strip()
            if not t_plain:
                # Raise rather than return: returning the exception object
                # would hand it to the caller as if it were the parsed text.
                raise ParseError("无法直接提取文本,请使用 OCR 处理")
            return t_plain

        if file_type == "docx":
            with open(file_path, "rb") as f:
                doc = Document(io.BytesIO(f.read()))
            # Paragraphs first, skipping blank ones.
            parts = [
                paragraph.text
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            ]
            # Then tables, one output line per row.
            for table in doc.tables:
                for row in table.rows:
                    row_text = " ".join(cell.text.strip() for cell in row.cells)
                    # Joining empty cells still yields separator spaces, so
                    # test the stripped text to really skip empty rows.
                    if row_text.strip():
                        parts.append(row_text)
            return "\n".join(parts)

        if file_type == "doc":
            completed = subprocess.run(
                ['catdoc', file_path], capture_output=True, text=True
            )
            if completed.returncode != 0:
                raise ParseError(completed.stderr)
            return completed.stdout

        if file_type == "txt":
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()

        raise ParseError("不支持的文件类型")
    except Exception as e:
        # Single boundary: every failure surfaces as ParseError, with the
        # original exception kept as the cause for debugging.
        raise ParseError(f"文件解析错误: {str(e)}") from e
def get_fingerprint(text):
    """Return the SimHash fingerprint of ``text``.

    Builds a ``Simhash`` object from ``text`` and returns its ``value``
    attribute.
    """
    fingerprint = Simhash(text)
    return fingerprint.value
MASK_64 = (1 << 64) - 1
def hamming_distance(a_u, b_s):
return ((a_u ^ (b_s & MASK_64)) & MASK_64).bit_count()
def split_simhash(fp_int: int):
    """Split the low 64 bits of a fingerprint into four 16-bit chunks.

    Args:
        fp_int: Fingerprint as an integer. Bits above the low 64 are ignored.

    Returns:
        A 4-tuple of integers in ``0..0xffff``, ordered from the most
        significant chunk (bits 48-63) to the least significant (bits 0-15).
    """
    chunk_mask = 0xffff
    return tuple((fp_int >> shift) & chunk_mask for shift in (48, 32, 16, 0))