74 lines
2.5 KiB
Python
74 lines
2.5 KiB
Python
import io
|
|
from docx import Document
|
|
import fitz
|
|
import subprocess
|
|
from rest_framework.exceptions import ParseError
|
|
from simhash import Simhash
|
|
def _extract_pdf_text(file_path: str) -> str:
    """Return the embedded text of a PDF, with pages joined by newlines.

    Raises:
        ParseError: if no page yields any text, in which case the caller
            is told to run OCR instead.
    """
    with open(file_path, "rb") as f:
        pdf_stream = io.BytesIO(f.read())
    doc = fitz.open(stream=pdf_stream, filetype="pdf")
    try:
        t_plain = "\n".join(page.get_text() for page in doc).strip()
    finally:
        doc.close()
    if not t_plain:
        # Raise (not return) the error so callers never mistake an
        # exception object for extracted text.
        raise ParseError("无法直接提取文本,请使用 OCR 处理")
    return t_plain


def _extract_docx_text(file_path: str) -> str:
    """Return paragraph text followed by table rows of a .docx file."""
    with open(file_path, "rb") as f:
        doc = Document(io.BytesIO(f.read()))

    # Paragraphs first; whitespace-only paragraphs are skipped.
    parts = [
        paragraph.text
        for paragraph in doc.paragraphs
        if paragraph.text.strip()
    ]

    # Then tables: one output line per row, cells separated by a space.
    for table in doc.tables:
        for row in table.rows:
            row_text = " ".join(cell.text.strip() for cell in row.cells)
            # A row of empty cells joins to spaces only, so test the
            # stripped text to actually skip empty rows.
            if row_text.strip():
                parts.append(row_text)

    return "\n".join(parts)


def _extract_doc_text(file_path: str) -> str:
    """Return the stdout of the external ``catdoc`` tool for a .doc file.

    Raises:
        ParseError: carrying catdoc's stderr when it exits non-zero.
    """
    completed = subprocess.run(
        ['catdoc', file_path], capture_output=True, text=True
    )
    if completed.returncode != 0:
        raise ParseError(completed.stderr)
    return completed.stdout


def _extract_txt_text(file_path: str) -> str:
    """Return the full content of a text file decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()


# Lower-cased file extension -> extractor for that format.
_EXTRACTORS = {
    "pdf": _extract_pdf_text,
    "docx": _extract_docx_text,
    "doc": _extract_doc_text,
    "txt": _extract_txt_text,
}


def parse_file(file_path: str) -> str:
    """Extract plain text from a pdf, docx, doc or txt file.

    The format is chosen from the text after the last ``.`` in
    ``file_path`` (case-insensitive).

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text.

    Raises:
        ParseError: for an unsupported extension, a PDF without
            extractable text, a failing ``catdoc`` run, or any other
            error while reading the file. Every failure is reported with
            the ``文件解析错误: `` prefix and the original exception is
            chained as the cause.
    """
    file_type = file_path.rsplit('.', 1)[-1].lower()

    try:
        extractor = _EXTRACTORS.get(file_type)
        if extractor is None:
            raise ParseError("不支持的文件类型")
        return extractor(file_path)
    except Exception as e:
        # Single error surface for callers: everything becomes ParseError.
        raise ParseError(f"文件解析错误: {str(e)}") from e
|
|
|
|
|
|
def get_fingerprint(text):
    """Return the integer SimHash fingerprint of ``text``.

    The value is whatever ``Simhash(text).value`` yields with the
    library's default settings.
    """
    fingerprint = Simhash(text)
    return fingerprint.value
|
|
|
|
MASK_64 = (1 << 64) - 1
|
|
|
|
def hamming_distance(a_u, b_s):
|
|
return ((a_u ^ (b_s & MASK_64)) & MASK_64).bit_count()
|
|
|
|
|
|
def split_simhash(fp_int: int):
    """Split a 64-bit fingerprint into four 16-bit segments.

    Returns a 4-tuple ordered from the most significant segment
    (bits 63-48) down to the least significant one (bits 15-0).
    """
    segment_mask = 0xffff
    return tuple(
        (fp_int >> shift) & segment_mask for shift in (48, 32, 16, 0)
    )