import io
import subprocess

import fitz
from docx import Document
from rest_framework.exceptions import ParseError
from simhash import Simhash


def _parse_pdf(file_path: str) -> str:
    """Return the stripped text of every page of a PDF, joined by newlines."""
    with open(file_path, "rb") as f:
        pdf_stream = io.BytesIO(f.read())

    doc = fitz.open(stream=pdf_stream, filetype="pdf")
    try:
        text_content = "\n".join(page.get_text() for page in doc)
    finally:
        doc.close()

    t_plain = text_content.strip()
    if not t_plain:
        # No text could be extracted (the message asks the user to run OCR).
        # This must be raised: the original returned the exception object,
        # which callers would have treated as the extracted text.
        raise ParseError("无法直接提取文本,请使用 OCR 处理")
    return t_plain


def _parse_docx(file_path: str) -> str:
    """Return paragraph text followed by table rows of a .docx file."""
    with open(file_path, "rb") as f:
        doc = Document(io.BytesIO(f.read()))

    # Paragraphs first; blank paragraphs are skipped.
    parts = [p.text for p in doc.paragraphs if p.text.strip()]

    # Then tables: one output line per row, cells separated by a space.
    for table in doc.tables:
        for row in table.rows:
            row_text = " ".join(cell.text.strip() for cell in row.cells)
            # A row of several empty cells joins to spaces only, so test the
            # stripped text to really skip empty rows.
            if row_text.strip():
                parts.append(row_text)

    return "\n".join(parts)


def _parse_doc(file_path: str) -> str:
    """Return the stdout of the external ``catdoc`` tool run on a .doc file."""
    completed = subprocess.run(
        ["catdoc", file_path], capture_output=True, text=True
    )
    if completed.returncode != 0:
        raise ParseError(completed.stderr)
    return completed.stdout


def _parse_txt(file_path: str) -> str:
    """Return the content of a text file decoded as UTF-8."""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def parse_file(file_path: str) -> str:
    """Extract plain text from a file, dispatching on its extension.

    The extension is whatever follows the last ``.`` in ``file_path``,
    compared case-insensitively. Supported values are ``pdf``, ``docx``,
    ``doc`` (via the external ``catdoc`` command) and ``txt`` (UTF-8).

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text.

    Raises:
        ParseError: For an unsupported extension, a PDF with no extractable
            text, a failing ``catdoc`` run, or any other error while reading
            or parsing. Every failure, including the ones raised by this
            module itself, is reported with the ``文件解析错误: `` prefix.
    """
    file_type = file_path.split(".")[-1].lower()
    try:
        if file_type == "pdf":
            return _parse_pdf(file_path)
        if file_type == "docx":
            return _parse_docx(file_path)
        if file_type == "doc":
            return _parse_doc(file_path)
        if file_type == "txt":
            return _parse_txt(file_path)
        raise ParseError("不支持的文件类型")
    except Exception as e:
        # Single boundary: callers only ever see ParseError. Inner ParseErrors
        # are wrapped too, which keeps the existing message format.
        raise ParseError(f"文件解析错误: {str(e)}") from e


def get_fingerprint(text):
    """Return the ``value`` of the Simhash computed from ``text``."""
    return Simhash(text).value


# Mask keeping the low 64 bits of an integer.
MASK_64 = (1 << 64) - 1


def hamming_distance(a_u: int, b_s: int) -> int:
    """Return the number of differing bits between two 64-bit fingerprints.

    ``b_s`` is masked to 64 bits before the comparison, so a negative value
    is interpreted through its two's-complement low 64 bits.
    NOTE(review): the names suggest ``a_u`` is unsigned and ``b_s`` is a
    signed (e.g. database-stored) value -- confirm against callers.
    """
    return ((a_u ^ (b_s & MASK_64)) & MASK_64).bit_count()


def split_simhash(fp_int: int):
    """Split a fingerprint into four 16-bit chunks, highest bits first.

    Returns:
        A 4-tuple holding bits 48-63, 32-47, 16-31 and 0-15 of ``fp_int``.
    """
    return (
        (fp_int >> 48) & 0xFFFF,
        (fp_int >> 32) & 0xFFFF,
        (fp_int >> 16) & 0xFFFF,
        fp_int & 0xFFFF,
    )