# carbo_server/apps/carbon/service.py

import io
from docx import Document
import fitz
import subprocess
from rest_framework.exceptions import ParseError
from simhash import Simhash
def parse_file(file_path: str) -> str:
    """Extract plain text from a PDF, DOCX, DOC or TXT file.

    The format is chosen from the text after the last ``.`` in *file_path*,
    compared case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. For PDF input the text is stripped of leading and
        trailing whitespace; the other formats are returned as extracted.

    Raises:
        ParseError: If the extension is not one of pdf/docx/doc/txt, if a PDF
            yields no extractable text, if ``catdoc`` exits with a non-zero
            status, or if any other exception occurs while reading. Every
            failure is reported with the "文件解析错误: " prefix.
    """
    file_type = file_path.split('.')[-1].lower()
    try:
        if file_type == "pdf":
            with open(file_path, "rb") as f:
                pdf_stream = io.BytesIO(f.read())
            doc = fitz.open(stream=pdf_stream, filetype="pdf")
            try:
                text_content = "\n".join(page.get_text() for page in doc)
            finally:
                doc.close()
            t_plain = text_content.strip()
            if not t_plain:
                # Raise (not return) so callers never receive an exception
                # object in place of the extracted text.
                raise ParseError("无法直接提取文本，请使用 OCR 处理")
            return t_plain

        if file_type == "docx":
            with open(file_path, "rb") as f:
                doc = Document(io.BytesIO(f.read()))
            # Paragraphs first; whitespace-only paragraphs are skipped.
            parts = [
                paragraph.text
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            ]
            # Then tables: one output line per row, cells joined by a space.
            for table in doc.tables:
                for row in table.rows:
                    row_text = " ".join(cell.text.strip() for cell in row.cells)
                    if row_text:  # skip rows that produced no text at all
                        parts.append(row_text)
            return "\n".join(parts)

        if file_type == "doc":
            # Legacy .doc files are converted by the external `catdoc` tool.
            completed = subprocess.run(
                ['catdoc', file_path], capture_output=True, text=True
            )
            if completed.returncode != 0:
                raise ParseError(completed.stderr)
            return completed.stdout

        if file_type == "txt":
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()

        raise ParseError("不支持的文件类型")
    except Exception as e:
        # Single error boundary: every failure (including the ParseErrors
        # raised above) surfaces as a ParseError with a common prefix.
        raise ParseError(f"文件解析错误: {str(e)}") from e


def get_fingerprint(text):
    """Return the ``value`` of the ``Simhash`` object built from *text*."""
    fingerprint = Simhash(text)
    return fingerprint.value

MASK_64 = (1 << 64) - 1

def hamming_distance(a_u, b_s):
    return ((a_u ^ (b_s & MASK_64)) & MASK_64).bit_count()


def split_simhash(fp_int: int):
    """Split the low 64 bits of *fp_int* into four 16-bit segments.

    Returns:
        A 4-tuple of integers in the range 0..0xffff, ordered from the most
        significant segment (bits 48-63) to the least significant (bits 0-15).
    """
    segment_mask = 0xffff
    return tuple(
        (fp_int >> shift) & segment_mask
        for shift in (48, 32, 16, 0)
    )