初始化
This commit is contained in:
commit
03c467159c
|
@ -0,0 +1,4 @@
|
||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
|
app.log
|
||||||
|
conf.py
|
|
@ -0,0 +1,198 @@
|
||||||
|
from fastapi import FastAPI, File, UploadFile, Form
|
||||||
|
from typing import Literal
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from fastapi.exceptions import HTTPException
|
||||||
|
from PIL import Image
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
import numpy as np
|
||||||
|
import uvicorn
|
||||||
|
import re
|
||||||
|
import io
|
||||||
|
from pdf2image import convert_from_bytes
|
||||||
|
import uuid
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import paddle
|
||||||
|
import sys
|
||||||
|
from uvicorn.config import LOGGING_CONFIG
|
||||||
|
|
||||||
|
# Make this script's directory importable so the sibling `conf` module
# resolves regardless of the working directory uvicorn was launched from.
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, CUR_DIR)
import conf
|
||||||
|
|
||||||
|
# Timestamped log format for uvicorn's default formatter, plus a size-rotated
# log file written next to this script (5 MB x 5 backups).
LOGGING_CONFIG["formatters"]["default"]["fmt"] = "%(asctime)s - %(levelname)s - %(message)s"
LOGGING_CONFIG["handlers"]["file"] = {
    "class": "logging.handlers.RotatingFileHandler",
    "filename": os.path.join(CUR_DIR, "app.log"),
    "maxBytes": 5 * 1024 * 1024,  # 5 MB per file before rotation
    "backupCount": 5,
    "formatter": "default",
    "encoding": "utf-8"
}
# NOTE(review): this REPLACES uvicorn.error's handler list, so its records go
# only to the file and no longer to the console — confirm that is intended
# (append "file" to the existing list to keep console output as well).
LOGGING_CONFIG["loggers"]["uvicorn.error"]["handlers"]=["file"]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Informational only: report whether this Paddle build has CUDA support.
# The OCR engine below still requests the GPU unconditionally (use_gpu=True),
# which may fail or fall back on CPU-only builds — TODO confirm behavior.
gpu_available = paddle.device.is_compiled_with_cuda()
print("GPU available:", gpu_available)


# Shared OCR engine: Chinese model with text-angle classification enabled.
# Constructed once at import time because model loading is expensive.
ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=True)
app = FastAPI()

# MIME types accepted by check_file_format() for the /extract endpoint.
ALLOWED_FILE_TYPES = {"application/pdf", "image/jpeg", "image/png", "image/jpg"}
|
||||||
|
|
||||||
|
def save_imgs(images):
    """Save images to the local ``saved_imgs`` directory as PNG files.

    Args:
        images: iterable of objects exposing a PIL-style ``save(path)`` method.

    Returns:
        list[str]: the relative paths the images were written to, one per
        input image, each named with a fresh UUID.
    """
    # Bug fix: the original created 'save_imgs' but built file paths under
    # 'saved_imgs', so every image.save() raised FileNotFoundError. Use a
    # single directory name for both, and exist_ok=True to avoid a race.
    save_dir = 'saved_imgs'
    os.makedirs(save_dir, exist_ok=True)
    paths = []
    for image in images:
        image_path = os.path.join(save_dir, f'{uuid.uuid4()}.png')
        image.save(image_path)
        paths.append(image_path)
    return paths
|
||||||
|
|
||||||
|
def check_file_format(file: UploadFile):
    """Raise HTTP 400 when the upload's declared MIME type is unsupported."""
    if file.content_type in ALLOWED_FILE_TYPES:
        return
    raise HTTPException(400, "File format not supported.")
|
||||||
|
|
||||||
|
def perform_ocr(ocr, images):
    """Run OCR on every image and return the concatenated per-page results.

    Each image is converted to a numpy array (PaddleOCR's expected input)
    and the per-image result lists are flattened one level into one list.
    """
    return [
        page
        for image in images
        for page in ocr.ocr(np.array(image), cls=True)
    ]
|
||||||
|
|
||||||
|
def extract_standard_info(text, patterns):
    """
    Extract labelled fields from the OCR text of a Chinese standards document.

    Args:
        text: full OCR text of the document.
        patterns: list of (label, regex) pairs, e.g. ``standard_patterns``.

    Returns:
        dict mapping each matched label to its cleaned string value; labels
        with no match are simply absent from the result.
    """
    info = {}
    for label, pattern in patterns:
        # These two fields anchor on line boundaries (^/$), so they need
        # MULTILINE; everything else matches across lines with DOTALL.
        if label in ("归口单位", "中文名称"):
            matches = re.findall(pattern, text, re.MULTILINE)
        else:
            matches = re.findall(pattern, text, re.DOTALL)
        if matches:
            if label in ["发布日期", "实施日期"]:
                # Drop spaces and normalize the date separator to a plain
                # hyphen — presumably the pattern argument is a full-width
                # dash variant from OCR; TODO confirm the exact characters.
                info[label] = re.sub(r'-', '-', matches[0].replace(' ', ''))
            elif label in ["起草单位", "起草人"]:
                # Drafting unit / drafters: first occurrence wins.
                info[label] = matches[0].strip()
            elif label == "提出单位":
                # Proposing unit: last occurrence wins (later pages override).
                info[label] = matches[-1].strip()
            elif label == "标准号":
                # Standard number: first occurrence wins.
                info[label] = matches[0].strip()
            elif label == "发布部门":
                # Issuing department pattern captures two groups (text above
                # and below the "发布" line); join with a Chinese enumeration
                # comma when both are present.
                if matches[0][1]:
                    info[label] = matches[0][0] + '、' + matches[0][1]
                else:
                    info[label] = matches[0][0] + matches[0][1]
                info[label] = re.sub(r'\n', '', info[label])
            else:
                # Default: last occurrence (the `if matches` guard makes the
                # `else None` branch here unreachable in practice).
                info[label] = matches[-1] if matches else None

    return info
|
||||||
|
|
||||||
|
|
||||||
|
def extract_patent_info(text, patterns):
    """Extract labelled fields from the OCR text of a patent document.

    For every (label, regex) pair the last match in ``text`` wins; labels
    with no match map to ``None``.
    """
    return {
        label: (found[-1] if (found := re.findall(regex, text, re.DOTALL)) else None)
        for label, regex in patterns
    }
|
||||||
|
|
||||||
|
def clean_info(info: dict):
    """Strip newlines, carriage returns, tabs and edge whitespace from every
    truthy value in ``info``, mutating the dict in place."""
    for key, value in info.items():
        if not value:
            continue
        info[key] = value.replace('\n', '').replace('\r', '').replace('\t', '').strip()
|
||||||
|
|
||||||
|
# Regex patterns (label, pattern) for standards documents, consumed by
# extract_standard_info(). Patterns target Chinese national-standard cover
# pages; several anchor on the publication-date line ("... 发布").
standard_patterns = [
    ("标准号", r"[A-Z][A-Z/\ 0-9]{1,}[0-9\.]+[\-—-]\d{3,4}"),  # standard number; only the first hit is used (may misfire if several appear)
    ("中文名称", r"(^[\u4e00-\u9fa5][\w::、√—\-\ \nn]+)\n"  # Chinese title: block immediately before the English title
                r"(?:[^\u4e00-\u9fa5]+)\n"  # English-title portion before the publication date
                r"(?:\d{4}(?:-|-)\d{2}(?:-|-)\d{2}) *发布"),  # anchored on the publication-date line
    ("英文名称", r"\n([^\u4e00-\u9fa5]+)\n(?:\d{4}(?:-|-)\d{2}(?:-|-)\d{2}) *发布"),  # non-Chinese block preceding the publication date
    ("发布部门", r"([\u4e00-\u9fa5]+)\n发布\n([\u4e00-\u9fa5\n]*)"),  # issuing department: Chinese text on the lines around "发布"
    ("发布日期", r"(\d{4}(?:-|-)\d{2}(?:-|-)\d{2}) *发布"),  # publication date
    ("实施日期", r"(\d{4}(?:-|-)\d{2}(?:-|-)\d{2}) *实施"),  # implementation date
    ("提出单位", r"由(.+?)\s?提出。"),  # proposing unit
    ("归口单位", r"由(.+?)\s?归口。"),  # centralized (technical committee) unit
    ("起草单位", r"起草单位[::](.+?)(?:。|起草人)"),  # drafting unit
    ("起草人", r"起草人[::](.+?)(?:。)")  # drafters
]
|
||||||
|
|
||||||
|
# Regex patterns (label, pattern) for patent certificates, consumed by
# extract_patent_info(). The [\s]* gaps tolerate OCR-inserted spaces between
# the characters of a field label; [::] accepts half- and full-width colons.
patent_patterns = [
    ("专利名称", r"名称[::](.+?)\n"),  # patent title
    ("发明人", r"发[\s]*明[\s]*人[\s]*[::](.+?)\n"),  # inventor(s)
    ("专利号", r"专[\s]*利[\s]*号[\s]*[::](.+?)\n"),  # patent number
    ("专利申请日", r"专利申请日[::](.+?)\n"),  # application date
    ("专利权人", r"专[\s]*利[\s]*权[\s]*人[\s]*[::](.+?)\n"),  # patentee
    ("地址", r"地[\s]*址[\s]*[::](.+?)\n"),  # address
    ("授权公告日", r"授权公告日:(.+?)\n"),  # grant announcement date
    ("授权公告号", r"授权公告号:(.+?)\n")  # grant announcement number
]
|
||||||
|
|
||||||
|
@app.get("/error")
async def create_error():
    """Deliberately raise an exception to exercise error logging end to end."""
    raise ValueError("This is a test error!")
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/extract")
async def extract_info(
    file_type: Literal["standard", "patent"] = Form(..., description="Specify the type of file to extract."),
    extract_method: Literal["re", "chat"] = Form(..., description="Specify how to extract. 正则或对话模型"),
    file: UploadFile = File(..., description="Upload a PDF, JPEG, or PNG file.")
):
    """OCR an uploaded PDF/image and extract structured metadata.

    Pipeline: validate MIME type -> render pages to images -> PaddleOCR ->
    extract fields either with regexes ("re") or by prompting a chat model
    ("chat"). Returns the extracted fields as a JSON object.
    """
    check_file_format(file)
    content = await file.read()  # raw upload bytes
    if file.filename.lower().endswith('.pdf'):
        # Only the first 6 pages are rendered — presumably the metadata
        # always appears early; TODO confirm for longer documents.
        images = convert_from_bytes(content, first_page=1, last_page=6, dpi=300)
    else:
        images = [Image.open(io.BytesIO(content))]
    result = perform_ocr(ocr, images)
    # Each OCR line is (box, (text, confidence)); pages with no text yield
    # None and are skipped.
    ocr_text = "\n".join([line[1][0] for res in result if res is not None for line in res])

    # Field extraction
    if extract_method == "re":
        if file_type == "patent":
            info = extract_patent_info(ocr_text, patent_patterns)
            clean_info(info)
        elif file_type == "standard":
            info = extract_standard_info(ocr_text, standard_patterns)
            clean_info(info)
        else:
            raise HTTPException(400, detail="Invalid file type. Please choose 'standard' or 'patent'.")
    else:
        # "chat" path: ask the model to return only JSON keyed by the
        # expected field names.
        if file_type == 'patent':
            prompt = f'我有以下文本,是一个专利的内容。请按专利名称,发明人,专利号,专利申请日, 专利权人,授权公告号,授权公告日为key的json格式返回数据,注意只返回json数据。文本如下:{ocr_text}'
        elif file_type == "standard":
            prompt = f'我有以下文本,是一个标准的内容。请按标准号,中文名称,英文名称,发布部门,发布日期,实施日期,提出单位,归口单位,起草单位,起草人为key的json格式返回数据,注意只返回json数据。文本如下:{ocr_text}'
        else:
            raise HTTPException(400, detail="Invalid file type. Please choose 'standard' or 'patent'.")
        r = requests.post(conf.CHAT_API, json={
            "model": "llama3.1",
            "prompt": prompt,
            "stream": False
        })
        # NOTE(review): this trusts the model to emit valid JSON — a
        # malformed response raises JSONDecodeError and surfaces as a 500.
        info = json.loads((r.json()['response']))
    return JSONResponse(content=info)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Dev entry point. reload=True is for local development only; the
    # "main:app" import string assumes this file is named main.py — confirm.
    uvicorn.run(app="main:app", host="0.0.0.0", port=8000, reload=True, log_config=LOGGING_CONFIG)
|
|
@ -0,0 +1,6 @@
|
||||||
|
fastapi
|
||||||
|
uvicorn
|
||||||
|
python-multipart
|
||||||
|
paddlepaddle
|
||||||
|
paddleocr
|
||||||
|
pdf2image
|
Loading…
Reference in New Issue