32 lines
1.0 KiB
Python
32 lines
1.0 KiB
Python
from flask import Flask, request, jsonify
|
|
from paddleocr import PaddleOCR
|
|
import fitz
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
import numpy as np
|
|
from PIL import Image
|
|
import io
|
|
import paddle
|
|
|
|
# Flask application object that the route decorators in this module attach to.
app = Flask(__name__)

# Select the GPU as the Paddle device before the OCR object is constructed.
paddle.set_device("gpu")

# Shared PaddleOCR instance: Chinese language model with text-line
# orientation handling switched on.
ocr = PaddleOCR(
    lang="ch",
    use_textline_orientation=True,
)

# Thread pool (4 workers) for page-level work.
executor = ThreadPoolExecutor(max_workers=4)
|
|
|
|
@app.route("/ocr_full", methods=["POST"])
def ocr_full():
    """OCR every page of an uploaded PDF and return the combined text.

    Expects a multipart/form-data POST carrying the PDF in the ``pdf``
    file field.

    Returns:
        A JSON response ``{"full_text": ...}`` whose value is the per-page
        strings produced by ``_ocr_page`` joined with blank lines. When the
        upload is missing or cannot be opened as a PDF, a JSON
        ``{"error": ...}`` body with HTTP status 400 is returned instead.
    """
    upload = request.files.get("pdf")
    if upload is None:
        return jsonify({"error": "missing 'pdf' file field"}), 400

    try:
        doc = fitz.open(stream=upload.read(), filetype="pdf")
    except (RuntimeError, ValueError) as exc:
        # NOTE(review): PyMuPDF is expected to signal empty or corrupt data
        # with RuntimeError subclasses; confirm against the installed version.
        return jsonify({"error": f"cannot open PDF: {exc}"}), 400

    # The context manager closes the document even if a page fails.
    with doc:
        # Pages are processed one after another in the request thread:
        # PyMuPDF documents that it does not support multithreaded use, so
        # handing pages of one open document to a thread pool is unsafe.
        results = [_ocr_page(page) for page in doc]

    # Log only a summary; dumping the full OCR text on every request is noisy.
    app.logger.debug("OCR finished for %d page(s)", len(results))
    return jsonify({"full_text": "\n\n".join(results)})
|
|
|
|
def _extract_texts(result):
    """Return the recognised text strings from one OCR call's output.

    Handles two result layouts:

    * mapping-style page results (PaddleOCR 3.x ``predict``), where the
      recognised strings are stored under the ``rec_texts`` key;
    * legacy nested lists of ``[box, (text, score)]`` entries.

    An empty result, or a ``None`` page entry, yields an empty list.
    """
    if not result:
        return []
    page_result = result[0]
    if page_result is None:
        return []
    if hasattr(page_result, "keys"):
        # Iterating a mapping would walk its keys, not the detected lines,
        # so the text list has to be read from its dedicated field.
        if "rec_texts" in page_result:
            return list(page_result["rec_texts"])
        return []
    return [line[1][0] for line in page_result]


def _ocr_page(page: fitz.Page):
    """Render one PDF page at 300 DPI, run OCR on it, and return its text.

    Args:
        page: The PyMuPDF page to rasterise and recognise.

    Returns:
        A string consisting of a page-number header line followed by all
        recognised text fragments joined with single spaces.
    """
    # PDF user space is 72 units per inch, so 300/72 scales to 300 DPI.
    zoom = 300 / 72
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    img = Image.open(io.BytesIO(pix.tobytes("png")))
    result = ocr.predict(np.array(img))
    texts = _extract_texts(result)
    # page.number is zero-based; the header shows a one-based page index.
    return f"--- 第 {page.number + 1} 页 ---\n" + " ".join(texts)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serve on the loopback interface only, port 3402,
    # with the development server's per-request threading turned off.
    app.run(
        host="127.0.0.1",
        port=3402,
        threaded=False,
    )