cma_search/server/utils/pdf2txt.py

386 lines
22 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Author: tianyang.zhang
Date: 2024-04-25
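param: {pdf_file: path of the input PDF file; excel_file: path of the input Excel file}
description: Extract the text of a PDF file, pull individual fields out of it with regular expressions, and save them into an Excel file.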
"""
import re
import fitz # PyMuPDF
import re
from openpyxl import load_workbook
from apps.information.models import Parsepdf
# Single-value extraction table. Each entry is [regex, cell]: run() searches
# the PDF text with match_text_with_regex(text, regex, 1) (re.DOTALL) and
# writes the stripped content of capture group 1 into `cell` of the active
# worksheet. Entries are applied in order, so a later entry that targets the
# same cell overwrites an earlier one.
#
# Every pattern must therefore compile and expose the wanted value as its
# first (and only) capture group. Parentheses that belong to the printed label
# are escaped so they are matched literally instead of opening a group.
# NOTE(review): the escaped parentheses are ASCII; confirm against real PDF
# text whether the labels carry ASCII or full-width parentheses/colons.
# NOTE(review): the E88 pattern has no closing parenthesis after
# "场监管部门" -- verify the label layout in the PDF text.
RE_LIST = [
[r"法定代表人\(单位负责人\):(.*?)\n", "C4"],
[r"法定代表人\(单位负责人\):(.*?)\n", "C22"],
[r"本机构名称:(.*?)\n","C5"],
[r"单位注册地址及行政区划 (.*?)\n","C8"],
[r" 单位所在地址及行政区划 (.*?)\n","C9"],
[r"执行会计制度:(.*?)\n","C7"],
[r"单位类型\(01-1\) (.*?)\n", "C16"],
[r"法人单位名称\(02-1\)(.*?)\n", "C17"],
[r"4\)(.*?)\n", "F16"],
[r"法人单位行业分类代码\(四位数字\):(.*?)\n", "F17"],
[r"固定电话:(.*?)\n", "F24"],
[r"传真号码:(.*?)\n", "F25"],
[r"电子邮箱:(.*?)\n", "F26"],
[r"登记注册类型:(.*?)\n", "F28"],
[r"所属管辖区:(.*?)\n", "C27"],
[r"机构类型:(.*?)\n", "C29"],
[r"是否由事业单位转企改制而来:(.*?)\n", "F29"],
[r"企业控股情况:(.*?)\n", "C32"],
[r"营业状态:(.*?)\n", "C33" ],
[r"是否高新技术企业认定:(.*?)\n", "C35"],
[r"认定或复审高新技术企业证书编号:(.*?)\n", "C36"],
[r"高新技术企业的日期:(.*?)\n", "F36"],
[r"是否境内上市和在新三板挂牌?(.*?)\n", "C38"],
[r"上市挂牌时间:(.*?)\n", "H39"],
[r"保费支出(\d+\.\d+?)万元\n", "F41"],
[r"检验检测机构责\n任险\n(.*?)\n", "C41"],
[r"检验检测机构责\n任险\n(.*?)\n", "C41"],
[r"检验检测人员职\n业责任险\n(.*?)\n", "C42"],
[r"保费支出(\d+\.\d+?)万元\n", "F42"],
[r"企业集团:(.+?)\n", "C30"],
[r"领域一(.+?)\n", "C52"],
[r"领域二(.+?)\n", "C53"],
[r"领域三(.+?)\n", "C54"],
[r"领域四(.+?)\n", "C55"],
[r"领域五(.+?)\n", "C56"],
[r"其他领域描述:(.+?)\n", "C58"],
[r"检验检测服务关键词:(.*?)\n", "C60"],
[r"本检验检测机构当年接受各类技术评审:(.*?)次", "E85"],
[r"接受资质认定技术\n评审:\n(.*?)次", "E86"],
[r"管部门组织检查:\n(.*?)次", "E87"],
[r"技术监督部门\(市\n场监管部门组织\n评审:\n(.*?)次", "E88"],
[r"接受行业管理部门\n组织技术评审:\n(.*?)次", "E89"],
[r"其中国家行业管\n理部门组织评审\n(.+?)次", "E90"],
[r"本检验检测机构当年接受各类监督检查(.*?)项", "E94"],
[r"接受资质认定专项监督检查:\n(.*?)项", "E95"],
[r"其中国家市场监管部门组织检查:\n(.*?)项", "E96"],
[r"省级及以下质量技术监督部门\(市场监管部门\)\n组织检查:\n(.*?)项", "E97"],
[r"接受行业管理部门监督检查:\n(.*?)项", "E98"],
[r"其中国家行业管理部门组织检查:\n(.*?)项", "E99"],
[r"省级及以下行业管理部门组织检查:\n(.*?)项", "E100"],
[r"接受其他社会组织、团体及境内外评价机构监督检\n查:\n(.*?)项", "E101"],
[r"本单位是否处于国家检验检测认证公共服务平台示范区:(.*?)\n", "E102"],
[r"本单位处于工业园区\(开发区\):(.*?)\n", "E103"],
[r"是否加入国内外产业联盟:(.*?)\n", "C104"],
[r"产业联盟1名称:(.*?)\n", "C105"],
[r"产业联盟2名称:(.*?)\n", "E105"],
[r"产业联盟3名称:(.*?)\n", "C106"],
[r"产业联盟4名称:(.*?)\n", "E106"],
[r"是否加入国内外行业协会组织:(.*?)\n", "E107"],
[r"存货\n01\n(.*?)\n", "D111"],
[r"固定资产原值\(原价\)\n02\n(.*?)\n", "D112"],
[r"累计折旧\n03\n(.*?)\n", "D113"],
[r"其中,本年折旧\n04\n(.*?)\n", "D114"],
[r"资产总计\n05\n(.*?)\n", "D115"],
[r"负债合计\n06\n(.*?)\n", "D116"],
[r"营业收入\n07\n(.*?)\n", "D117"],
[r"营业成本\n08\n(.*?)\n", "D118"],
[r"营业税金及附加\n09\n(.*?)\n", "D119"],
[r"销售费用\n10\n(.*?)\n", "D120"],
[r"管理费用\n11\n(.*?)\n", "D121"],
[r"其中,税金\n12\n(.*?)\n", "D122"],
[r"差旅费\n13\n(.*?)\n", "D123"],
[r"财务费用\n14\n(.*?)\n", "D124"],
[r"其中,利息净支出\n15\n(.*?)\n", "D125"],
[r"资产减值损失\n16\n(.*?)\n", "D126"],
[r"公允价值变动收益\n17\n(.*?)\n", "D127"],
[r"投资收益\n18\n(.*?)\n", "D128"],
[r"其他收益\n19\n(.*?)\n", "D129"],
[r"营业利润\n20\n(.*?)\n", "D130"],
[r"营业外收入\n21\n(.*?)\n", "D131"],
[r"营业外支出\n23\n(.*?)\n", "D132"],
[r"营业外支出\n23\n(.*?)\n", "D132"],
[r"利润总额\n24\n(.*?)\n", "D133"],
[r"净利润\n25\n(.*?)\n", "D134"],
[r"所得税费用\n26\n(.*?)\n", "D135"],
[r"应付职工薪酬\(本期贷方\n累计发生额\)\n27\n(.*?)\n", "D136"],
[r"本年应交增值税\(本期累\n计发生额\)\n28\n(.*?)\n", "D137"],
[r"检验检测业务活动类型\(可多选\)\(00-1\)(.*?)\n", "C140"],
[r"检验检测业务活动特点\(00-2\)(.*?)\n", "C141"],
[r"检验检测报告数合计:\n(.*?)份", "F144"],
[r"合格的报告数:(.*?)份\n", "F145"],
[r"其中,为省\(自治区、直辖市\)外出具\n检验检测报告数\n(.*?)份", "F146"],
[r"行政执法或政府委托检验检测\n报告份数(.*?)份", "F147"],
[r"收入\s(\d+\.\d+)万元\(01-101-2\)\n", "I147"],
[r"其中,当年承担产品质量国家监督抽查\n工作出具检验检测报告数:\n(.*?)份", "F148"],
[r"其中,当年承担产品质量地方监督抽查\n工作出具检验检测报告数:\n(.*?)份", "F149"],
[r"其中当年承担3C强制性认证检验检\n测报告数:\n(.*?)份", "F150"],
[r"其中,当年承担生产许可证检验检测报\n告数数:\n(.*?)份", "F151"],
[r"社会委托检验检测\n报告份数(.*?)份", "F167"],
[r"\(01-102-1\) 收入 (\d+\.\d+) 万元\(01-102-2\)\n", "I167"],
[r"司法鉴定、仲裁检验检测\n报告份数(.*?)份","F170"],
[r"01-103-1\)\n收入 (\d+\.\d+) 万元\(01-103-2\)\n","I170"],
[r"其他技术服务\n报告份数 (\d+) 份","F172"],
[r"01-104-1\)\n收入 (\d+\.\d+) 万元\(01-104-2\)\n", "I172"],
[r"是否服务制造业企业\(选是,继续填报以下\n内容\)\n(.*?)\n", "E174"],
[r"服务制造业企业的业务收入占总收入:\n(.*?)%", "E175"],
[r"服务制造业企业的业务收入较去年\n增长:\n(.*?)%", "H175"],
[r"是否服务个人消费者\(选是,继续填报以下\n内容\)\n(.*?)\n", "H176"],
[r"服务个人消费者的业务收入占总收入:\n(\d+\.\d+)%", "E177"],
[r"服务个人消费者的业务收入较去年年\n增长:\n(\d+\.\d+)%", "H177"],
[r"是否为高技术产业\(制造业\)提供检验检测服\n务\n(.*?)\n", "E178"],
[r"是否为高技术产业\(制造业\)提供检验检测服\n务\n(.*?)\n", "E187"],
[r"新一代信息技术产业,业务比重:\n(.*?)%", "E190"],
[r"高端装备,业务比重:\n(.*?)%", "E191"],
[r"新材料,业务比重:\n(.*?)%", "E192"],
[r"生物产业,业务比重:\n(.*?)%", "E193"],
[r"新能源汽车,业务比重:\n(.*?)%", "E194"],
[r"新能源产业,业务比重:\n(.*?)%", "E195"],
[r"节能环保,业务比重:\n(.*?)%", "E196"],
[r"全部仪器设备\n(.*?)台套", "D200"],
[r"其中50万元\n以上仪器设备\n(.*?)台套", "D201"],
[r"其中,进口仪\n器设备\n(.*?) 台套", "D205"],
[r"全部仪器设备资\n产原值\n(\d+\.\d+) 万元", "D206"],
[r"其中50万元\n以上仪器设备资产\n原值\n(\d+\.\d+) 万元", "D207"],
[r"50-100万元\n仪器设备资产\(在\n用\)原值\n(\d+\.\d+) 万元", "D208"],
[r"其中,进口仪\n器设备资产原值\n(\d+\.\d+) 万元", "D217"],
[r"与检验检测相关\n的固定资产原值\n\(设备\)\n(\d+\.\d+) 万元", "D218"],
[r"当年新增仪器设\n备\n(.*?)台套", "D219"],
[r"其中,当年新\n增50万元以上仪器\n设备\(设备\)\n(.*?)台套", "D220"],
[r" 其中50-\n100万元仪器设备\n(.*?)台套", "D221"],
[r" 其中200\n万以上仪器设备\n(.*?)台套", "D223"],
[r"当年新增仪器设\n备原值合计\n(.*?)万元", "D224"],
[r"其中,当年新增\n50万元以上仪器设\n备资产原值\n(.*?)万元", "D225"],
[r"其中,50-100\n万元仪器设备资产\n原值\n(.*?)万元", "D226"],
[r"其中,100-\n200万仪器设备资产\n原值\n(.*?)万元", "D227"],
[r"其中,200万\n元以上仪器设备资\n产原值\n(.*?)万元", "D228"],
[r"机构总面积\n(.*?)平方米", "D231"],
[r"其中办公面积(.*?)平方米", "D232"],
[r"实验室面积\n(.*?)平方米", "D233"],
[r"其中,恒温\n恒湿实验室\n(.*?)平方米", "D234"],
[r"其中P2\n以上生物安全实验\n室\n(.*?)平方米", "D235"],
[r"其中,二恶\n英实验室\n(.*?)平方米", "D236"],
[r"其中,电磁\n屏蔽实验室\n(.*?)平方米", "D237"],
[r"其中,消声\n实验室\n(.*?)平方米", "D238"],
[r"其中,放射\n性实验室\n(.*?)平方米", "D239"],
[r" 其中,动物\n房\n(.*?)平方米", "D240"],
[r"专用室外试验\n场\n(.*?)平方米", "D241"],
[r"参数(.*?)项", "C244"],
[r"产品标准(.*?)项", "E244"],
[r"方法标准(.*?)项", "G244"],
[r"检验检测从业人员期末人数(.*?)人", "D246"],
[r"其中:研究生及\n以上学历\(03-401\)(.*?)人", "D247"],
[r"大学本科\n学历\(03-402\)\n(.*?)人", "D248"],
[r"专科及以\n下学历\(03-403\)\n(.*?)人", "D249"],
[r"其中:高级技术\n职称人员\(03-404\)\n(.*?)人", "D250"],
[r"中级技术\n职称人员\(03-405\)\n(.*?)人", "D251"],
[r"初级技术\n职称人员\(03-406\)\n(.*?)人", "D252"],
[r"具备中级\n技术职称同等水平\n的技术能力人员\n(.*?)人", "D253"],
[r"其他\(03-\n407\)\n(.*?)人", "D254"],
[r"其中:授权签字人\n\(03-408\)\n(.*?)人", "D255"],
[r"管理人员\n\(03-409\)\n(.*?)人", "D256"],
[r"检验检测\n\技术人员\(03-410\)\n(.*?)人", "D257"],
[r"其中:两院院士\n\(03-411\)\n(.*?)人", "D259"],
[r"选人员\(03-413\)\n(.*?)人", "D260"],
[r"其他:\n(.*?)研发活动及相关\(03-5\)\n","D261"],
[r"当年专利申请受\n理数\n(.*?)件", "D264"],
[r"其中:当年发明\n专利申请受理数\n(.*?)件", "D265"],
[r"其中:申请欧美\n日专利\n(.*?)件", "D266"],
[r"其中:申请PCT\n国际专利\n(.*?)件", "D267"],
[r"当年专利授权书\n数\n(.*?)件", "D268"],
[r"其中,当年发明\n专利授权数\n(.*?)件", "D269"],
[r"其中:授权欧美\n日专利\n(.*?)件", "D270"],
[r"期末有效专利数\n(.*?)件", "D271"],
[r"其中:期末有效\n发明专利数\n(.*?)件", "D272"],
[r"其中:拥有境外\n授权专利\n(.*?)件", "D273"],
[r"期末拥有注册商\n标\n(.*?)件", "D274"],
[r"其中:当年注\n册商标\n(.*?)件", "D275"],
[r"其中:境外注\n册商标\n(.*?)件", "D276"],
[r"其中:驰名商\n标\n(.*?)件", "D277"],
[r"马德里商标国\n际注册申请量\n(.*?)件", "D278"],
[r"拥有软件著作权\n(.*?)件", "D279"],
[r"其中:当年获\n得软件著作权\n(.*?)件", "D280"],
[r"是否获得本年度\n国务院国家科学技\n术奖\n(.*?)\n", "E282"],
[r"市人民政府设立的\n省级科学技术奖\n(.*?)\n", "E284"],
[r" 其他\n(.*?)04\n","E287"],
[r"主要服务地域\(04-1\)(.*?)\n", "D289"],
[r"主要客户类型\(可多选\)\(04-2\)(.*?)\n", "D290"],
[r"\s+科研项目总计(.*?)项", "D293"],
[r"其中,国家级项目(.*?)项", "D294"],
[r"其中,省部级项目(.*?)项", "D295"],
[r"科研经费总计(.*?)万元", "D296"],
[r"其中,国家级项目(.*?)万元", "D297"],
[r"其中,省部级项目(.*?)万元", "D298"],
[r"标准制修订经费总计(.*?)万元", "D300"],
[r"其中,国家标准(.*?)项", "D301"],
[r"其中,行业标准(.*?)项", "D302"],
[r"其中,地方标准(.*?)项", "D303"],
[r"其中,国际标准(.*?)项", "D304"],
[r"其中,国际标准(.*?)项", "D304"],
[r"本机构人员是否在认证认可、检验检测相关领域国际标准化组织任职(.*?)\(05-202\)", "D305"],
[r"人员姓名:(.*?)\(05-203-1\)", "C306"],
[r"担任职务:(.*?)\(05-203-2\)", "C307"],
[r"本年度参加能力\n验证计划合计\n(.*?)项", "D313"],
[r"其中,国家级\n能力验证项目\n(.*?)项", "D314"],
[r"市场监管总局\n\(国家认监委\)能\n力验证项目\n(.*?)项", "D315"],
[r"国家有关行业\n主管部门能力验证\n项目\n(.*?)项", "D316"],
[r"省级能力验证\n项目\n(.*?)项", "D317"],
[r"国内能力验证\n提供者项目\n(.*?)项", "D318"],
[r"国际能力验证\n提供者和国家相关\n组织项目\n(.*?)项", "D319"],
[r"其他能力验证\n项目\n(.*?)项", "D320"],
[r"参加测量审核合\n计\n(.*?)项", "D321"],
[r"机构本年度是否发生变更(.*?)\(07-1\)", "D323"],
[r"是否工业和信息化部认定的“工业产品质量控制和技术评价实验室”(.*?)\n", "D344"],
[r"是否工业和信息化部认定的“工业产品质量控制和技术评价实验室”(.*?)\n", "D344"],
[r"实验室名称:(.*?)所属行业", "C345"],
[r"所属行业:(.*?)授牌","E345"],
[r"授牌\n年份:(.*?)\n","G345"],
[r"是否通过互联网开展检验检测业务\?(.*?)\n","D347"],
[r"单位负责人:(.*?)\n","C351"],
[r"财务负责人:(.*?)\n","E351"],
[r"填表人:(.*?)\n","H351"],
[r"单位负责人电话:(.*?)\n","C352"],
[r"财务负责人电话:(.*?)\n","E352"],
[r"填表人电话:(.*?)\n","H352"],
[r"资质认定联系人座\n机(.*?)资质认定联系人手\n","C353"],
[r"资质认定联系人手\n机(.*?)\n","E353"],
[r"资质认定联系人邮\n箱(.*?)\n","H353"],
[r"资质认定联系人姓\n名(.*?)\n","C354"],
]
# Fixed-width extraction table. Each entry is [label_regex, cell, span]:
# run() passes the entry to get_index(), which finds the first match of
# label_regex and takes the `span` characters of text that immediately follow
# the end of that match; that slice is written to `cell`.
# NOTE(review): the trailing "(.*?)" in two entries is lazy and matches the
# empty string, so it does not move the end of the match -- presumably a
# leftover; confirm before removing.
SPECIALLIST = [
[r"开业\(成立\)时间:", "F22", 11],
[r"是否含有外资:", "C28", 2],
[r"执行企业会计准则情况:", "F31", 15],
[r"境外认可机构颁发证书:", "E68", 4],
[r"质检中\n心:", "C82", 4],
[r"组织评审:;(.*?)", "E91", 4],
[r"接受认可机构评审:(.*?)", "E92", 4],
[r"评价机构评审:", "E93", 4],
[r"行业协会1名称:", "C108", 11],
[r"行业协会2名称:", "E108", 11],
[r"其中100-\n200万仪器设备\n", "D222", 2],
[r"是否愿意将仪器\n设备对外共享\n", "D229", 2],
]
# Repeated-label table. Each entry is [regex, [cell, ...]]: run() collects
# every match of regex with match_text_with_all() (the whole matched text,
# stripped -- not just the capture group) and writes the i-th match into the
# i-th listed cell.
COMMON_KEY = [
[r"其他地址:(.*?)\n", ["C10", "C11", "C12"]],
[r"闲置仪器设\n备原值\n (\d+\.\d+) 万元", ["D209","D212", "D215"]]
]
# Label-split table. Each entry is [regex, cell]: run() hands the regex to
# match_text_with_match(), which takes the first match, strips it, splits it
# on ":" and returns the second piece; that piece is written to `cell`.
SPECIALLIST_2 = [
[r"长途区\n(.*?)\n", "C24"],
[r"移动电\n(.*?)\n", "C25"],
[r"邮政编\n(.*?)\n", "C26"],
]
# Multi-match table. Each entry is [regex, [cell, ...]]: run() calls
# match_text_with_group(), which returns capture group 1 of every match,
# stripped and with its last character removed; the i-th result is written
# into the i-th listed cell.
# NOTE(review): the dropped last character is presumably a unit suffix such
# as "%" -- confirm against real PDF text.
SPECIALLIST_3 = [
[r"\n入\n占\n比\n(.*?)\n", ["E52", "E53", "E54", "E55", "E56"]],
[r"\n务\n饱\n和\n度\n(.*?)\n", ["H52", "H53", "H54", "H55", "H56"]],
[r"构资质认定证书:\n(.*?)\n", ["E61"]],
[r"颁发的资质认定证书:\n(.*?)\n", ["E62"]],
[r"国家市场监督管理部门颁发的特种设备检验检测机构核准证:\n(.*?)\n", ["E63"]],
[r"省级市场监督管理部门颁发的特种设备检验检测机构核准证:\n(.*?)\n", ["E64"]],
[r"资质、资格证书:\n(.*?)\n", ["E65"]],
[r"中国合格评定国家认可中心颁发的:\n(.*?)\n", ["E66"]],
[r"其他社会组织、团体颁发的证书:\n(.*?)\n", ["E67"]],
[r"\s+其中,个人委托检验检测报告:\n(.*?)\n", ["F168"]],
["其中,单位委托检验检测报告:\n(.*?)\n", ["F169"]],
[r"其中,出具司法鉴定意见书:\n(.*?)\n",["F171"]],
]
# Multi-group table. Each entry is [regex, [cell, ...]] where the regex has
# one capture group per listed cell: run() calls match_many_res(), which
# returns the groups of the first match, and writes group i into cell i.
#
# The last entry previously read "\(在用\\n": the closing parenthesis of the
# label was missing, leaving a literal backslash that can never occur in the
# text, so D214/D215/D216 were never filled. It is restored to "\(在用\)\n".
# NOTE(review): confirm the restored label layout against real PDF text.
SPEMANY = [
[r"50-100万元\n仪器设备在用\n(.*?)台套 ,闲置 (.*?) 台套 ,待报废 (.*?) 台套", ["D202", "G202","I202"]],
[r"100-200万仪\n器设备在用\n(.*?)台套 ,闲置 (.*?) 台套 ,待报废 (.*?) 台套", ["D203", "G203","I203"]],
[r" 200万元以上\n仪器设备在用\n(.*?)台套 ,闲置 (.*?) 台套 ,待报废 (.*?) 台套", ["D204", "G204","I204"]],
[r"50-100万元\n仪器设备资产\(在\n用\)原值\n(\d+\.\d+) 万元\n\s*闲置仪器设\n备原值\n(\d+\.\d+) 万元\n\s*待报废仪器\n设备原值\n(\d+\.\d+) 万元", ["D208", "D209", "D210"]],
[r"100-200万仪\n器设备资产\(在\n用\)原值\n(\d+\.\d+) 万元\n\s*闲置仪器设\n备原值\n(\d+\.\d+) 万元\n\s*待报废仪器\n设备原值\n(\d+\.\d+) 万元", ["D211", "D212", "D213"]],
[r"200万元以上\n仪器设备\(在用\)\n原值\n(\d+\.\d+) 万元\n\s*闲置仪器设\n备原值\n(\d+\.\d+) 万元\n\s*待报废仪器\n设备原值\n(\d+\.\d+) 万元", ["D214", "D215", "D216"]],
]
# Extract the text of a PDF.
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        The ``page.get_text()`` output of each page, concatenated in page
        order with nothing inserted between pages.
    """
    doc = fitz.open(pdf_path)
    try:
        # One join instead of repeated ``+=`` keeps this linear in the
        # amount of text.
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document even when a page fails to extract.
        doc.close()
# Extract one capture group of the first regex match.
def match_text_with_regex(text, pattern, num: int):
    """Return capture group ``num`` of the first match of ``pattern``.

    The search uses ``re.DOTALL``, so ``.`` also matches newlines.

    Args:
        text: Text to search.
        pattern: Regular expression (string or compiled pattern).
        num: Index of the capture group to return.

    Returns:
        The stripped group text, or ``None`` when the pattern does not match
        or the requested group did not take part in the match.
    """
    found = re.search(pattern, text, re.DOTALL)
    if found is None:
        return None
    value = found.group(num)
    # A group inside an unused alternative is None; there is nothing to strip.
    return value.strip() if value is not None else None
# Collect the full text of every regex match.
def match_text_with_all(text, pattern):
    """Return the whole matched text of every match of ``pattern``.

    Each element is the complete match (``group()``), stripped of
    surrounding whitespace; the list is empty when nothing matches.
    """
    return [hit.group().strip() for hit in re.finditer(pattern, text)]
def match_text_with_group(text, pattern):
    """Return capture group 1 of every match, minus its last character.

    For each match of ``pattern`` the first capture group is stripped of
    surrounding whitespace and then its final character is dropped. The
    list is empty when nothing matches.
    """
    trimmed = []
    for hit in re.finditer(pattern, text):
        captured = hit.group(1).strip()
        trimmed.append(captured[:-1])
    return trimmed
def match_many_res(text, pattern):
    """Return the pieces of the first ``re.findall`` result for ``pattern``.

    Each empty piece is replaced by ``None``. When the pattern does not
    match at all, an empty string is returned instead of a list.
    """
    found = re.findall(pattern, text)
    if not found:
        return ''
    return [piece or None for piece in found[0]]
def match_text_with_match(text, pattern):
    """Return the text after the first ':' inside the first match.

    The whole first match of ``pattern`` is stripped and split on ``":"``;
    the second piece (the text between the first and second colon, or up to
    the end when there is only one colon) is returned.

    Returns:
        That piece, or ``None`` when the pattern does not match or the
        matched text contains no ``":"``. Both cases used to raise
        (``AttributeError`` / ``IndexError``).
    """
    found = re.search(pattern, text)
    if found is None:
        return None
    pieces = found.group().strip().split(":")
    return pieces[1] if len(pieces) > 1 else None
# Write one extracted value into the Excel workbook.
def fill_excel(matches, EXCEL_PATH, local):
    """Write ``matches`` into cell ``local`` of the workbook's active sheet.

    Args:
        matches: Value to store. Falsy values (``None``, ``""``, ``[]``)
            mean "nothing extracted" and leave the workbook untouched.
        EXCEL_PATH: Path of the workbook; it is loaded from and saved back
            to this same path.
        local: Cell coordinate such as ``"C4"``.
    """
    if not matches:
        # Nothing to write: skip the load/save round trip entirely.
        return
    wb = load_workbook(EXCEL_PATH)
    ws = wb.active
    ws[local] = matches
    wb.save(EXCEL_PATH)
def get_index(text, pattern, span):
    """Return the ``span`` characters that follow the first match.

    The slice starts at the end of the first match of ``pattern`` in
    ``text`` and is at most ``span`` characters long (shorter if the text
    ends first). ``None`` is returned when the pattern does not match.
    """
    hit = re.search(pattern, text)
    if not hit:
        return None
    begin = hit.end()
    return text[begin:begin + span]
def _collect_cell_values(text):
    """Return ``(cell, value)`` pairs extracted from ``text``, in write order.

    The order mirrors the table order (RE_LIST, SPECIALLIST, COMMON_KEY,
    SPECIALLIST_2, SPECIALLIST_3, SPEMANY) so that a later table still
    overrides an earlier one when both target the same cell.
    """
    pairs = []
    for pattern, cell in RE_LIST:
        pairs.append((cell, match_text_with_regex(text, pattern, 1)))
    # Fixed-width fields: a slice of text right after the label.
    for pattern, cell, span in SPECIALLIST:
        pairs.append((cell, get_index(text, pattern, span)))
    # Labels that repeat: i-th match goes to the i-th cell. zip() stops at
    # the shorter side, so surplus matches no longer raise IndexError.
    for pattern, cells in COMMON_KEY:
        pairs.extend(zip(cells, match_text_with_all(text, pattern)))
    for pattern, cell in SPECIALLIST_2:
        try:
            value = match_text_with_match(text, pattern)
        except (AttributeError, IndexError):
            # The helper may raise when the label is absent or has no ':';
            # one missing field must not abort the whole export.
            value = None
        pairs.append((cell, value))
    for pattern, cells in SPECIALLIST_3:
        pairs.extend(zip(cells, match_text_with_group(text, pattern)))
    # match_many_res returns '' when nothing matched, which zips to nothing.
    for pattern, cells in SPEMANY:
        pairs.extend(zip(cells, match_many_res(text, pattern)))
    return pairs


def run(pdf_path, excel_path, id):
    """Extract fields from a PDF, fill them into a workbook, mark the job done.

    Args:
        pdf_path: Path of the PDF to read.
        excel_path: Path of the workbook to fill; it is modified in place.
        id: Primary key used to select the ``Parsepdf`` row(s) whose
            ``status`` is set to '完成'.

    Returns:
        The string ``'success'``.

    Side effects:
        Writes the raw extracted text to ``pdf2txt.txt`` in the current
        working directory.
    """
    # Extract the PDF text and keep a dump of it next to the process.
    text = extract_text_from_pdf(pdf_path)
    with open("pdf2txt.txt", "w", encoding="utf-8") as f:
        f.write(text)

    # Load and save the workbook once rather than once per cell.
    wb = load_workbook(excel_path)
    ws = wb.active
    for cell, value in _collect_cell_values(text):
        if value:
            ws[cell] = value
    wb.save(excel_path)

    Parsepdf.objects.filter(id=id).update(status='完成')
    return 'success'
if __name__ == "__main__":
    # Manual smoke run against local sample files. Raw strings keep the
    # Windows backslashes literal without relying on invalid escape
    # sequences. run() requires an id; None is passed here.
    # NOTE(review): presumably filter(id=None) selects no Parsepdf row, so no
    # status is updated -- confirm, or pass a real id.
    run(
        r"C:\code\pdf_exc\检验检测机构数据查看页2022年.pdf",
        r"C:\code\pdf_exc\检验检测服务业统计数据上报任务-空表.xlsx",
        None,
    )