From 69ea0b9c1ace63207a1ad05bf7cd000cb49b8ede Mon Sep 17 00:00:00 2001 From: zty Date: Thu, 25 Apr 2024 17:24:13 +0800 Subject: [PATCH] =?UTF-8?q?feat:=E8=A7=A3=E6=9E=90pdf=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- client/src/api/contacts.js | 17 + client/src/router/index.js | 6 + .../informatiomCollect/laboratoryContact.vue | 83 +++- .../informatiomCollect/qualityActive.vue | 1 - .../informatiomCollect/qualityCommend.vue | 4 +- .../views/informatiomCollect/smsMessage.vue | 5 +- .../views/informatiomCollect/yearReport.vue | 155 +++++++ server/apps/information/models.py | 2 +- server/apps/information/views.py | 43 +- server/pdf2txt.txt | 0 server/utils/pdf2txt.py | 387 ++++++++++++++++++ 11 files changed, 679 insertions(+), 24 deletions(-) create mode 100644 client/src/views/informatiomCollect/yearReport.vue create mode 100644 server/pdf2txt.txt create mode 100644 server/utils/pdf2txt.py diff --git a/client/src/api/contacts.js b/client/src/api/contacts.js index 4143a90..194130d 100644 --- a/client/src/api/contacts.js +++ b/client/src/api/contacts.js @@ -43,3 +43,20 @@ export function getCtAll(query) { method: 'delete' }) } + + + export function impData(data) { + return request({ + url: `/info/contact/imp/`, + method: 'post', + data + }) + } + + export function parsePDF(data) { + return request({ + url: `/info/contact/parse_pdf/`, + method: 'post', + data + }) + } \ No newline at end of file diff --git a/client/src/router/index.js b/client/src/router/index.js index 70b1879..0814226 100644 --- a/client/src/router/index.js +++ b/client/src/router/index.js @@ -457,6 +457,12 @@ export const asyncRoutes = [ component: () => import('@/views/informatiomCollect/externalAuditor.vue'), meta: { title: '外审员情况', perms: ['infoCollect_EA'] } }, + { + path: 'yearReport', + name: 'yearReport', + component: () => import('@/views/informatiomCollect/yearReport.vue'), + meta: { title: '实验室年度报告', perms: 
['infoCollect_report'] } + }, { path: 'smsMessage', name: 'smsMessage', diff --git a/client/src/views/informatiomCollect/laboratoryContact.vue b/client/src/views/informatiomCollect/laboratoryContact.vue index cd4ff6c..61fad31 100644 --- a/client/src/views/informatiomCollect/laboratoryContact.vue +++ b/client/src/views/informatiomCollect/laboratoryContact.vue @@ -2,7 +2,8 @@
新增 - 导出 + 导入 + 导出 - - + diff --git a/server/apps/information/models.py b/server/apps/information/models.py index f996c58..f9d5f64 100644 --- a/server/apps/information/models.py +++ b/server/apps/information/models.py @@ -93,7 +93,7 @@ class QualityActivities(CommonBDModel): class Contact(CommonBDModel): - name = models.CharField(max_length=20, unique=True, verbose_name='姓名') + name = models.CharField(max_length=20, unique=True, verbose_name='公司名称') address = models.CharField(max_length=100, verbose_name='地址') header = models.CharField(max_length=20, verbose_name='负责人') tel = models.CharField(max_length=20, verbose_name='负责人电话') diff --git a/server/apps/information/views.py b/server/apps/information/views.py index c5bc50c..a28ed25 100644 --- a/server/apps/information/views.py +++ b/server/apps/information/views.py @@ -12,9 +12,10 @@ from rest_framework.exceptions import ParseError from apps.system.models import Organization from .models import * from .serializers import * - +from utils.pdf2txt import run from datetime import datetime import os +import traceback class ImpMixin: @@ -55,6 +56,7 @@ class ImpMixin: if 'file' not in request.data: raise ParseError('请提供文件') path = request.data['file'] + print(path, "---------ssss") if not str(path).endswith('.xlsx'): raise ParseError('请提供xlsx格式文件') @@ -302,7 +304,7 @@ class QualityActivitiesViewSet(ImpMixin, RbacFilterSet, CreateUpdateCustomMixin, return self.gen_imp_view(request, 2, QualityActivitiesSerializer) -class ContactViewSet(CreateUpdateCustomMixin, ModelViewSet): +class ContactViewSet(ImpMixin, CreateUpdateCustomMixin, ModelViewSet): queryset = Contact.objects.all() serializer_class = ContactSerializer @@ -321,18 +323,18 @@ class ContactViewSet(CreateUpdateCustomMixin, ModelViewSet): data_list = [] for row in sheet.iter_rows(min_row=start, values_only=True): # 假设第一行是表头,从第二行开始读取数据 if row[0] is not None: - activate_time = row[6].strftime("%Y-%m-%d") - role_dict = {"组织方":0, "参与方":1} serializer_data = { 'name': 
row[1], # 第一列是名字 - 'roles':role_dict.get(row[2]), - 'collaborators':row[3], - 'orgunits':row[4], - 'place':row[5], - 'activate_time':activate_time, - 'participations':row[7], - 'function':row[8], - 'earnings':row[9] + 'address':row[2], + 'header':row[3], + 'tel':row[4], + 'email':row[5], + 'head_technology':row[6], + 'tel_technology':row[7], + 'email_technology':row[8], + 'head_quality':row[9], + 'tel_quality':row[10], + 'email_quality':row[11], } data_list.append(serializer_data) return data_list @@ -344,6 +346,23 @@ class ContactViewSet(CreateUpdateCustomMixin, ModelViewSet): """ return self.gen_imp_view(request, 2, ContactSerializer) + #解析pdf到excel + @action(detail=False, methods=['post']) + @transaction.atomic + def parse_pdf(self, request, *args, **kwargs): + """ + 解析pdf到excel + """ + try: + pdf_file = request.data['pdf_file'] + excel_file = request.data['excel_file'] + # 读数据路径copy 在media 下新建excel,解析完成后存入数据库。 + run(pdf_file, excel_file) + except Exception: + traceback.print_exc() + return Response({"message":"解析失败"}, status = status.HTTP_400_BAD_REQUEST) + return Response({"message":"解析成功", "url":excel_file}, status = status.HTTP_200_OK) + class ExternalAuditorsViewSet(ImpMixin, RbacFilterSet, CreateUpdateCustomMixin, ModelViewSet): queryset = ExternalAuditors.objects.all() diff --git a/server/pdf2txt.txt b/server/pdf2txt.txt new file mode 100644 index 0000000..e69de29 diff --git a/server/utils/pdf2txt.py b/server/utils/pdf2txt.py new file mode 100644 index 0000000..03108de --- /dev/null +++ b/server/utils/pdf2txt.py @@ -0,0 +1,387 @@ +""" +Author: tianyang.zhang +Date: 2024-04-25 +param: {pdf_file:输入PDF文件路径,excel_file:输入Excel文件路径} +description: 提取PDF文件中的文字信息,并将其保存到Excel文件中,正则表达式提取信息。 + +""" +import re +import fitz # PyMuPDF +import re +from openpyxl import load_workbook + +RE_LIST = [ + [r"法定代表人(单位负责人):(.*?)\n", "C4"], + [r"法定代表人(单位负责人):(.*?)\n", "C22"], + [r"本机构名称:(.*?)\n","C5"], + [r"单位注册地址及行政区划 (.*?)\n","C8"], + [r" 单位所在地址及行政区划 (.*?)\n","C9"], + 
[r"执行会计制度:(.*?)\n","C7"], + [r"单位类型\(01-1\): (.*?)\n", "C16"], + [r"法人单位名称\(02-1\):(.*?)\n", "C17"], + [r"4\):(.*?)\n", "F16"], + [r"法人单位行业分类代码(四位数字):(.*?)\n", "F17"], + [r"固定电话:(.*?)\n", "F24"], + [r"传真号码:(.*?)\n", "F25"], + [r"电子邮箱:(.*?)\n", "F26"], + [r"登记注册类型:(.*?)\n", "F28"], + [r"所属管辖区:(.*?)\n", "C27"], + [r"机构类型:(.*?)\n", "C29"], + [r"是否由事业单位转企改制而来:(.*?)\n", "F29"], + [r"企业控股情况:(.*?)\n", "C32"], + [r"营业状态:(.*?)\n", "C33" ], + [r"是否高新技术企业认定:(.*?)\n", "C35"], + [r"认定或复审高新技术企业证书编号:(.*?)\n", "C36"], + [r"高新技术企业的日期:(.*?)\n", "F36"], + [r"是否境内上市和在新三板挂牌?(.*?)\n", "C38"], + [r"上市挂牌时间:(.*?)\n", "H39"], + [r"保费支出(\d+\.\d+?)万元\n", "F41"], + [r"检验检测机构责\n任险:\n(.*?)\n", "C41"], + [r"检验检测机构责\n任险:\n(.*?)\n", "C41"], + [r"检验检测人员职\n业责任险:\n(.*?)\n", "C42"], + [r"保费支出(\d+\.\d+?)万元\n", "F42"], + [r"企业集团:(.+?)\n", "C30"], + [r"领域一(.+?)\n", "C52"], + [r"领域二(.+?)\n", "C53"], + [r"领域三(.+?)\n", "C54"], + [r"领域四(.+?)\n", "C55"], + [r"领域五(.+?)\n", "C56"], + [r"其他领域描述:(.+?)\n", "C58"], + [r"检验检测服务关键词:(.*?)\n", "C60"], + [r"本检验检测机构当年接受各类技术评审:(.*?)次", "E85"], + [r"接受资质认定技术\n评审:\n(.*?)次", "E86"], + [r"管部门组织检查:\n(.*?)次", "E87"], + [r"技术监督部门(市\n场监管部门)组织\n评审:\n(.*?)次", "E88"], + [r"接受行业管理部门\n组织技术评审:\n(.*?)次", "E89"], + [r"其中国家行业管\n理部门组织评审:\n(.+?)次", "E90"], + [r"本检验检测机构当年接受各类监督检查(.*?)项", "E94"], + [r"接受资质认定专项监督检查:\n(.*?)项", "E95"], + [r"其中国家市场监管部门组织检查:\n(.*?)项", "E96"], + [r"省级及以下质量技术监督部门(市场监管部门)\n组织检查:\n(.*?)项", "E97"], + [r"接受行业管理部门监督检查:\n(.*?)项", "E98"], + [r"其中国家行业管理部门组织检查:\n(.*?)项", "E99"], + [r"省级及以下行业管理部门组织检查:\n(.*?)项", "E100"], + [r"接受其他社会组织、团体及境内外评价机构监督检\n查:\n(.*?)项", "E101"], + [r"本单位是否处于国家检验检测认证公共服务平台示范区:(.*?)\n", "E102"], + [r"本单位处于工业园区(开发区):(.*?)\n", "E103"], + [r"是否加入国内外产业联盟:(.*?)\n", "C104"], + [r"产业联盟1名称:(.*?)\n", "C105"], + [r"产业联盟2名称:(.*?)\n", "E105"], + [r"产业联盟3名称:(.*?)\n", "C106"], + [r"产业联盟4名称:(.*?)\n", "E106"], + [r"是否加入国内外行业协会组织:(.*?)\n", "E107"], + [r"存货\n01\n(.*?)\n", "D111"], + [r"固定资产原值\(原价\)\n02\n(.*?)\n", "D112"], + [r"累计折旧\n03\n(.*?)\n", "D113"], + 
[r"其中,本年折旧\n04\n(.*?)\n", "D114"], + [r"资产总计\n05\n(.*?)\n", "D115"], + [r"负债合计\n06\n(.*?)\n", "D116"], + [r"营业收入\n07\n(.*?)\n", "D117"], + [r"营业成本\n08\n(.*?)\n", "D118"], + [r"营业税金及附加\n09\n(.*?)\n", "D119"], + [r"销售费用\n10\n(.*?)\n", "D120"], + [r"管理费用\n11\n(.*?)\n", "D121"], + [r"其中,税金\n12\n(.*?)\n", "D122"], + [r"差旅费\n13\n(.*?)\n", "D123"], + [r"财务费用\n14\n(.*?)\n", "D124"], + [r"其中,利息净支出\n15\n(.*?)\n", "D125"], + [r"资产减值损失\n16\n(.*?)\n", "D126"], + [r"公允价值变动收益\n17\n(.*?)\n", "D127"], + [r"投资收益\n18\n(.*?)\n", "D128"], + [r"其他收益\n19\n(.*?)\n", "D129"], + [r"营业利润\n20\n(.*?)\n", "D130"], + [r"营业外收入\n21\n(.*?)\n", "D131"], + [r"营业外支出\n23\n(.*?)\n", "D132"], + [r"营业外支出\n23\n(.*?)\n", "D132"], + [r"利润总额\n24\n(.*?)\n", "D133"], + [r"净利润\n25\n(.*?)\n", "D134"], + [r"所得税费用\n26\n(.*?)\n", "D135"], + [r"应付职工薪酬\(本期贷方\n累计发生额\)\n27\n(.*?)\n", "D136"], + [r"本年应交增值税\(本期累\n计发生额\)\n28\n(.*?)\n", "D137"], + [r"检验检测业务活动类型\(可多选\)\(00-1\):(.*?)\n", "C140"], + [r"检验检测业务活动特点\(00-2\)(.*?)\n", "C141"], + [r"检验检测报告数合计:\n(.*?)份", "F144"], + [r"合格的报告数:(.*?)份\n", "F145"], + [r"其中,为省\(自治区、直辖市\)外出具\n检验检测报告数:\n(.*?)份", "F146"], + [r"行政执法或政府委托检验检测\n报告份数(.*?)份", "F147"], + [r"收入\s(\d+\.\d+)万元\(01-101-2\)\n", "I147"], + [r"其中,当年承担产品质量国家监督抽查\n工作出具检验检测报告数:\n(.*?)份", "F148"], + [r"其中,当年承担产品质量地方监督抽查\n工作出具检验检测报告数:\n(.*?)份", "F149"], + [r"其中,当年承担3C强制性认证检验检\n测报告数:\n(.*?)份", "F150"], + [r"其中,当年承担生产许可证检验检测报\n告数数:\n(.*?)份", "F151"], + [r"社会委托检验检测\n报告份数(.*?)份", "F167"], + [r"\(01-102-1\) 收入 (\d+\.\d+) 万元\(01-102-2\)\n", "I167"], + [r"司法鉴定、仲裁检验检测\n报告份数(.*?)份","F170"], + [r"01-103-1\)\n收入 (\d+\.\d+) 万元\(01-103-2\)\n","I170"], + [r"其他技术服务\n报告份数 (\d+) 份","F172"], + [r"01-104-1\)\n收入 (\d+\.\d+) 万元\(01-104-2\)\n", "I172"], + [r"是否服务制造业企业\(选是,继续填报以下\n内容\)\n(.*?)\n", "E174"], + [r"服务制造业企业的业务收入占总收入:\n(.*?)%", "E175"], + [r"服务制造业企业的业务收入较去年\n增长:\n(.*?)%", "H175"], + [r"是否服务个人消费者\(选是,继续填报以下\n内容\)\n(.*?)\n", "H176"], + [r"服务个人消费者的业务收入占总收入:\n(\d+\.\d+)%", "E177"], + [r"服务个人消费者的业务收入较去年年\n增长:\n(\d+\.\d+)%", "H177"], + 
[r"是否为高技术产业(制造业)提供检验检测服\n务\n(.*?)\n", "E178"], + [r"是否为高技术产业\(制造业\)提供检验检测服\n务\n(.*?)\n", "E187"], + [r"新一代信息技术产业,业务比重:\n(.*?)%", "E190"], + [r"高端装备,业务比重:\n(.*?)%", "E191"], + [r"新材料,业务比重:\n(.*?)%", "E192"], + [r"生物产业,业务比重:\n(.*?)%", "E193"], + [r"新能源汽车,业务比重:\n(.*?)%", "E194"], + [r"新能源产业,业务比重:\n(.*?)%", "E195"], + [r"节能环保,业务比重:\n(.*?)%", "E196"], + [r"全部仪器设备\n(.*?)台套", "D200"], + [r"其中,50万元\n以上仪器设备\n(.*?)台套", "D201"], + [r"其中,进口仪\n器设备\n(.*?) 台套", "D205"], + [r"全部仪器设备资\n产原值\n(\d+\.\d+) 万元", "D206"], + [r"其中,50万元\n以上仪器设备资产\n原值\n(\d+\.\d+) 万元", "D207"], + [r"50-100万元\n仪器设备资产\(在\n用\)原值\n(\d+\.\d+) 万元", "D208"], + [r"其中,进口仪\n器设备资产原值\n(\d+\.\d+) 万元", "D217"], + [r"与检验检测相关\n的固定资产原值\n\(设备\)\n(\d+\.\d+) 万元", "D218"], + [r"当年新增仪器设\n备\n(.*?)台套", "D219"], + [r"其中,当年新\n增50万元以上仪器\n设备\(设备\)\n(.*?)台套", "D220"], + [r" 其中,50-\n100万元仪器设备\n(.*?)台套", "D221"], + [r" 其中,200\n万以上仪器设备\n(.*?)台套", "D223"], + [r"当年新增仪器设\n备原值合计\n(.*?)万元", "D224"], + [r"其中,当年新增\n50万元以上仪器设\n备资产原值\n(.*?)万元", "D225"], + [r"其中,50-100\n万元仪器设备资产\n原值\n(.*?)万元", "D226"], + [r"其中,100-\n200万仪器设备资产\n原值\n(.*?)万元", "D227"], + [r"其中,200万\n元以上仪器设备资\n产原值\n(.*?)万元", "D228"], + [r"机构总面积\n(.*?)平方米", "D231"], + [r"其中办公面积(.*?)平方米", "D232"], + [r"实验室面积\n(.*?)平方米", "D233"], + [r"其中,恒温\n恒湿实验室\n(.*?)平方米", "D234"], + [r"其中,P2\n以上生物安全实验\n室\n(.*?)平方米", "D235"], + [r"其中,二恶\n英实验室\n(.*?)平方米", "D236"], + [r"其中,电磁\n屏蔽实验室\n(.*?)平方米", "D237"], + [r"其中,消声\n实验室\n(.*?)平方米", "D238"], + [r"其中,放射\n性实验室\n(.*?)平方米", "D239"], + [r" 其中,动物\n房\n(.*?)平方米", "D240"], + [r"专用室外试验\n场\n(.*?)平方米", "D241"], + [r"参数(.*?)项", "C244"], + [r"产品标准(.*?)项", "E244"], + [r"方法标准(.*?)项", "G244"], + [r"检验检测从业人员期末人数(.*?)人", "D246"], + [r"其中:研究生及\n以上学历\(03-401\)(.*?)人", "D247"], + [r"大学本科\n学历\(03-402\)\n(.*?)人", "D248"], + [r"专科及以\n下学历\(03-403\)\n(.*?)人", "D249"], + [r"其中:高级技术\n职称人员\(03-404\)\n(.*?)人", "D250"], + [r"中级技术\n职称人员\(03-405\)\n(.*?)人", "D251"], + [r"初级技术\n职称人员\(03-406\)\n(.*?)人", "D252"], + [r"具备中级\n技术职称同等水平\n的技术能力人员\n(.*?)人", "D253"], + [r"其他\(03-\n407\)\n(.*?)人", 
"D254"], + [r"其中:授权签字人\n\(03-408\)\n(.*?)人", "D255"], + [r"管理人员\n\(03-409\)\n(.*?)人", "D256"], + [r"检验检测\n\技术人员\(03-410\)\n(.*?)人", "D257"], + [r"其中:两院院士\n\(03-411\)\n(.*?)人", "D259"], + [r"选人员\(03-413\)\n(.*?)人", "D260"], + [r"其他:\n(.*?)研发活动及相关\(03-5\)\n","D261"], + [r"当年专利申请受\n理数\n(.*?)件", "D264"], + [r"其中:当年发明\n专利申请受理数\n(.*?)件", "D265"], + [r"其中:申请欧美\n日专利\n(.*?)件", "D266"], + [r"其中:申请PCT\n国际专利\n(.*?)件", "D267"], + [r"当年专利授权书\n数\n(.*?)件", "D268"], + [r"其中,当年发明\n专利授权数\n(.*?)件", "D269"], + [r"其中:授权欧美\n日专利\n(.*?)件", "D270"], + [r"期末有效专利数\n(.*?)件", "D271"], + [r"其中:期末有效\n发明专利数\n(.*?)件", "D272"], + [r"其中:拥有境外\n授权专利\n(.*?)件", "D273"], + [r"期末拥有注册商\n标\n(.*?)件", "D274"], + [r"其中:当年注\n册商标\n(.*?)件", "D275"], + [r"其中:境外注\n册商标\n(.*?)件", "D276"], + [r"其中:驰名商\n标\n(.*?)件", "D277"], + [r"马德里商标国\n际注册申请量\n(.*?)件", "D278"], + [r"拥有软件著作权\n(.*?)件", "D279"], + [r"其中:当年获\n得软件著作权\n(.*?)件", "D280"], + [r"是否获得本年度\n国务院国家科学技\n术奖:\n(.*?)\n", "E282"], + [r"市人民政府设立的\n省级科学技术奖:\n(.*?)\n", "E284"], + [r" 其他\n(.*?)04\n","E287"], + [r"主要服务地域\(04-1\):(.*?)\n", "D289"], + [r"主要客户类型\(可多选\)\(04-2\):(.*?)\n", "D290"], + [r"\s+科研项目总计(.*?)项", "D293"], + [r"其中,国家级项目(.*?)项", "D294"], + [r"其中,省部级项目(.*?)项", "D295"], + [r"科研经费总计(.*?)万元", "D296"], + [r"其中,国家级项目(.*?)万元", "D297"], + [r"其中,省部级项目(.*?)万元", "D298"], + [r"标准制修订经费总计(.*?)万元", "D300"], + [r"其中,国家标准(.*?)项", "D301"], + [r"其中,行业标准(.*?)项", "D302"], + [r"其中,地方标准(.*?)项", "D303"], + [r"其中,国际标准(.*?)项", "D304"], + [r"其中,国际标准(.*?)项", "D304"], + [r"本机构人员是否在认证认可、检验检测相关领域国际标准化组织任职(.*?)\(05-202\)", "D305"], + [r"人员姓名:(.*?)\(05-203-1\)", "C306"], + [r"担任职务:(.*?)\(05-203-2\)", "C307"], + [r"本年度参加能力\n验证计划合计\n(.*?)项", "D313"], + [r"其中,国家级\n能力验证项目\n(.*?)项", "D314"], + [r"市场监管总局\n\(国家认监委\)能\n力验证项目\n(.*?)项", "D315"], + [r"国家有关行业\n主管部门能力验证\n项目\n(.*?)项", "D316"], + [r"省级能力验证\n项目\n(.*?)项", "D317"], + [r"国内能力验证\n提供者项目\n(.*?)项", "D318"], + [r"国际能力验证\n提供者和国家相关\n组织项目\n(.*?)项", "D319"], + [r"其他能力验证\n项目\n(.*?)项", "D320"], + [r"参加测量审核合\n计\n(.*?)项", "D321"], + 
[r"机构本年度是否发生变更(.*?)\(07-1\)", "D323"], + [r"是否工业和信息化部认定的“工业产品质量控制和技术评价实验室”(.*?)\n", "D344"], + [r"是否工业和信息化部认定的“工业产品质量控制和技术评价实验室”(.*?)\n", "D344"], + [r"实验室名称:(.*?)所属行业", "C345"], + [r"所属行业:(.*?)授牌","E345"], + [r"授牌\n年份:(.*?)\n","G345"], + [r"是否通过互联网开展检验检测业务\?(.*?)\n","D347"], + [r"单位负责人:(.*?)\n","C351"], + [r"财务负责人:(.*?)\n","E351"], + [r"填表人:(.*?)\n","H351"], + [r"单位负责人电话:(.*?)\n","C352"], + [r"财务负责人电话:(.*?)\n","E352"], + [r"填表人电话:(.*?)\n","H352"], + [r"资质认定联系人座\n机:(.*?)资质认定联系人手\n","C353"], + [r"资质认定联系人手\n机:(.*?)\n","E353"], + [r"资质认定联系人邮\n箱:(.*?)\n","H353"], + [r"资质认定联系人姓\n名:(.*?)\n","C354"], + ] +SPECIALLIST = [ + [r"开业\(成立\)时间:", "F22", 11], + [r"是否含有外资:", "C28", 2], + [r"执行企业会计准则情况:", "F31", 15], + [r"境外认可机构颁发证书:", "E68", 4], + [r"质检中\n心:", "C82", 4], + [r"组织评审:;(.*?)", "E91", 4], + [r"接受认可机构评审:(.*?)", "E92", 4], + [r"评价机构评审:", "E93", 4], + [r"行业协会1名称:", "C108", 11], + [r"行业协会2名称:", "E108", 11], + [r"其中,100-\n200万仪器设备\n", "D222", 2], + [r"是否愿意将仪器\n设备对外共享:\n", "D229", 2], + ] +COMMON_KEY = [ + [r"其他地址:(.*?)\n", ["C10", "C11", "C12"]], + [r"闲置仪器设\n备原值\n (\d+\.\d+) 万元", ["D209","D212", "D215"]] + ] +SPECIALLIST_2 = [ + [r"长途区\n(.*?)\n", "C24"], + [r"移动电\n(.*?)\n", "C25"], + [r"邮政编\n(.*?)\n", "C26"], + ] +SPECIALLIST_3 = [ + [r"收\n入\n占\n比\n(.*?)\n", ["E52", "E53", "E54", "E55", "E56"]], + [r"业\n务\n饱\n和\n度\n(.*?)\n", ["H52", "H53", "H54", "H55", "H56"]], + [r"构资质认定证书:\n(.*?)\n", ["E61"]], + [r"颁发的资质认定证书:\n(.*?)\n", ["E62"]], + [r"国家市场监督管理部门颁发的特种设备检验检测机构核准证:\n(.*?)\n", ["E63"]], + [r"省级市场监督管理部门颁发的特种设备检验检测机构核准证:\n(.*?)\n", ["E64"]], + [r"资质、资格证书:\n(.*?)\n", ["E65"]], + [r"中国合格评定国家认可中心颁发的:\n(.*?)\n", ["E66"]], + [r"其他社会组织、团体颁发的证书:\n(.*?)\n", ["E67"]], + [r"\s+其中,个人委托检验检测报告:\n(.*?)\n", ["F168"]], + ["其中,单位委托检验检测报告:\n(.*?)\n", ["F169"]], + [r"其中,出具司法鉴定意见书:\n(.*?)\n",["F171"]], + ] +SPEMANY = [ + [r"50-100万元\n仪器设备在用\n(.*?)台套 ,闲置 (.*?) 台套 ,待报废 (.*?) 台套", ["D202", "G202","I202"]], + [r"100-200万仪\n器设备在用\n(.*?)台套 ,闲置 (.*?) 台套 ,待报废 (.*?) 
# Extract the text of a PDF.
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*."""
    doc = fitz.open(pdf_path)
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document handle even when a page fails to extract.
        doc.close()


# Match a single value with a regular expression.
def match_text_with_regex(text, pattern, num: int):
    """Return capture group *num* of the first match of *pattern*, stripped.

    The search uses ``re.DOTALL``.  Returns ``None`` when the pattern does
    not match or when the requested group did not take part in the match.
    """
    found = re.search(pattern, text, re.DOTALL)
    if found is None:
        return None
    value = found.group(num)
    return value.strip() if value is not None else None


# Match every occurrence of a pattern.
def match_text_with_all(text, pattern):
    """Return the stripped full text of every match of *pattern* in *text*.

    NOTE(review): the whole match (``group()``) is returned, not a capture
    group, so any label text that is part of the pattern is part of each
    result -- confirm this is intended.
    """
    return [found.group().strip() for found in re.finditer(pattern, text)]


def match_text_with_group(text, pattern):
    """Return capture group 1 of every match, stripped, minus its last character.

    NOTE(review): the dropped last character is presumably a trailing unit
    or punctuation mark in the PDF text -- verify against a sample document.
    """
    return [found.group(1).strip()[:-1] for found in re.finditer(pattern, text)]


def match_many_res(text, pattern):
    """Return the capture groups of the first match of a multi-group pattern.

    Empty groups are reported as ``None``.  Returns ``''`` (an empty, falsy
    iterable) when the pattern does not match at all.
    """
    found = re.findall(pattern, text)
    if not found:
        return ''
    return [item if item else None for item in found[0]]


def match_text_with_match(text, pattern):
    """Return the text between the first and second ``:`` of the first match.

    Returns ``None`` when the pattern does not match or the matched text
    contains no ``:``, instead of raising and aborting the whole run.
    """
    found = re.search(pattern, text)
    if found is None:
        return None
    parts = found.group().strip().split(":")
    if len(parts) < 2:
        return None
    return parts[1]


# Write one matched value into the Excel file.
def fill_excel(matches, EXCEL_PATH, local):
    """Write *matches* into cell *local* of the active sheet of *EXCEL_PATH*.

    Does nothing when *matches* is empty or ``None``; the workbook is only
    opened and saved when there is a value to store.
    """
    if not matches:
        return
    wb = load_workbook(EXCEL_PATH)
    ws = wb.active
    ws[local] = matches
    wb.save(EXCEL_PATH)


def get_index(text, pattern, span):
    """Return the *span* characters that follow the first match of *pattern*.

    Returns ``None`` when the pattern does not match.
    """
    found = re.search(pattern, text)
    if found is None:
        return None
    start = found.end()
    return text[start:start + span]


def run(pdf_path, excel_path):
    """Extract fields from the PDF and write them into the Excel template.

    The PDF text is dumped to ``pdf2txt.txt`` (relative to the current
    working directory), then every pattern table of this module is applied
    and the matched values are stored in their target cells.  The workbook
    is loaded once and saved once, after all tables have been processed.

    Args:
        pdf_path: path of the PDF to parse.
        excel_path: path of the Excel workbook to fill in place.

    Returns:
        An empty string.
    """
    # Extract the PDF text and keep a dump of it.
    text = extract_text_from_pdf(pdf_path)
    with open("pdf2txt.txt", "w", encoding="utf-8") as f:
        f.write(text)

    wb = load_workbook(excel_path)
    ws = wb.active

    def put(cell, value):
        # Empty / missing matches leave the template cell untouched.
        if value:
            ws[cell] = value

    # One pattern -> one cell.
    for pattern, cell in RE_LIST:
        put(cell, match_text_with_regex(text, pattern, 1))

    # Fixed-width values that directly follow a label.
    for pattern, cell, span in SPECIALLIST:
        put(cell, get_index(text, pattern, span))

    # Labels that occur several times; zip() ignores surplus matches instead
    # of indexing past the end of the cell list.
    for pattern, cells in COMMON_KEY:
        for cell, value in zip(cells, match_text_with_all(text, pattern)):
            put(cell, value)

    for pattern, cell in SPECIALLIST_2:
        put(cell, match_text_with_match(text, pattern))

    for pattern, cells in SPECIALLIST_3:
        for cell, value in zip(cells, match_text_with_group(text, pattern)):
            put(cell, value)

    # A pattern without a match yields '' and is skipped; the remaining
    # patterns are still processed.
    for pattern, cells in SPEMANY:
        for cell, value in zip(cells, match_many_res(text, pattern)):
            put(cell, value)

    wb.save(excel_path)
    return ''


if __name__ == "__main__":
    # Raw strings: the Windows paths contain backslashes that are not escapes.
    run(r"C:\code\pdf_exc\检验检测机构数据查看页2022年.pdf", r"C:\code\pdf_exc\检验检测服务业统计数据上报任务-空表.xlsx")