"""Reporting helpers for WeChat official-account and website article data.

The module reads article metadata from a local SQLite database and from the
website CMS database, counts contributions per unit, fills several Excel
templates, and scans article text against a sheet of screening rules.
"""
import os
import re
import sqlite3
from datetime import datetime
from urllib.parse import urlparse

import numpy as np
import pandas as pd
from openpyxl import load_workbook

from mycode.base import BASE_DIR

wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
output_dir = os.path.join(BASE_DIR, 'summary')
# Screening rules; loaded once at import time and shared by the scanners.
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')

# Biz id of the headquarters account.
_ZY_BIZ = "MzIzMDU4Njg3MA=="

# (unit name, official-account biz id); an empty biz means no account is tracked.
_UNIT_BIZ = [
    ("中国建材总院", _ZY_BIZ),
    ("瑞泰科技", "MzU0MzgwMzg1NA=="),
    ("国检集团", "MzI1MjYzNDQ3NA=="),
    ("中材高新", "MzA5MDkzNDA0NQ=="),
    ("哈玻院", "Mzg2MDg0NjkwNw=="),
    ("中国新材院", "MzI3MTY5NTExNA=="),
    ("秦皇岛院", "MzI1MzY1Njg5MQ=="),
    ("西安墙材院", "MzIxOTQwNjE2MQ=="),
    ("咸阳陶瓷院", "Mzg3OTI0NTYzMA=="),
    ("钟表所", "MzA3NTU5NjM2MA=="),
    ("总院北分", ""),
    ("中岩科技", "Mzg2NDgyMDM3OA=="),
    ("水泥新材院", ""),
    ("中建材科创院", "MzA5NTQ5MjY4Nw=="),
    ("科建苑", ""),
    ("办公室(董事会办公室)", ""),
    ("党委组织部/人力资源部", ""),
    ("财务部", ""),
    ("科技部", ""),
    ("投资部", ""),
    ("企业管理部、安全环保部", ""),
    ("党群部/宣传统战部", ""),
    ("党风办/巡察办、纪委综合室", ""),
    ("监督执纪室", ""),
    ("审计办公室", ""),
]

# Units written to the score sheet and to rows 14-27 of each period sheet,
# in template order.
_SUBSIDIARIES = ["瑞泰科技", "国检集团", "中材高新", "哈玻院", "中国新材院",
                 "秦皇岛院", "西安墙材院", "咸阳陶瓷院", "钟表所", "总院北分",
                 "中岩科技", "水泥新材院", "中建材科创院", "科建苑"]

# Departments written to the period sheets, rows 29-33, by target column.
_DEPARTMENTS_COL2 = ["办公室(董事会办公室)", "党委组织部/人力资源部", "财务部",
                     "科技部", "投资部"]
_DEPARTMENTS_COL7 = ["企业管理部、安全环保部", "党群部/宣传统战部",
                     "党风办/巡察办、纪委综合室", "监督执纪室", "审计办公室"]

_PERIODS = ['1月', '2月', '3月', '4月', '5月', '6月', '7月', '8月', '9月',
            '10月', '11月', '12月', '上半年', '下半年', '全年']

# Columns returned by the website query; used to build an empty fallback frame.
_WEB_COLUMNS = ['id', 'pub_year', 'pub_month', 'pub_day', 'title', 'source',
                'hits', 'bankuai', 'src']

# SECURITY NOTE(review): these credentials are committed in source. They are
# kept as the default so existing deployments keep working, but the DSN can
# be overridden with the CBMA_WEB_DSN environment variable; the password
# should be rotated and removed from the repository.
_DEFAULT_WEB_DSN = "dbname={} user={} password={} host={} port={}".format(
    'edn_cms', 'auditor', 'Lde78B3_cbma', '10.65.253.10', '54321')

_SOURCE_PATTERN = re.compile('来源丨(.*?)\n')


def fix_url_scheme(url, default_scheme='http'):
    """Return *url* with ``default_scheme://`` prepended when it has no http(s) scheme."""
    if not url.startswith(('http://', 'https://')):
        url = f'{default_scheme}://{url}'
    return url


def trans_to_json():
    """Dump the screening-rule sheet to ``biao.json`` in the working directory."""
    json_str = df_s.to_json(orient='records', force_ascii=False)
    with open('biao.json', 'w', encoding='utf-8') as f:
        f.write(json_str)


def make_simple_csv_from_db(now: datetime):
    """Export articles of recently updated accounts to ``articles.csv``.

    Selects articles whose account ``updated_at`` is later than midnight of
    the date in *now* and writes them to ``<wechat_dir>/articles.csv``.

    NOTE(review): the original comment said "current month", but the cutoff
    is built with ``%Y-%m-%d``, i.e. the start of the current day. The code's
    behaviour is kept; confirm which one is intended.
    """
    cutoff = now.strftime('%Y-%m-%d 00:00:00')
    query = """
        SELECT
            id,
            g.nickname,
            a.title,
            a.content_url,
            datetime( a.p_date, 'unixepoch', 'localtime' ) AS pub_date,
            datetime( g.updated_at, 'unixepoch', 'localtime' ) AS g_updated_at
        FROM articles a
        LEFT JOIN gzhs g ON g.biz = a.biz
        WHERE g_updated_at > ?
    """
    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
    try:
        df = pd.read_sql_query(query, conn, params=[cutoff])
    finally:
        # Close the connection even when the query fails.
        conn.close()
    df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)


def float_to_int(value):
    """Return ``int(value)`` when the conversion works, otherwise *value* unchanged."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # NaN, infinities, None and non-numeric strings are passed through.
        return value


def _connect_web_db():
    """Open a connection to the website CMS database."""
    import psycopg2  # imported lazily, as in the original function body
    return psycopg2.connect(os.environ.get('CBMA_WEB_DSN', _DEFAULT_WEB_DSN))


def _load_wechat_articles(year):
    """Return the articles of all tracked accounts published in *year*."""
    biz_list = [biz for _, biz in _UNIT_BIZ]
    placeholders = ', '.join('?' for _ in biz_list)
    query = f'''
        SELECT
            id,
            g.biz as gbiz,
            strftime('%Y', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_year,
            strftime('%m', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_month,
            strftime('%d', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_day,
            g.nickname,
            a.title,
            a.content_url,
            a.read_num
        FROM articles a
        LEFT JOIN gzhs g ON g.biz = a.biz
        WHERE pub_year = ?
          AND g.biz in ({placeholders})
        ORDER BY pub_year, pub_month, pub_day;
    '''
    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
    try:
        return pd.read_sql_query(query, conn, params=[str(year), *biz_list])
    finally:
        conn.close()


def _load_web_articles(year):
    """Return the website articles of *year*, or an empty frame on failure.

    The lookup is best effort: any error is printed and an empty frame with
    the expected columns is returned so the wechat statistics still work.
    """
    # NOTE(review): ``bankuai`` is the select-list alias of ``t.title``; not
    # every PostgreSQL-compatible server accepts an alias in WHERE. Kept
    # as in the original query; confirm against the target server.
    query_web = """
        SELECT
            a_outer.id,
            TO_CHAR(a_outer.ctime, 'YYYY') AS pub_year,
            TO_CHAR(a_outer.ctime, 'MM') AS pub_month,
            TO_CHAR(a_outer.ctime, 'DD') AS pub_day,
            a_outer.title,
            a_outer.source,
            a_outer.hits,
            t.title as bankuai,
            a_outer.src
        FROM "a_article" a_outer
        left join (
            select id, title, father, path
            from a_article
            where father in (20110528, 19080024)
        ) t on a_outer.father = t.id
        WHERE a_outer.TYPE = 3
          and a_outer.deleted is NULL
          and EXTRACT ( YEAR FROM a_outer.ctime ) = %s
          and bankuai is not NULL
        ORDER BY a_outer.ctime;
    """
    conn_web = None
    try:
        conn_web = _connect_web_db()
        return pd.read_sql_query(query_web, conn_web, params=(int(year),))
    except Exception as e:
        print(e)
        return pd.DataFrame(columns=_WEB_COLUMNS)
    finally:
        if conn_web is not None:
            conn_web.close()


def _attach_wechat_sources(df):
    """Add a ``source`` column read from the headquarters articles' markdown files.

    Only rows of the headquarters account are looked up; every other row,
    and every row whose file is missing or has no source line, gets ``''``.
    """
    df['source'] = ''
    for ind, row in df.iterrows():
        if row['gbiz'] != _ZY_BIZ:
            continue
        full_path = os.path.join(wechat_dir, row['nickname'], f"{row['id']}.md")
        try:
            with open(full_path, encoding='utf-8') as f:
                content = f.read()
        except FileNotFoundError:
            print(full_path + '---不存在')
            continue
        found = _SOURCE_PATTERN.search(content)
        if found:
            # Collapse any run of whitespace between names into one separator.
            df.at[ind, 'source'] = re.sub(r'[\xa0\s]+', '、', found.group(1).strip())


def _period_mask(frame, period):
    """Return a boolean mask of the rows of *frame* published within *period*."""
    if period == '全年':
        return pd.Series(True, index=frame.index)
    if period == '上半年':
        months = ['01', '02', '03', '04', '05', '06']
    elif period == '下半年':
        months = ['07', '08', '09', '10', '11', '12']
    else:
        months = [period.replace('月', '').zfill(2)]
    return frame['pub_month'].isin(months)


def _unit_mask(source, unit):
    """Return a mask of the rows whose source mentions any '、'-separated part of *unit*."""
    mask = pd.Series(False, index=source.index)
    for part in unit.split('、'):
        # Unit names are literal text, not patterns.
        mask = mask | source.str.contains(part, regex=False)
    return mask


def _top_article(rows):
    """Describe the most-read article of *rows* as a ``***``-joined string."""
    top = rows.loc[rows['read_num'].idxmax()]
    parts = [top['id'], top['title'], top['read_num'],
             f'{top["pub_year"]}-{top["pub_month"]}-{top["pub_day"]}',
             top['source']]
    return '***'.join(str(part) for part in parts)


def _split_top_article(text):
    """Split a ``_top_article`` string into (title, read_num, pub_date, source)."""
    # Split from both ends so a title containing '***' stays in one piece.
    _, rest = text.split('***', 1)
    title, read_num, pub_date, source = rest.rsplit('***', 3)
    return title, read_num, pub_date, source


def _build_unit_stats(df, df_web):
    """Build the per-unit statistics table from wechat and website articles."""
    source = df['source']
    # NULL website sources never match a unit; fill them so masks stay boolean.
    web_source = df_web['source'].fillna('')
    from_zy = df['gbiz'] == _ZY_BIZ
    reads = df['read_num']
    # Period masks do not depend on the unit, so compute them once.
    period_masks = {p: (_period_mask(df, p), _period_mask(df_web, p))
                    for p in _PERIODS}

    records = []
    for unit, biz in _UNIT_BIZ:
        named = source.str.contains(unit, regex=False)
        web_named = web_source.str.contains(unit, regex=False)
        record = {
            '单位': unit,
            '公众号Biz': biz,
            '供总院全年专稿数': int(((source == unit) & from_zy).sum()),
            '供总院网站全年专稿数': int((web_source == unit).sum()),
            '供总院全年组稿数': int((named & (source != unit) & from_zy).sum()),
            '供总院全年网站组稿数': int((web_named & (web_source != unit)).sum()),
            '供总院全年阅读10000及以上数': int(
                ((reads >= 10000) & named & from_zy).sum()),
            '供总院全年阅读5000及以上数': int(
                ((reads >= 5000) & (reads < 10000) & named & from_zy).sum()),
            '供总院全年阅读1000及以上数': int(
                ((reads >= 1000) & (reads < 5000) & named & from_zy).sum()),
        }

        mentions = _unit_mask(source, unit)
        web_mentions = _unit_mask(web_source, unit)
        for period in _PERIODS:
            in_period, web_in_period = period_masks[period]
            record[f'供总院{period}稿数'] = int(
                (mentions & in_period & from_zy).sum())
            record[f'供总院网站{period}稿数'] = int(
                (web_mentions & web_in_period).sum())
            if biz:
                own = in_period & (df['gbiz'] == biz)
                published = int(own.sum())
                record[f'{period}发布数'] = published
                record[f'{period}最高点击文章'] = (
                    _top_article(df[own]) if published else '')
            else:
                # No tracked account: zero posts and no top article.
                record[f'{period}发布数'] = 0
                record[f'{period}最高点击文章'] = np.nan
            # Website total for the period; it does not depend on the unit.
            record[f'总院网站{period}发布数'] = int(web_in_period.sum())
        records.append(record)

    df_fx = pd.DataFrame(records)
    # DataFrame.applymap is deprecated in newer pandas in favour of map.
    elementwise = df_fx.map if hasattr(df_fx, 'map') else df_fx.applymap
    return elementwise(float_to_int)


def _write_article_workbook(year, df, df_web):
    """Fill the headquarters article-list template and return the saved path."""
    workbook = load_workbook(os.path.join(BASE_DIR, 'summary/template_cbma.xlsx'))

    sheet = workbook['公众号更新数']
    sheet.cell(row=1, column=1,
               value=f'关于{year}年度中国建材总院新媒体更新情况明细表\n(官微)')
    zy_rows = df[df['gbiz'] == _ZY_BIZ]
    for offset, (_, row) in enumerate(zy_rows.iterrows()):
        target = offset + 3  # data starts on the third sheet row
        sheet.cell(row=target, column=1, value=str(offset + 1))
        sheet.cell(row=target, column=2,
                   value=f'{row["pub_year"]}-{row["pub_month"]}-{row["pub_day"]}')
        sheet.cell(row=target, column=3, value=row['title'])
        sheet.cell(row=target, column=4, value=row['source'])
        sheet.cell(row=target, column=6, value=row['read_num'])
        sheet.cell(row=target, column=7, value=row['id'])
        sheet.cell(row=target, column=8, value=row['content_url'])

    sheet_web = workbook['官方网站更新数']
    sheet_web.cell(row=1, column=1,
                   value=f'关于{year}年度中国建材总院新媒体更新情况明细表\n(网站)')
    for offset, (_, row) in enumerate(df_web.iterrows()):
        target = offset + 3
        sheet_web.cell(row=target, column=1, value=str(offset + 1))
        sheet_web.cell(row=target, column=2,
                       value=f'{row["pub_year"]}-{row["pub_month"]}-{row["pub_day"]}')
        sheet_web.cell(row=target, column=3, value=row['title'])
        sheet_web.cell(row=target, column=4, value=row['source'])
        sheet_web.cell(row=target, column=5, value=row['bankuai'])

    cbma_path = os.path.join(BASE_DIR, f'summary/{year}年_总院文章.xlsx')
    workbook.save(cbma_path)
    return cbma_path


def _write_score_workbook(year, df_fx):
    """Fill the scoring template and return the saved path."""
    workbook = load_workbook(
        os.path.join(BASE_DIR, 'summary/template_cbma_cal.xlsx'))
    sheet = workbook['打分表']
    sheet.cell(row=1, column=1, value=f'中国建材总院宣传工作计分表({year}年度)')

    units = df_fx['单位'].to_list()
    # Sheet row -> statistics column written into it.
    score_rows = [
        (6, '供总院全年专稿数'),
        (7, '供总院网站全年专稿数'),
        (10, '供总院全年组稿数'),
        (12, '供总院全年阅读10000及以上数'),
        (13, '供总院全年阅读5000及以上数'),
        (14, '供总院全年阅读1000及以上数'),
    ]
    for position, unit in enumerate(_SUBSIDIARIES):
        stats_row = units.index(unit)
        column = 5 + 2 * position  # each unit takes every second column
        for sheet_row, stat in score_rows:
            sheet.cell(row=sheet_row, column=column,
                       value=df_fx.at[stats_row, stat])

    cbma_cal_path = os.path.join(BASE_DIR, f'summary/{year}年_总院打分.xlsx')
    workbook.save(cbma_cal_path)
    return cbma_cal_path


def _write_month_workbook(year, df_fx):
    """Fill one sheet per period in the monthly template and return the saved path."""
    # Full unit names, for reference: "瑞泰科技股份有限公司",
    # "中国国检测试控股集团股份有限公司", "中材高新材料股份有限公司",
    # "哈尔滨玻璃钢研究院有限公司", "中国新型建材设计研究院有限公司",
    # "秦皇岛玻璃工业研究设计院有限公司", "西安墙体材料研究设计院有限公司",
    # "咸阳陶瓷研究设计院有限公司", "西安轻工业钟表研究所有限公司",
    # "中国建材总院北京分公司", "中建材中岩科技有限公司",
    # "水泥科学与新型建筑材料研究院(中研益)",
    # "中建材科创新技术研究院(山东)有限公司", "北京科建苑物业管理有限公司"
    units = df_fx['单位'].to_list()
    biz_of = dict(_UNIT_BIZ)
    workbook = load_workbook(os.path.join(BASE_DIR, 'summary/template_month.xlsx'))

    for period in _PERIODS:
        try:
            sheet = workbook[period]
        except KeyError:
            # Sheets missing from the template are cloned from the January sheet.
            sheet = workbook.copy_worksheet(workbook['1月'])
            sheet.title = period
        sheet.cell(row=1, column=1,
                   value=f'关于{year}年度中国建材总院各企业新媒体更新情况统计表\n({period})')

        # Headquarters figures.
        sheet.cell(row=4, column=3, value=df_fx.at[0, f'{period}发布数'])
        sheet.cell(row=4, column=2, value=df_fx.at[0, f'总院网站{period}发布数'])
        top = df_fx.at[units.index('中国建材总院'), f'{period}最高点击文章']
        if top:
            title, read_num, pub_date, source = _split_top_article(top)
            sheet.cell(row=7, column=2, value=title)
            sheet.cell(row=7, column=4, value=read_num)
            sheet.cell(row=7, column=5, value=pub_date)
            sheet.cell(row=7, column=6, value=source)

        # Subsidiaries occupy rows 14-27 in template order.
        for offset, unit in enumerate(_SUBSIDIARIES):
            stats_row = units.index(unit)
            target = 14 + offset
            if biz_of[unit]:
                # Units without a tracked account keep the template's cell.
                sheet.cell(row=target, column=3,
                           value=df_fx.at[stats_row, f'{period}发布数'])
            sheet.cell(row=target, column=6,
                       value=df_fx.at[stats_row, f'供总院{period}稿数'])
            sheet.cell(row=target, column=5,
                       value=df_fx.at[stats_row, f'供总院网站{period}稿数'])

        # Departments occupy rows 29-33 in two columns.
        for column, departments in ((2, _DEPARTMENTS_COL2), (7, _DEPARTMENTS_COL7)):
            for offset, unit in enumerate(departments):
                sheet.cell(row=29 + offset, column=column,
                           value=df_fx.at[units.index(unit), f'供总院{period}稿数'])

    cbma_month_path = os.path.join(BASE_DIR, f'summary/{year}年_单位月度.xlsx')
    workbook.save(cbma_month_path)
    return cbma_month_path


def get_cbma_info_from_db_and_ana(year: str = '2023'):
    """Compute the yearly statistics and write the four report workbooks.

    Args:
        year: The four-digit year to report on.

    Returns:
        A tuple of the saved paths: the raw statistics workbook, the
        headquarters article list, the score sheet and the per-period
        workbook.
    """
    df = _load_wechat_articles(year)
    df_web = _load_web_articles(year)
    _attach_wechat_sources(df)
    df_fx = _build_unit_stats(df, df_web)

    # Write the raw statistics first.
    origin_path = os.path.join(BASE_DIR, f'summary/{year}年_汇总分析.xlsx')
    df_fx.to_excel(origin_path, index=True)

    cbma_path = _write_article_workbook(year, df, df_web)
    print(f'总院{year}年文章表生成完毕!')

    cbma_cal_path = _write_score_workbook(year, df_fx)
    print(f'总院{year}年打分表生成完毕!')

    cbma_month_path = _write_month_workbook(year, df_fx)
    print(f'总院{year}年月度表生成完毕!')

    return origin_path, cbma_path, cbma_cal_path, cbma_month_path


def make_wechat_articles_full():
    """Write ``articles_full.csv``: ``articles.csv`` plus each article's markdown text."""
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
    df['content'] = ''
    for ind, row in df.iterrows():
        # Ids read back from CSV may be parsed as numbers, so format them.
        full_path = os.path.join(wechat_dir, row['nickname'], f"{row['id']}.md")
        try:
            with open(full_path, encoding='utf-8') as f:
                df.at[ind, 'content'] = f.read()
        except FileNotFoundError:
            print(full_path + '---不存在')
    output_path = os.path.join(wechat_dir, 'articles_full.csv')
    df.to_csv(output_path)


def _find_rule_hits(df, text_column):
    """Yield ``(rule, article)`` row pairs where the article text matches a rule.

    Rules come from the module-level ``df_s`` sheet. The '20大' rule is
    always skipped, and '“两学一做”学习' is not reported for texts that
    contain the full phrase '“两学一做”学习教育'. Rules whose phrase is
    missing or empty are skipped.
    """
    texts = df[text_column].fillna('').astype(str)
    for _, rule in df_s.iterrows():
        phrase = rule['错误表述']
        if not isinstance(phrase, str) or not phrase or phrase == '20大':
            continue
        # NOTE(review): phrases are matched as regular expressions, as in the
        # original code; confirm whether literal matching is intended.
        for _, article in df[texts.str.contains(phrase)].iterrows():
            if (phrase == '“两学一做”学习'
                    and '“两学一做”学习教育' in str(article[text_column])):
                continue
            yield rule, article


def ana_wechat():
    """Scan the wechat articles against the screening rules.

    Returns:
        A list of rows ``[index, nickname, title, phrase, suggestion,
        category, url]``, numbered from 1.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()
    df = pd.read_csv(articles_full_path)

    output_data = []
    for rule, article in _find_rule_hits(df, 'content'):
        index = len(output_data) + 1
        output_data.append([
            index,
            article['nickname'],
            article['title'],
            rule['错误表述'],
            rule['建议修改词语'],
            rule['错误分类'],
            article['content_url'],
        ])
        print(f'找到公众号问题{index}---{article["nickname"]}')
    return output_data


def find_title(text):
    """Return the first ``#`` and the rest of its line from *text*, or ``"/"``."""
    match = re.search(r'#\s*.*', text, re.MULTILINE)
    if match:
        return match.group(0).strip()
    return "/"


def ana_web():
    """Scan the crawled website pages against the screening rules.

    Only sites listed in ``web_sites.xlsx`` (read from the working
    directory) whose crawl workbook exists and is non-empty are analysed.

    Returns:
        A list of rows ``[index, name, title, phrase, suggestion, category,
        url]``, numbered from 1 across all sites.
    """
    output_data = []
    sites = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
    for _, site in sites.iterrows():
        address = site['地址']
        if not isinstance(address, str):
            continue  # no usable address for this site
        name = site['主办']
        domain = urlparse(fix_url_scheme(address.strip())).netloc.replace('www.', '')
        full_path = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        if not (os.path.exists(full_path) and os.path.getsize(full_path) > 0):
            continue
        pages = pd.read_excel(full_path, engine='openpyxl')
        for rule, page in _find_rule_hits(pages, 'text'):
            index = len(output_data) + 1
            output_data.append([
                index,
                page['name'],
                find_title(page['text']),
                rule['错误表述'],
                rule['建议修改词语'],
                rule['错误分类'],
                page['url'],
            ])
            print(f'找到官网问题{index}---{page["name"]}')
    return output_data


if __name__ == "__main__":
    # get_cbma_info_from_db_and_ana()
    # Ad-hoc check: print the website articles of one year, newest first.
    year = 2023
    query = """
        SELECT
            a_outer.id,
            TO_CHAR(a_outer.ctime, 'YYYY-MM-DD') AS ctime,
            a_outer.title,
            a_outer.source,
            a_outer.hits,
            t.title as bankuai,
            a_outer.src
        FROM "a_article" a_outer
        left join (
            select id, title, father, path
            from a_article
            where father in (20110528, 19080024)
        ) t on a_outer.father = t.id
        WHERE a_outer.TYPE = 3
          and a_outer.deleted is NULL
          and EXTRACT ( YEAR FROM a_outer.ctime ) = %s
          and bankuai is not NULL
        ORDER BY a_outer.ctime desc;
    """
    conn = None
    try:
        conn = _connect_web_db()
        print(pd.read_sql_query(query, conn, params=(year,)))
    except Exception as e:
        # Report the failure instead of silently discarding it.
        print(e)
    finally:
        if conn is not None:
            conn.close()