zcspider/mycode/main.py

411 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import os
import sqlite3
from mycode.base import BASE_DIR
import re
from openpyxl import load_workbook
from urllib.parse import urlparse
from datetime import datetime
import numpy as np
# Directory under BASE_DIR holding the WeChat article export: functions in this
# module read/write articles.csv there and open per-account '<id>.md' files.
wechat_dir = os.path.join(BASE_DIR, 'article')
# 'web_dir' directory under BASE_DIR.
web_dir = os.path.join(BASE_DIR, 'web_dir')
# 'summary' directory under BASE_DIR.
output_dir = os.path.join(BASE_DIR, 'summary')
# Screening rules, read at import time from the '筛查内容' sheet of biao.xlsx.
# ana_wechat/ana_web use its '错误表述', '建议修改词语' and '错误分类' columns.
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
def fix_url_scheme(url, default_scheme='http'):
    """Return *url* with an explicit scheme, adding *default_scheme* if needed.

    Args:
        url: Address that may or may not start with ``http://`` / ``https://``.
        default_scheme: Scheme name to prepend when *url* has neither.

    Returns:
        *url* unchanged when it already starts with an HTTP(S) scheme in any
        letter case; otherwise *url* prefixed with *default_scheme*.
    """
    # Scheme names are case-insensitive, so "HTTP://host" must not be
    # prefixed a second time.
    if url.lower().startswith(('http://', 'https://')):
        return url
    # Scheme-relative form ("//host/path") already carries the "//".
    if url.startswith('//'):
        return f'{default_scheme}:{url}'
    return f'{default_scheme}://{url}'
def trans_to_json():
    """Serialize the screening-rule table to ``biao.json`` in the working directory.

    The table is written as a JSON array of row objects with non-ASCII
    characters kept as-is.
    """
    # Serialize first so a failure here never leaves a truncated file behind.
    records_json = df_s.to_json(force_ascii=False, orient='records')
    with open('biao.json', mode='w', encoding='utf-8') as out_file:
        out_file.write(records_json)
def make_simple_csv_from_db(now: datetime):
    """Export recently updated official-account articles to ``articles.csv``.

    Selects the articles whose account ``updated_at`` (rendered as local
    time) is later than midnight of *now*'s date and writes them, without the
    index, to ``articles.csv`` inside ``wechat_dir``.

    Args:
        now: Reference moment; only its calendar date is used.
    """
    # NOTE(review): the variable name (and the original comment) speak of the
    # current *month*, but the format below yields midnight of the current
    # *day*. Behaviour is kept as-is; confirm which cutoff is intended.
    now_month_str = now.strftime('%Y-%m-%d 00:00:00')
    query = """
        SELECT
            id,
            g.nickname,
            a.title,
            a.content_url,
            datetime( a.p_date, 'unixepoch', 'localtime' ) AS pub_date,
            datetime( g.updated_at, 'unixepoch', 'localtime' ) AS g_updated_at
        FROM
            articles a
            LEFT JOIN gzhs g ON g.biz = a.biz
        WHERE
            g_updated_at > ?
    """
    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
    try:
        # The cutoff is bound as a parameter instead of being spliced into the SQL.
        df = pd.read_sql_query(query, conn, params=(now_month_str,))
    finally:
        # Close the connection even when the query fails.
        conn.close()
    df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)
def float_to_int(value):
    """Return ``int(value)`` when the conversion works, else *value* unchanged.

    Used to tidy numeric cells: floats are truncated to integers, numeric
    strings are parsed, and anything ``int()`` rejects (text, ``None``, NaN,
    infinity) is passed through untouched.
    """
    try:
        return int(value)
    # Only conversion failures are expected here; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, OverflowError):
        return value
def get_cbma_info_from_db_and_ana(year: str = '2023'):
    """Build the yearly WeChat statistics workbooks for *year*.

    Steps:
      1. Load every article published in *year* by the tracked official
         accounts from ``db_folder/test.db``.
      2. For articles of the headquarters account, read the saved markdown
         file and extract the credited source (text after ``来源丨``).
      3. Aggregate per unit and per period (each month, both half-years and
         the full year): contributions to the headquarters account, number
         of articles published on the unit's own account and that account's
         most-read article.
      4. Write the raw statistics and fill three Excel templates under
         ``summary/``.

    Args:
        year: Publication year; compared as text with the ``strftime('%Y')``
            value computed in SQL.

    Returns:
        Tuple ``(origin_path, cbma_path, cbma_cal_path, cbma_month_path)`` of
        the four workbooks written.

    Errors from sqlite3, pandas and openpyxl (missing database, template or
    worksheet) propagate to the caller.
    """
    # Biz id of the headquarters official account.
    zybiz = "MzIzMDU4Njg3MA=="
    periods = ['1月', '2月', '3月', '4月', '5月', '6月', '7月', '8月', '9月', '10月', '11月', '12月', '上半年', '下半年', '全年']
    first_half = ['01', '02', '03', '04', '05', '06']
    second_half = ['07', '08', '09', '10', '11', '12']
    # One row per unit; an empty Biz means the unit has no tracked account.
    df_fx = pd.DataFrame({
        "单位": ["中国建材总院",
               "瑞泰科技", "国检集团", "中材高新", "哈玻院", "中国新材院", "秦皇岛院", "西安墙材院", "咸阳陶瓷院", "钟表所", "总院北分", "中岩科技", "水泥新材院", "中建材科创院", "科建苑", "办公室(董事会办公室)", "党委组织部/人力资源部", "财务部", "科技部", "投资部", "企业管理部、安全环保部", "党群部/宣传统战部",
               "党风办/巡察办、纪委综合室", "监督执纪室", "审计办公室"],
        "公众号Biz": [zybiz, "MzU0MzgwMzg1NA==", "MzI1MjYzNDQ3NA==", "MzA5MDkzNDA0NQ==", "Mzg2MDg0NjkwNw==", "MzI3MTY5NTExNA==", "MzI1MzY1Njg5MQ==", "MzIxOTQwNjE2MQ==",
                   "Mzg3OTI0NTYzMA==", "MzA3NTU5NjM2MA==", "", "Mzg2NDgyMDM3OA==", "", "MzA5NTQ5MjY4Nw==", "", "", "", "", "", "", "", "", "", "", "", ],
    })

    # Query all articles of the listed accounts for the year, ordered by date.
    # Values are bound as parameters; only the "?" placeholders are formatted
    # into the statement.
    biz_list = df_fx["公众号Biz"].tolist()
    placeholders = ', '.join('?' for _ in biz_list)
    query_gzhs = f'''
        SELECT
            id,
            g.biz as gbiz,
            strftime('%Y', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_year,
            strftime('%m', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_month,
            strftime('%d', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_day,
            g.nickname,
            a.title,
            a.content_url,
            a.read_num
        FROM
            articles a
        LEFT JOIN
            gzhs g ON g.biz = a.biz
        WHERE
            pub_year = ?
        AND
            g.biz in ({placeholders})
        ORDER BY
            pub_year, pub_month, pub_day;
    '''
    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
    try:
        # str(year) keeps the comparison textual even if a caller passes an int.
        df = pd.read_sql_query(query_gzhs, conn, params=[str(year), *biz_list])
    finally:
        # The original referenced ``conn.close`` without calling it.
        conn.close()

    # Attach the credited source to headquarters articles. The column is
    # created up front so it exists even when no article yields a source.
    df['source'] = ''
    is_zy = df['gbiz'] == zybiz
    for ind, row in df[is_zy].iterrows():
        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
        try:
            with open(full_path, encoding='utf-8') as f:
                content = f.read()
        except FileNotFoundError:
            print(full_path + '---不存在')
            continue
        a_match = re.findall('来源丨(.*?)\n', content)
        if a_match:
            # Drop all whitespace (including non-breaking spaces) in the credit.
            df.at[ind, 'source'] = re.sub(r'[\xa0\s]+', '', a_match[0].strip())

    # Row masks per reporting period; they do not depend on the unit.
    period_masks = {}
    for i in periods:
        if i.endswith('月'):
            # '1月' -> '01', matching the zero-padded strftime('%m') value.
            period_masks[i] = df['pub_month'] == i.replace('月', '').zfill(2)
        elif i == '上半年':
            period_masks[i] = df['pub_month'].isin(first_half)
        elif i == '下半年':
            period_masks[i] = df['pub_month'].isin(second_half)
        else:  # '全年'
            period_masks[i] = pd.Series(True, index=df.index)

    # Detailed per-unit analysis.
    for ind, row in df_fx.iterrows():
        dw = row['单位']
        gbiz = row['公众号Biz']
        # Unit names are literal text, not regular expressions.
        mentions_dw = df['source'].str.contains(dw, regex=False)
        # Full-year contribution to the headquarters account.
        df_fx.at[ind, '供总院全年专稿数'] = ((df['source'] == dw) & is_zy).sum()
        df_fx.at[ind, '供总院全年组稿数'] = (mentions_dw & (df['source'] != dw) & is_zy).sum()
        df_fx.at[ind, '供总院全年阅读10000及以上数'] = ((df['read_num'] >= 10000) & mentions_dw & is_zy).sum()
        df_fx.at[ind, '供总院全年阅读5000及以上数'] = ((df['read_num'] >= 5000) & (df['read_num'] < 10000) & mentions_dw & is_zy).sum()
        df_fx.at[ind, '供总院全年阅读1000及以上数'] = ((df['read_num'] >= 1000) & (df['read_num'] < 5000) & mentions_dw & is_zy).sum()

        # A row naming several departments joined by '、' counts an article
        # when any of the parts is credited; a plain name is the one-part case.
        cons_dw = pd.Series(False, index=df.index)
        for item in dw.split('、'):
            cons_dw = df['source'].str.contains(item, regex=False) | cons_dw

        own_account = df['gbiz'] == gbiz
        for i in periods:
            cons_y_m = period_masks[i]
            df_fx.at[ind, f'供总院{i}稿数'] = (cons_dw & cons_y_m & is_zy).sum()
            if gbiz:
                # Articles on the unit's own account in this period.
                cons = cons_y_m & own_account
                cons_sum = cons.sum()
                df_fx.at[ind, f'{i}发布数'] = cons_sum
                df_fx.at[ind, f'{i}最高点击文章'] = ''
                if cons_sum:
                    published = df[cons]
                    max_read_row = published.loc[published['read_num'].idxmax()]
                    max_read_row_list = [max_read_row['id'], max_read_row['title'], str(max_read_row['read_num']), f'{max_read_row["pub_year"]}-{max_read_row["pub_month"]}-{max_read_row["pub_day"]}', max_read_row['source']]
                    df_fx.at[ind, f'{i}最高点击文章'] = '***'.join(max_read_row_list)

    # Units without an account never get a published count; treat it as 0.
    for i in periods:
        for column in (f'供总院{i}稿数', f'{i}发布数'):
            df_fx[column] = df_fx[column].fillna(0).astype(int)
    # Normalise numeric cell types.
    df_fx = df_fx.applymap(float_to_int)

    # Write the raw statistics first.
    origin_path = os.path.join(BASE_DIR, f'summary/{year}_fx.xlsx')
    df_fx.to_excel(origin_path, index=True)

    # Headquarters article list: one sheet row per article, starting at row 3.
    template_path = os.path.join(BASE_DIR, 'summary/template_cbma.xlsx')
    workbook = load_workbook(template_path)
    sheet = workbook['公众号更新数']
    sheet.cell(row=1, column=1, value=f'关于{year}年度中国建材总院新媒体更新情况明细表\n(官微)')
    for ind_zy, (_, row) in enumerate(df[is_zy].iterrows()):
        sheet_row = ind_zy + 3
        sheet.cell(row=sheet_row, column=1, value=str(ind_zy + 1))
        sheet.cell(row=sheet_row, column=2, value=f'{row["pub_year"]}-{row["pub_month"]}-{row["pub_day"]}')
        sheet.cell(row=sheet_row, column=3, value=row['title'])
        sheet.cell(row=sheet_row, column=4, value=row['source'])
        sheet.cell(row=sheet_row, column=6, value=row['read_num'])
        sheet.cell(row=sheet_row, column=7, value=row['id'])
        sheet.cell(row=sheet_row, column=8, value=row['content_url'])
    cbma_path = os.path.join(BASE_DIR, f'summary/{year}年_cbma.xlsx')
    workbook.save(cbma_path)
    print(f'总院{year}年文章表生成完毕!')

    dw_list = df_fx['单位'].to_list()

    # Scoring sheet: every second column from column 5, one per subsidiary.
    # NOTE(review): 'tempalte' looks like a typo, but the path is kept because
    # it must match the template file name on disk.
    template_cal_path = os.path.join(BASE_DIR, 'summary/tempalte_cbma_cal.xlsx')
    workbook2 = load_workbook(template_cal_path)
    need_df_list = ["瑞泰科技", "国检集团", "中材高新", "哈玻院", "中国新材院", "秦皇岛院", "西安墙材院", "咸阳陶瓷院", "钟表所", "总院北分", "中岩科技", "水泥新材院", "中建材科创院", "科建苑"]
    score_rows = (
        (6, '供总院全年专稿数'),
        (10, '供总院全年组稿数'),
        (12, '供总院全年阅读10000及以上数'),
        (13, '供总院全年阅读5000及以上数'),
        (14, '供总院全年阅读1000及以上数'),
    )
    sheet2 = workbook2['打分表']
    sheet2.cell(row=1, column=1, value=f'中国建材总院宣传工作计分表({year}年度)')
    for ind, val in enumerate(need_df_list):
        row_ind_df_fx = dw_list.index(val)
        for sheet_row, column_name in score_rows:
            sheet2.cell(row=sheet_row, column=5 + 2 * ind, value=df_fx.at[row_ind_df_fx, column_name])
    cbma_cal_path = os.path.join(BASE_DIR, f'summary/{year}年_cbma_cal.xlsx')
    workbook2.save(cbma_cal_path)
    print(f'总院{year}年打分表生成完毕!')

    # Per-period sheets.
    # (sheet row, unit, whether the unit's own published count goes in column 3)
    subsidiary_rows = (
        (14, '瑞泰科技', True),
        (15, '国检集团', True),
        (16, '中材高新', True),
        (17, '哈玻院', True),
        (18, '中国新材院', True),
        (19, '秦皇岛院', True),
        (20, '西安墙材院', True),
        (21, '咸阳陶瓷院', True),
        (22, '钟表所', True),
        (23, '总院北分', False),
        (24, '中岩科技', True),
        (25, '水泥新材院', False),
        (26, '中建材科创院', False),
        (27, '科建苑', False),
    )
    # (sheet row, sheet column, department)
    department_cells = (
        (29, 2, '办公室(董事会办公室)'),
        (30, 2, '党委组织部/人力资源部'),
        (31, 2, '财务部'),
        (32, 2, '科技部'),
        (33, 2, '投资部'),
        (29, 7, '企业管理部、安全环保部'),
        (30, 7, '党群部/宣传统战部'),
        (31, 7, '党风办/巡察办、纪委综合室'),
        (32, 7, '监督执纪室'),
        (33, 7, '审计办公室'),
    )
    template_month_path = os.path.join(BASE_DIR, 'summary/template_month.xlsx')
    workbook3 = load_workbook(template_month_path)
    for i in periods:
        try:
            sheet = workbook3[i]
        except KeyError:
            # No sheet for this period in the template: clone the January one.
            sheet = workbook3.copy_worksheet(workbook3['1月'])
            sheet.title = i
        sheet.cell(row=1, column=1, value=f'关于{year}年度中国建材总院各企业新媒体更新情况统计表\n{i}')
        # Headquarters figures.
        sheet.cell(row=4, column=3, value=df_fx.at[0, f'{i}发布数'])
        max_read_row = df_fx.at[dw_list.index('中国建材总院'), f'{i}最高点击文章']
        if max_read_row:
            _, title, read_num, pub_date, source = max_read_row.split('***')
            sheet.cell(row=7, column=2, value=title)
            sheet.cell(row=7, column=4, value=read_num)
            sheet.cell(row=7, column=5, value=pub_date)
            sheet.cell(row=7, column=6, value=source)
        # Subsidiary figures.
        for sheet_row, unit, has_published_count in subsidiary_rows:
            unit_ind = dw_list.index(unit)
            if has_published_count:
                sheet.cell(row=sheet_row, column=3, value=df_fx.at[unit_ind, f'{i}发布数'])
            sheet.cell(row=sheet_row, column=6, value=df_fx.at[unit_ind, f'供总院{i}稿数'])
        # Department figures.
        for sheet_row, sheet_column, unit in department_cells:
            sheet.cell(row=sheet_row, column=sheet_column, value=df_fx.at[dw_list.index(unit), f'供总院{i}稿数'])
    cbma_month_path = os.path.join(BASE_DIR, f'summary/{year}年_cbma_month.xlsx')
    workbook3.save(cbma_month_path)
    print(f'总院{year}年月度表生成完毕!')
    return origin_path, cbma_path, cbma_cal_path, cbma_month_path
def make_wechat_articles_full():
    """Create ``articles_full.csv`` by adding each article's markdown text.

    Reads ``articles.csv`` from ``wechat_dir``, loads
    ``<wechat_dir>/<nickname>/<id>.md`` for every row into a ``content``
    column (empty string when the file is missing, with a notice printed)
    and writes the result next to the input file.
    """
    articles = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
    bodies = []
    for nickname, article_id in zip(articles['nickname'], articles['id']):
        md_path = os.path.join(wechat_dir, nickname, article_id + '.md')
        try:
            with open(md_path, encoding='utf-8') as md_file:
                text = md_file.read()
        except FileNotFoundError:
            print(md_path + '---不存在')
            text = ''
        bodies.append(text)
    articles['content'] = bodies
    articles.to_csv(os.path.join(wechat_dir, 'articles_full.csv'))
def ana_wechat():
    """Screen the saved WeChat articles against the wording rules in ``df_s``.

    ``articles_full.csv`` is generated first when it does not exist yet.

    Returns:
        A list of rows ``[sequence number, account nickname, article title,
        wrong wording, suggested wording, error category, article URL]``,
        one per (rule, article) hit.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()
    articles = pd.read_csv(articles_full_path)
    articles['content'] = articles['content'].fillna('')
    output_data = []
    index = 1
    for _, rule in df_s.iterrows():
        wrong = rule['错误表述']
        # Blank rule cells load as NaN and would make str.contains raise.
        # The '20大' rule is never reported, so skip it before scanning.
        if not isinstance(wrong, str) or wrong == '20大':
            continue
        hits = articles[articles['content'].str.contains(wrong)]
        for _, article in hits.iterrows():
            # '“两学一做”学习教育' is the accepted full phrase, not an error.
            if wrong == '“两学一做”学习' and '“两学一做”学习教育' in article['content']:
                continue
            output_data.append([
                index,
                article['nickname'],
                article['title'],
                wrong,
                rule['建议修改词语'],
                rule['错误分类'],
                article['content_url'],
            ])
            # Report the number of the row just recorded, then advance.
            print(f'找到公众号问题{index}---{article["nickname"]}')
            index += 1
    return output_data
def ana_web():
    """Screen the crawled website pages against the wording rules in ``df_s``.

    Only the sites listed in ``web_sites.xlsx`` (sheet ``Sheet1``, read from
    the working directory) are analysed. For each site the crawl result
    ``web_dir/<主办>_<domain>.xlsx`` under ``BASE_DIR`` is loaded when it
    exists and is non-empty.

    Returns:
        A list of rows ``[sequence number, source name, "/", wrong wording,
        suggested wording, error category, page URL]``, one per (rule, page)
        hit.
    """
    output_data = []
    index = 1
    sites = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
    # Distinct names for the three nested loops: the original reused ``df``,
    # ``ind`` and ``row`` at every level.
    for _, site in sites.iterrows():
        name = site['主办']
        url = fix_url_scheme(site['地址'].strip())
        domain = urlparse(url).netloc.replace('www.', '')
        full_path = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        if not (os.path.exists(full_path) and os.path.getsize(full_path) > 0):
            continue
        pages = pd.read_excel(full_path, engine='openpyxl')
        for _, rule in df_s.iterrows():
            wrong = rule['错误表述']
            # Blank rule cells load as NaN and would make str.contains raise.
            # The '20大' rule is never reported, so skip it before scanning.
            if not isinstance(wrong, str) or wrong == '20大':
                continue
            hits = pages[pages['text'].str.contains(wrong, na=False)]
            for _, page in hits.iterrows():
                # '“两学一做”学习教育' is the accepted full phrase, not an error.
                if wrong == '“两学一做”学习' and '“两学一做”学习教育' in page['text']:
                    continue
                output_data.append([
                    index,
                    page['name'],
                    "/",
                    wrong,
                    rule['建议修改词语'],
                    rule['错误分类'],
                    page['url'],
                ])
                # Report the number of the row just recorded, then advance.
                print(f'找到官网问题{index}---{page["name"]}')
                index += 1
    return output_data
# Script entry point: build the yearly statistics workbooks using the
# function's default year.
if __name__ == "__main__":
    get_cbma_info_from_db_and_ana()