# zcspider/main.py
# pandas for tabular I/O; html2text converts saved WeChat HTML to plain text.
import pandas as pd
import os
import html2text
# Directory layout, all relative to the current working directory.
current_dir = os.getcwd()
wechat_dir = os.path.join(current_dir, 'wechat_dir')  # saved WeChat article pages
web_dir = os.path.join(current_dir, 'web_dir')  # crawled website exports (.xlsx)
output_dir = os.path.join(current_dir, 'summary')  # where the summary workbook goes
# Screening table: phrases to flag, suggested replacements and categories.
# NOTE(review): loaded at import time — biao.xlsx must exist in the CWD.
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
def trans_to_json(df=None, output_path='biao.json'):
    """Serialize the screening table to a JSON file of records.

    Args:
        df: DataFrame to serialize. Defaults to the module-level ``df_s``
            screening table when omitted (preserves the original behavior).
        output_path: Destination file path. Defaults to ``biao.json``.
    """
    if df is None:
        df = df_s
    # force_ascii=False keeps Chinese text readable instead of \u escapes.
    json_str = df.to_json(orient='records', force_ascii=False)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(json_str)
def make_wechat_articles_full():
    """Build ``articles_full.csv`` in ``wechat_dir``.

    Joins the WeChat article index (articles.xlsx) with the plain text
    extracted from each article's saved HTML page. Articles whose HTML
    file is missing keep an empty ``content`` and are reported to stdout.
    """
    df = pd.read_excel(os.path.join(wechat_dir, 'articles.xlsx'))
    df['content'] = ''
    # One converter for all rows; ignore_links drops hyperlink markup so
    # only the visible text remains.
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    for ind, row in df.iterrows():
        # str() guards against numeric article ids (int + str raised
        # TypeError under the old bare except and was silently swallowed).
        full_path = os.path.join(wechat_dir, row['nickname'], str(row['id']) + '.html')
        try:
            with open(full_path, encoding='utf-8') as f:
                df.at[ind, 'content'] = converter.handle(f.read())
            print(f'{ind}--{row["nickname"]}--{row["title"]}')
        except OSError:
            # Missing or unreadable HTML file: best-effort, keep going.
            print(full_path + '---不存在')
    output_path = os.path.join(wechat_dir, 'articles_full.csv')
    # utf-8-sig writes a BOM so Excel opens the CSV with correct encoding.
    df.to_csv(output_path, encoding='utf-8-sig')
def ana_wechat(df=None, screening=None):
    """Scan WeChat article contents for flagged phrases.

    Args:
        df: Articles DataFrame with columns ``nickname``, ``title``,
            ``content`` and ``content_url``. When omitted, loads (building
            it first if necessary) ``articles_full.csv`` from ``wechat_dir``
            — the original behavior.
        screening: Screening-rule DataFrame with columns ``错误表述``,
            ``建议修改词语`` and ``错误分类``. Defaults to module-level ``df_s``.

    Returns:
        A list of rows; the first row is the Chinese column headers, each
        following row is one (article, flagged phrase) match.
    """
    if screening is None:
        screening = df_s
    if df is None:
        articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
        if not os.path.exists(articles_full_path):
            make_wechat_articles_full()
        df = pd.read_csv(articles_full_path)
    df['content'] = df['content'].fillna('')
    output_data = []
    index = 1
    for _, rule in screening.iterrows():
        # regex=False: match the phrase literally — phrases containing
        # regex metacharacters (e.g. '(', '+') were mis-matched before.
        mask = df['content'].str.contains(rule['错误表述'], regex=False)
        for _, hit in df[mask].iterrows():
            output_data.append([
                index,
                hit['nickname'],
                hit['title'],
                rule['错误表述'],
                rule['建议修改词语'],
                rule['错误分类'],
                hit['content_url'],
            ])
            index += 1
    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    return output_data
def ana_web(directory=None, screening=None):
    """Scan crawled web-page exports for flagged phrases.

    Args:
        directory: Directory containing one Excel export per source, each
            with columns ``name``, ``text`` and ``content_url``. Defaults
            to the module-level ``web_dir`` (original behavior).
        screening: Screening-rule DataFrame with columns ``错误表述``,
            ``建议修改词语`` and ``错误分类``. Defaults to module-level ``df_s``.

    Returns:
        A list of rows; the first row is the Chinese column headers, each
        following row is one (page, flagged phrase) match.
    """
    if directory is None:
        directory = web_dir
    if screening is None:
        screening = df_s
    output_data = []
    index = 1
    for file in os.listdir(directory):
        full_path = os.path.join(directory, file)
        if os.path.getsize(full_path) <= 0:
            continue  # skip empty placeholder files
        df = pd.read_excel(full_path)
        for _, rule in screening.iterrows():
            # regex=False: literal match (metacharacter-safe); na=False
            # keeps rows with missing text out of the mask.
            mask = df['text'].str.contains(rule['错误表述'], regex=False, na=False)
            for _, hit in df[mask].iterrows():
                output_data.append([
                    index,
                    hit['name'],
                    "文章标题",  # placeholder: exports carry no title column
                    rule['错误表述'],
                    rule['建议修改词语'],
                    rule['错误分类'],
                    hit['content_url'],
                ])
                index += 1
    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    return output_data
# ---- Script entry: run both analyses and write a two-sheet Excel summary ----
# Run WeChat analysis
wechat_results = ana_wechat()
# Run web content analysis
web_results = ana_web()
# Both analyses return a list of rows whose first row is the header line.
# They must be converted to DataFrames before writing: the original code
# called .to_excel directly on the lists, which raises AttributeError.
wechat_df = pd.DataFrame(wechat_results[1:], columns=wechat_results[0])
web_df = pd.DataFrame(web_results[1:], columns=web_results[0])
# Ensure the summary directory exists before opening the writer.
os.makedirs(output_dir, exist_ok=True)
output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
with pd.ExcelWriter(output_excel_path) as writer:
    wechat_df.to_excel(writer, sheet_name='公众号', index=False)
    web_df.to_excel(writer, sheet_name='网站', index=False)
print("Analysis completed and results saved to Excel.")