# zcspider/main.py

import pandas as pd
import os
import html2text
import sys
import sqlite3

# Input/output locations, all resolved against the directory the script is run from.
current_dir = os.getcwd()
# Folder holding articles.csv / articles_full.csv and the <nickname>/<id>.md article files.
wechat_dir = os.path.join(current_dir, 'article')
# Folder whose files ana_web() loads with pd.read_excel.
web_dir = os.path.join(current_dir, 'web_dir')
# Screening rules, loaded once at import time; the analysis functions read the
# '错误表述', '建议修改词语' and '错误分类' columns of this sheet.
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')

def trans_to_json():
    """Dump the screening-rule sheet ``df_s`` to ``biao.json``.

    The sheet is serialised as a JSON array with one object per row.
    Non-ASCII characters are kept as-is and the file is written as UTF-8
    into the current working directory.
    """
    records = df_s.to_json(orient='records', force_ascii=False)
    with open('biao.json', mode='w', encoding='utf-8') as out:
        out.write(records)

def make_simple_csv_from_db():
    """Export the article list from the SQLite database to ``articles.csv``.

    Opens ``db_folder/test.db`` (relative to the working directory), left-joins
    every ``articles`` row with its ``gzhs`` row on ``biz`` and writes the
    columns ``id``, ``nickname``, ``title``, ``content_url`` and ``pub_date``
    to ``articles.csv`` inside ``wechat_dir`` (without the DataFrame index).
    ``pub_date`` is ``p_date`` rendered by SQLite from a Unix timestamp in
    local time.

    Raises:
        Whatever ``pandas.read_sql_query`` raises when the query fails; the
        database connection is closed before the error propagates.
    """
    conn = sqlite3.connect('db_folder/test.db')
    query = (
        "select id, g.nickname, a.title, a.content_url, "
        "datetime(a.p_date, 'unixepoch', 'localtime') as pub_date "
        "from articles a LEFT JOIN gzhs g on g.biz = a.biz"
    )
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Release the database handle even when the query fails.
        conn.close()
    # Write the result set to the CSV file.
    df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)


def make_wechat_articles_full():
    """Build ``articles_full.csv`` by attaching each article's Markdown text.

    Reads ``articles.csv`` from ``wechat_dir`` and, for every row, loads
    ``<wechat_dir>/<nickname>/<id>.md`` into a new ``content`` column. Rows
    whose file cannot be located keep an empty ``content`` and a
    ``...---不存在`` line is printed for them. The result is written to
    ``articles_full.csv`` in ``wechat_dir``.
    """
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
    df['content'] = ''
    for ind, nickname, article_id in zip(df.index, df['nickname'], df['id']):
        # read_csv parses all-digit ids as integers, so convert explicitly
        # before appending the extension.
        file_name = str(article_id) + '.md'
        if pd.isna(nickname):
            # The nickname cell is empty, so there is no folder to look in.
            print(file_name + '---不存在')
            continue
        full_path = os.path.join(wechat_dir, str(nickname), file_name)
        try:
            with open(full_path, encoding='utf-8') as f:
                df.at[ind, 'content'] = f.read()
        except FileNotFoundError:
            print(full_path + '---不存在')
    output_path = os.path.join(wechat_dir, 'articles_full.csv')
    # The row index is written as the first column (pandas default), as before.
    df.to_csv(output_path)

def ana_wechat():
    """Screen the WeChat articles against the rule sheet and save the hits.

    Loads ``articles_full.csv`` from ``wechat_dir`` (building it first with
    ``make_wechat_articles_full`` when it does not exist) and, for every rule
    row in ``df_s``, selects the articles whose ``content`` matches the rule's
    ``错误表述``. One output row is produced per (rule, article) hit, made of
    the article's ``nickname``, ``title`` and ``content_url`` followed by the
    rule's ``错误表述``, ``建议修改词语`` and ``错误分类``.

    When there is at least one hit the rows are written to ``ana_wechat.csv``
    in the working directory; otherwise no file is written.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()
    columns = ['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类']
    df = pd.read_csv(articles_full_path)
    # Articles without text are read back as NaN; treat them as empty strings.
    df['content'] = df['content'].fillna('')
    hits = []
    for _, rule in df_s.iterrows():
        pattern = rule['错误表述']
        if pd.isna(pattern):
            # A blank cell in the rule sheet is not a usable search pattern.
            continue
        # NOTE: str.contains interprets the pattern as a regular expression
        # by default.
        matched = df[df['content'].str.contains(pattern)]
        for _, article in matched.iterrows():
            hits.append([
                article['nickname'],
                article['title'],
                article['content_url'],
                pattern,
                rule['建议修改词语'],
                rule['错误分类'],
            ])
    if hits:
        # Build the frame once instead of growing it one row at a time.
        df_a = pd.DataFrame(hits, columns=columns)
        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')

def ana_web():
    """Screen the crawled web pages against the rule sheet and save the hits.

    Every non-empty regular file in ``web_dir`` is loaded with
    ``pd.read_excel``. For each rule row in ``df_s`` the rows whose ``text``
    matches the rule's ``错误表述`` are selected (missing text never matches).
    One output row is produced per (rule, page) hit, made of the page's
    ``group``, ``name`` and ``url`` followed by the rule's ``错误表述``,
    ``建议修改词语`` and ``错误分类``; each hit is also printed.

    When there is at least one hit the rows are written to ``ana_web.csv`` in
    the working directory; otherwise no file is written.
    """
    columns = ['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类']
    hits = []
    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        # Sub-directories and zero-byte files cannot be read as workbooks.
        if not os.path.isfile(full_path) or os.path.getsize(full_path) == 0:
            continue
        df = pd.read_excel(full_path)
        for _, rule in df_s.iterrows():
            pattern = rule['错误表述']
            if pd.isna(pattern):
                # A blank cell in the rule sheet is not a usable search pattern.
                continue
            # NOTE: str.contains interprets the pattern as a regular
            # expression by default.
            matched = df[df['text'].str.contains(pattern, na=False)]
            for _, page in matched.iterrows():
                alist = [
                    page['group'],
                    page['name'],
                    page['url'],
                    pattern,
                    rule['建议修改词语'],
                    rule['错误分类'],
                ]
                print(alist)
                hits.append(alist)
    if hits:
        # Build the frame once instead of growing it one row at a time.
        df_a = pd.DataFrame(hits, columns=columns)
        df_a.to_csv('ana_web.csv', encoding='utf-8_sig')

if __name__ == "__main__":
    # The first command-line argument selects the analysis: 'wechat' runs the
    # WeChat article screening, anything else (or no argument) runs the web one.
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    run = ana_wechat if mode == 'wechat' else ana_web
    run()