Merge branch 'master' of https://e.coding.net/ctcdevteam/zcspider/zcspider
commit d2a1dd165d

main.py
@@ -1,12 +1,11 @@
 import pandas as pd
 import os
 import html2text
-import sys
 import sqlite3
 
 current_dir = os.getcwd()
 wechat_dir = os.path.join(current_dir, 'article')
 web_dir = os.path.join(current_dir, 'web_dir')
+output_dir = os.path.join(current_dir, 'summary')
 df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
-
 def trans_to_json():
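
Note: the screening rules are loaded from biao.xlsx, which is not part of this diff; only the column names are implied by the accesses row['错误表述'], row['建议修改词语'] and row['错误分类'] further down. A minimal stand-in workbook for trying the analysis functions without the real file (all values below are hypothetical):

import pandas as pd

# hypothetical 筛查内容 (screening content) sheet; the real biao.xlsx is not in this commit
df_s = pd.DataFrame({
    '错误表述': ['示例错误词'],      # phrase to search for
    '建议修改词语': ['示例修改词'],  # suggested correction
    '错误分类': ['示例分类'],        # error category
})
df_s.to_excel('biao.xlsx', sheet_name='筛查内容', index=False)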

@@ -27,7 +26,6 @@ def make_simple_csv_from_db():
 def make_wechat_articles_full():
     df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
     df['content'] = ''
-    ind = 0
     for ind, row in df.iterrows():
         full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
         try:
@@ -35,7 +33,6 @@ def make_wechat_articles_full():
                 df.at[ind, 'content'] = f.read()
         except FileNotFoundError:
             print(full_path + '---不存在')  # '不存在' = "does not exist"
-        ind +=1
     output_path = os.path.join(wechat_dir, 'articles_full.csv')
     df.to_csv(output_path)
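
One note on the write above: df.to_csv(output_path) keeps the default index=True, so articles_full.csv gains an extra unnamed index column that the later pd.read_csv in ana_wechat() reads back as 'Unnamed: 0'. That is harmless here, but if the column is unwanted the call can be written as:

df.to_csv(output_path, index=False)  # skip the implicit row-index column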
@@ -43,55 +40,75 @@ def ana_wechat():
     articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
     if not os.path.exists(articles_full_path):
         make_wechat_articles_full()
-    df_a = pd.DataFrame(columns = ['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类'])
 
     df = pd.read_csv(articles_full_path)
     df['content'] = df['content'].fillna('')
-    ind = 0
-    need_save = False
+    # collect matches as plain rows; a header row is prepended before returning
+    output_data = []
+    index = 1
 
     for ind, row in df_s.iterrows():
         mask = df['content'].str.contains(row['错误表述'])
         result = df[mask]
         if result.empty:
             continue
         ind2 = 0
 
         if not result.empty:
             for ind2, row2 in result.iterrows():
-                alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-                df_a.loc[len(df_a.index)] = alist
-                if need_save is False:
-                    need_save = True
-                ind2 +=1
-        ind +=1
-    if need_save:
-        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')
+                output_row = [
+                    index,
+                    row2['nickname'],
+                    row2['title'],
+                    row['错误表述'],
+                    row['建议修改词语'],
+                    row['错误分类'],
+                    row2['content_url']
+                ]
+                output_data.append(output_row)
+                index += 1
+
+    # header: 序号 (no.), 信源名称 (source name), 文章标题 (article title), 错误表述 (incorrect
+    # wording), 建议修改词语 (suggested correction), 错误分类 (error category), 原文链接 (original link)
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+
+    return output_data
 
 
 def ana_web():
-    df_a = pd.DataFrame(columns = ['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类'])
-    need_save = False
+    output_data = []
+    index = 1
 
     for file in os.listdir(web_dir):
         full_path = os.path.join(web_dir, file)
         if os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(web_dir, file))
             ind = 0
             for ind, row in df_s.iterrows():
                 mask = df['text'].str.contains(row['错误表述'], na=False)
                 result = df[mask]
                 if result.empty:
                     continue
                 ind2 = 0
                 if not result.empty:
                     for ind2, row2 in result.iterrows():
-                        alist = [row2['group'], row2['name'], row2['url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-                        print(alist)
-                        df_a.loc[len(df_a.index)] = alist
-                        if need_save is False:
-                            need_save = True
-                        ind2 +=1
-                ind +=1
-    if need_save:
-        df_a.to_csv('ana_web.csv', encoding='utf-8_sig')
+                        output_row = [
+                            index,
+                            row2['name'],
+                            "文章标题",  # the web crawl keeps no article title, so a literal placeholder is written
+                            row['错误表述'],
+                            row['建议修改词语'],
+                            row['错误分类'],
+                            row2['url']  # web rows carry 'url'; 'content_url' exists only in the WeChat data
+                        ]
+                        output_data.append(output_row)
+                        index += 1
 
-if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1] == 'wechat':
-        ana_wechat()
-    else:
-        ana_web()
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+
+    return output_data
+
+
+# Run WeChat analysis
+wechat_results = ana_wechat()
+
+# Run web content analysis
+web_results = ana_web()
+
+# Save the results in one Excel file with two sheets
+os.makedirs(output_dir, exist_ok=True)  # ensure the summary directory exists
+output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
+with pd.ExcelWriter(output_excel_path) as writer:
+    # ana_wechat()/ana_web() return lists of rows (header first), so build DataFrames before writing
+    pd.DataFrame(wechat_results[1:], columns=wechat_results[0]).to_excel(writer, sheet_name='公众号', index=False)
+    pd.DataFrame(web_results[1:], columns=web_results[0]).to_excel(writer, sheet_name='网站', index=False)
+
+print("Analysis completed and results saved to Excel.")
@@ -138,19 +138,24 @@ def main():
         name = row['主办']
         url = row['地址']
         domain = urlparse(url).netloc
 
         # Open the website
         driver = open_website(url)
 
+        # Retrieve cookies from the previous session
+        cookies = get_cookies_from_previous_session(driver)
+        # Add the cookies to the WebDriver
+        add_cookies(driver, cookies)
+
         # Initialize the set used to track visited pages
         visited_pages = set()
         # Initialize the data list
         data = []
 
         # Process the starting page and follow hyperlinks recursively
         process_page(driver, url, visited_pages, domain, data)
-        # Export data to a separate Excel file for each URL
 
+        # Export the data to a separate Excel file in the web_dir directory
         output_filename = f'web_dir/{name}_{domain}.xlsx'
         export_to_excel(data, output_filename)
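
main() drives helpers (open_website, get_cookies_from_previous_session, add_cookies, process_page, export_to_excel) that live outside this hunk, so their behavior can only be inferred from the call sites. A minimal sketch of what a compatible process_page might look like, assuming Selenium WebDriver and a same-domain recursive crawl; the real implementation may differ:

from urllib.parse import urlparse
from selenium.webdriver.common.by import By

def process_page(driver, url, visited_pages, domain, data):
    """Hypothetical recursive crawler matching the call in main()."""
    if url in visited_pages:
        return
    visited_pages.add(url)
    driver.get(url)
    # record the page text under the columns ana_web() expects ('url' and 'text';
    # 'group' and 'name' would have to be filled in elsewhere in the real code)
    data.append({'url': url, 'text': driver.find_element(By.TAG_NAME, 'body').text})
    # materialize hrefs as strings first, since navigating invalidates the elements
    links = [a.get_attribute('href') for a in driver.find_elements(By.TAG_NAME, 'a')]
    for link in links:
        if link and urlparse(link).netloc == domain and link not in visited_pages:
            process_page(driver, link, visited_pages, domain, data)

For a large site, an explicit queue with a depth limit would be safer than unbounded recursion, but the recursive form mirrors the comment in the diff.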