zcspider/web3.py

import os
import subprocess
import signal
import sys
import datetime
from urllib.parse import urlparse

import pandas as pd
from openpyxl import load_workbook

from mycode.base import BASE_DIR
from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
from mycode.crawl_chrome import chrom_main_from_list, failed_sites_file

python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
# scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')
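
# Rough pipeline of this script (steps marked "disabled" are currently commented out):
#   1. (disabled) assemble the WeChat official-account articles via make_simple_csv_from_db()
#      and make_wechat_articles_full()
#   2. (disabled) launch one Scrapy 'basespider' subprocess per row of web_sites.xlsx,
#      each writing web_dir/{name}_{domain}.xlsx
#   3. check every output workbook: anything smaller than 20 KB is treated as a failed crawl
#   4. re-crawl the failed sites with Chrome via chrom_main_from_list()
#   5. (disabled) run ana_wechat()/ana_web() and append the results to a copy of template.xlsx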


def save_info_to_excel(info_list, output_filename):
    df = pd.DataFrame(info_list, columns=['单位', '主办', '地址'])
    df.to_excel(output_filename, index=False)
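
# Note: save_info_to_excel() does not appear to be called anywhere in this script;
# the failed-site rows are handed straight to chrom_main_from_list() below.
# Hypothetical usage (the output path here is illustrative only):
#   save_info_to_excel(info_to_save, os.path.join(BASE_DIR, 'web_dir/failed_sites.xlsx'))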


# SIGINT handler: terminate all crawler subprocesses on Ctrl-C
def sigint_handler(signal, frame):
    print('Received Ctrl-C, shutting down child processes...')
    for process in processes:
        process.terminate()
    print('Child processes closed, exiting.')
    sys.exit(0)
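
# NOTE: sigint_handler reads the module-level `processes` list, which is only created
# (and the handler only registered via signal.signal) inside the Scrapy launch block
# below; that block is currently commented out, so the handler is effectively inert.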


def fix_url_scheme(url, default_scheme='http'):
    # Check whether the URL already carries a scheme
    if not url.startswith('http://') and not url.startswith('https://'):
        # No scheme present: prepend the default one
        url = f'{default_scheme}://{url}'
    return url
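
# Examples: fix_url_scheme('example.com') returns 'http://example.com';
# a URL that already starts with http:// or https:// is returned unchanged.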


if __name__ == '__main__':
    # print('Inspection task starting...')
    # now = datetime.datetime.now()
    # month = now.month
    # print('Assembling the WeChat official-account content to crawl...')
    # make_simple_csv_from_db()
    # make_wechat_articles_full()
    # print('Official-account content assembled!')
    # print('Starting website crawl...')
    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
    # processes = []
    # # Register the SIGINT handler
    # signal.signal(signal.SIGINT, sigint_handler)
    # ind = 0
    # for ind, row in df.iterrows():
    #     group = row['单位']
    #     name = row['主办']
    #     url = fix_url_scheme(row['地址'].strip())
    #     domain = urlparse(url).netloc.replace('www.', '')
    #     if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # skip these sites outright
    #         continue
    #     output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
    #     # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
    #     cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
    #     process = subprocess.Popen(cmd)
    #     processes.append(process)
    # # Wait for all crawler subprocesses to finish
    # for process in processes:
    #     process.wait()
    print('Website crawl finished, verifying results...')
    # Check each output workbook; anything smaller than 20 KB is treated as a failed crawl
    info_to_save = []
    for ind, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        # Normalise the URL the same way the crawl block does, so the derived domain
        # (and therefore the output filename) matches the file the spider wrote
        url = fix_url_scheme(row['地址'].strip())
        domain = urlparse(url).netloc.replace("www.", "")
        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        if os.path.exists(output_filename):
            file_size = os.path.getsize(output_filename)
            print(file_size / 1024)
            if file_size < 20 * 1024:  # 20 KB threshold, in bytes
                info_to_save.append([group, name, url])
    if info_to_save:
        print('Some sites were not crawled; retrying them with Chrome...')
        chrom_main_from_list(info_to_save)
        # os.remove(failed_sites_file)
    print('Website crawl complete!')
    # print('Starting comparative analysis of all content...')
    # # Run the WeChat analysis
    # wechat_results = ana_wechat()
    # # Run the web content analysis
    # web_results = ana_web()
    # # Save the results in an Excel file with two sheets
    # output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
    # # with pd.ExcelWriter(output_excel_path) as writer:
    # #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    # #     df.to_excel(writer, sheet_name='公众号', index=False)
    # #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    # #     df2.to_excel(writer, sheet_name='网站', index=False)
    # template_path = os.path.join(output_dir, 'template.xlsx')
    # workbook = load_workbook(template_path)
    # # Select the worksheets to append to
    # wechat_sheet = workbook['公众号']
    # web_sheet = workbook['网站']
    # for row in wechat_results:
    #     wechat_sheet.append(row)
    # for row in web_results:
    #     web_sheet.append(row)
    # workbook.save(output_excel_path)
    # workbook.close()
    # print('Inspection task finished; check the summary folder and proofread manually')
    # os.system("pause")