feat: optimize the crawler / output analysis reports
parent e5ff671c97
commit 7bf87671b9
main.ui (2 changed lines)
@@ -478,7 +478,7 @@
 </rect>
 </property>
 <property name="text">
-<string>20</string>
+<string>5</string>
 </property>
 </widget>
 <widget class="QPushButton" name="bStart">
@@ -7,6 +7,7 @@ from pathlib import Path
 import pandas as pd
 from .base import BASE_DIR
 import os
+from selenium.common.exceptions import TimeoutException
 chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
 failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
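init_driver itself is not part of this diff; only the chromedriver path and the new TimeoutException import are shown. For orientation, a minimal sketch of what such a helper could look like under Selenium 4, using the chrome_driver_file defined above (the Chrome options are illustrative assumptions, not the committed code):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

def init_driver(driver_path=chrome_driver_file):
    # Assumed options; the real init_driver may configure Chrome differently.
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--disable-gpu')
    # Selenium 4 passes the executable path through a Service object.
    return webdriver.Chrome(service=Service(driver_path), options=options)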
|
@@ -160,7 +161,6 @@ def add_cookies(driver, cookies):
         driver.add_cookie({'name': name, 'value': value})

 def chrom_main_from_list(sites):
-    driver = init_driver()
     for ind, item in enumerate(sites):
         group = item[0]  # Replace with the actual column name for group
         name = item[1]
@@ -169,29 +169,40 @@ def chrom_main_from_list(sites):
         if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:
             continue
         url = fix_url_scheme(url)
         print(url)
+        driver = init_driver()
         # Open the website
-        driver.get(url)
+        # driver.get(url)

-        # Retrieve cookies from previous session
-        cookies = get_cookies_from_previous_session(driver)
-        # Add cookies to the WebDriver
-        add_cookies(driver, cookies)
+        # # Retrieve cookies from previous session
+        # cookies = get_cookies_from_previous_session(driver)
+        # # Add cookies to the WebDriver
+        # add_cookies(driver, cookies)

         # Initialize the set to store visited pages
         visited_pages = set()
         # Initialize the data list
         data = []
+        try:
+            # Set the page load timeout to 10 seconds
+            driver.set_page_load_timeout(10)
+
+            # Set the script execution timeout to 10 seconds
+            driver.set_script_timeout(10)
+
-        # Process the starting page and follow hyperlinks recursively
-        process_page(driver, url, visited_pages, domain, data, group, name)
+            # Write your code here, e.g. open pages, click buttons, etc.
+            # ...
+            process_page(driver, url, visited_pages, domain, data, group, name)
+        except TimeoutException:
+            # When a timeout occurs, handle it accordingly, e.g. skip the site or report the error
+            print("超时异常")
+            driver.quit()

         # Export data to a separate Excel file in the web_dir directory
         output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
         export_to_excel(data, output_filename)

         # Close the WebDriver
-        driver.quit()
+        # driver.quit()

 def chrome_main():
     # Read failed URLs from the list
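Condensed, the per-site flow this hunk moves to is: create a fresh driver for each site, bound page-load and script time, and quit the browser when a site times out. A sketch of that pattern, assuming init_driver, process_page and export_to_excel from this module (using finally here is a suggestion; the committed code quits inside the except branch and comments out the final quit):

def crawl_one_site(url, domain, group, name):
    # Hypothetical condensation of one loop iteration of chrom_main_from_list.
    driver = init_driver()
    visited_pages, data = set(), []
    try:
        driver.set_page_load_timeout(10)   # fail fast instead of hanging on slow sites
        driver.set_script_timeout(10)
        process_page(driver, url, visited_pages, domain, data, group, name)
    except TimeoutException:
        print("超时异常")
    finally:
        driver.quit()                      # always release the Chrome instance
    export_to_excel(data, os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx'))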
@@ -2,6 +2,8 @@ import pandas as pd
 import os
+import sqlite3
 from mycode.base import BASE_DIR
 import re
+from openpyxl import load_workbook

 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
@@ -23,6 +25,98 @@ def make_simple_csv_from_db():
     df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)


+def get_cbma_info_from_db_and_ana(year: str = '2023'):
+    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
+    query = f'''
+        SELECT
+            id,
+            strftime('%Y年%m月%d日', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_date,
+            g.nickname,
+            a.title,
+            a.content_url,
+            a.read_num
+        FROM
+            articles a
+        LEFT JOIN
+            gzhs g ON g.biz = a.biz
+        WHERE
+            pub_date > '{year}'
+        AND
+            g.biz = 'MzIzMDU4Njg3MA=='
+        ORDER BY
+            pub_date
+    '''
+    df = pd.read_sql_query(query, conn)
+    # Close the database connection
+    conn.close()
+    for ind, row in df.iterrows():
+        id = row['id']
+        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
+        try:
+            with open(full_path, encoding='utf-8') as f:
+                content = f.read()
+            # Extract the article source from the content
+            a_match = re.findall('来源丨(.*?)\n', content)
+            a_list = []
+            if a_match:
+                # a = a_match[0].replace('\xa0', '、').replace(' ', '、')
+                # a = re.sub(r'、+', '、', a)
+                a = re.sub(r'[\xa0\s]+', '、', a_match[0])
+                df.at[ind, 'source'] = a
+        except FileNotFoundError:
+            print(full_path + '---不存在')
+    # Fill the results into the Excel template
+    template_path = os.path.join(BASE_DIR, 'summary/template_cbma.xlsx')
+    workbook = load_workbook(template_path)
+    sheet = workbook['公众号更新数']
+    sheet.cell(row=1, column=1, value=f'关于{year}年度中国建材总院新媒体更新情况明细表\n(官微)')
+    for ind, row in df.iterrows():
+        sheet.cell(row=ind+3, column=1, value=str(ind+1))
+        sheet.cell(row=ind+3, column=2, value=row['pub_date'])
+        sheet.cell(row=ind+3, column=3, value=row['title'])
+        sheet.cell(row=ind+3, column=4, value=row['source'])
+        sheet.cell(row=ind+3, column=6, value=row['read_num'])
+        sheet.cell(row=ind+3, column=7, value=row['content_url'])
+    output_path = os.path.join(BASE_DIR, f'summary/{year}年_cbma.xlsx')
+    workbook.save(output_path)
+    # Start tallying the scores
+    t_1 = (df['source'].str.contains('瑞泰科技')).sum()
+    t_2 = (df['source'].str.contains('国检集团')).sum()
+    t_3 = (df['source'].str.contains('中材高新')).sum()
+    t_4 = (df['source'].str.contains('哈玻院')).sum()
+    t_5 = (df['source'].str.contains('中国新材院')).sum()
+    t_6 = (df['source'].str.contains('秦皇岛院')).sum()
+    t_7 = (df['source'].str.contains('西安墙材院')).sum()
+    t_8 = (df['source'].str.contains('咸阳陶瓷院')).sum()
+    t_9 = (df['source'].str.contains('钟表所')).sum()
+    t_10 = (df['source'].str.contains('总院北分')).sum()
+    t_11 = (df['source'].str.contains('中岩科技')).sum()
+    t_12 = (df['source'].str.contains('水泥新材院')).sum()
+    t_13 = (df['source'].str.contains('中建材科创院')).sum()
+    t_14 = (df['source'].str.contains('科建苑')).sum()
+    template_cal_path = os.path.join(BASE_DIR, 'summary/tempalte_cbma_cal.xlsx')
+    workbook2 = load_workbook(template_cal_path)
+    sheet2 = workbook2['打分表']
+    sheet2.cell(row=1, column=1, value=f'中国建材总院宣传工作计分表({year}年度)')
+    sheet2.cell(row=6, column=5, value=t_1)
+    sheet2.cell(row=6, column=7, value=t_2)
+    sheet2.cell(row=6, column=9, value=t_3)
+    sheet2.cell(row=6, column=11, value=t_4)
+    sheet2.cell(row=6, column=13, value=t_5)
+    sheet2.cell(row=6, column=15, value=t_6)
+    sheet2.cell(row=6, column=17, value=t_7)
+    sheet2.cell(row=6, column=19, value=t_8)
+    sheet2.cell(row=6, column=21, value=t_9)
+    sheet2.cell(row=6, column=23, value=t_10)
+    sheet2.cell(row=6, column=25, value=t_11)
+    sheet2.cell(row=6, column=27, value=t_12)
+    sheet2.cell(row=6, column=29, value=t_13)
+    sheet2.cell(row=6, column=31, value=t_14)
+    output_path2 = os.path.join(BASE_DIR, f'summary/{year}年_cbma_cal.xlsx')
+    workbook2.save(output_path2)
+    return output_path, output_path2
+
+
 def make_wechat_articles_full():
     df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
     df['content'] = ''
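Two details of the new function invite a follow-up: the year is interpolated into the SQL with an f-string, and the fourteen t_1 to t_14 counters repeat one str.contains pattern that breaks if no article ever received a source (missing column, or NaN rows). A hedged sketch of a tighter variant under the same schema; the unit list at the end is an illustrative subset, not the full fourteen names:

def load_cbma_articles(conn, year: str) -> pd.DataFrame:
    # Same query as above, but the year is bound as a parameter instead of an f-string.
    query = '''
        SELECT id,
               strftime('%Y年%m月%d日', datetime(a.p_date, 'unixepoch', 'localtime')) AS pub_date,
               g.nickname, a.title, a.content_url, a.read_num
        FROM articles a
        LEFT JOIN gzhs g ON g.biz = a.biz
        WHERE pub_date > ? AND g.biz = 'MzIzMDU4Njg3MA=='
        ORDER BY pub_date
    '''
    return pd.read_sql_query(query, conn, params=(year,))

def count_sources(df: pd.DataFrame, units) -> dict:
    # One loop instead of fourteen t_N variables; na=False guards rows without a source.
    source = df['source'] if 'source' in df.columns else pd.Series('', index=df.index)
    return {unit: int(source.str.contains(unit, na=False).sum()) for unit in units}

# e.g. count_sources(df, ['瑞泰科技', '国检集团', '中材高新'])  # illustrative subset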
|
@@ -110,5 +204,5 @@ def ana_web():
     return output_data

 if __name__ == "__main__":
-    ana_web()
+    get_cbma_info_from_db_and_ana()
start.py (77 changed lines)
@@ -7,7 +7,7 @@ import win32com.client as win32
 import subprocess
 import os
 import datetime
-from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
+from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir, get_cbma_info_from_db_and_ana
 from mycode.crawl_chrome import chrom_main_from_list
 import pandas as pd
 from urllib.parse import urlparse
@@ -139,12 +139,6 @@ class MyThread(QThread):
             output = p.stdout.readline()
             if output:
                 self.update_signal.emit({'msg': output.strip()})
-
-    def capture_err(self, p):
-        while self.running and p.poll() is None:
-            err = p.stderr.readline()
-            if err:
-                self.update_signal.emit({'msg': err.strip()})

     def run(self) -> None:
         self.update_signal.emit({'msg': '开始进行网站爬取...'})
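With capture_err removed here (and the stderr pipe dropped from the Popen call further down), only stdout is read. A minimal sketch of the single-reader pattern that remains, with stderr merged into stdout so crawler errors still reach the log window; run_and_stream and emit are illustrative names, not the app's actual API:

import subprocess
import threading

def run_and_stream(cmd, emit):
    # Merge stderr into stdout so one reader thread sees everything the crawler prints.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, text=True)

    def pump():
        for line in proc.stdout:        # blocks until a line arrives or the pipe closes
            emit({'msg': line.strip()})

    threading.Thread(target=pump, daemon=True).start()
    return proc

In this code base, emit would correspond to the thread's update_signal.emit.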
|
@@ -158,13 +152,14 @@ class MyThread(QThread):
             output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
             # -u means unbuffered, so output is emitted immediately
             cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
-            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
+            # cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
+            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, shell=False)
             self.processes.append(process)
             self.running = True
             getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
             getlog_thread.start()
-            getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
-            getlog_thread_err.start()
+            # getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
+            # getlog_thread_err.start()

         for process in self.processes:
             process.wait()
@@ -205,12 +200,15 @@ class MainWindow(QMainWindow):
         self.ui.setupUi(self)
         self.ui.lSize.setValidator(QIntValidator())
         self.ui.bWechat.clicked.connect(self.open_wcplus)
-        self.ui.bWebSite.clicked.connect(self.open_websites_xlsx)
-        self.ui.bBiao.clicked.connect(self.open_biao_xlsx)
+        self.ui.bWebSite.clicked.connect(lambda: self.open_file(WEB_SITES_PATH))
+        self.ui.bBiao.clicked.connect(lambda: self.open_file(BIAO_PATH))
         self.ui.bStart.clicked.connect(self.start)
         self.ui.bAna.clicked.connect(self.start_ana)
-        self.ui.bRes1.clicked.connect(self.open_res1)
-        self.ui.bRes2.clicked.connect(self.open_res2)
+        self.ui.bRes1.clicked.connect(lambda: self.open_file(self.ui.lRes1.text()))
+        self.ui.bRes2.clicked.connect(lambda: self.open_file(self.ui.lRes2.text(), 'docx'))
+        self.ui.bCal.clicked.connect(self.cbma_cal)
+        self.ui.bOpenCalRes1.clicked.connect(lambda: self.open_file(self.ui.lCalRes1.text()))
+        self.ui.bOpenCalRes2.clicked.connect(lambda: self.open_file(self.ui.lCalRes2.text()))
         self.ui.vLog.setModel(self.logModel)
         self.res1Workbook = None
@@ -219,31 +217,22 @@
         subprocess.Popen('.\wcplus.exe')
         self.wcplus = True

-    def open_websites_xlsx(self):
-        app = win32.Dispatch("Excel.Application")
-        app.Visible = True
-        app.Workbooks.Open(WEB_SITES_PATH)
-        app.WindowState = 3
-
-    def open_biao_xlsx(self):
-        app = win32.Dispatch("Excel.Application")
-        app.Visible = True
-        app.Workbooks.Open(BIAO_PATH)
-        app.WindowState = 3
-
-    def open_res1(self):
-        if self.ui.lRes1.text():
-            app = win32.Dispatch("Excel.Application")
-            app.Visible = True
-            self.res1Workbook = app.Workbooks.Open(self.ui.lRes1.text())
-            app.WindowState = 3
-
-    def open_res2(self):
-        if self.ui.lRes2.text():
-            app = win32.Dispatch("Word.Application")
-            app.Visible = True
-            app.Documents.Open(self.ui.lRes2.text())
-            app.WindowState = 3
+    def open_file(self, path, type='xlsx'):
+        if path:
+            # try:
+            #     os.startfile(path)
+            # except Exception as e:
+            #     print("无法打开文件:", str(e))
+            if type == 'docs':
+                app = win32.Dispatch("Word.Application")
+                app.Visible = True
+                app.Documents.Open(path)
+                app.WindowState = 3
+            elif type == 'xlsx':
+                app = win32.Dispatch("Excel.Application")
+                app.Visible = True
+                app.Workbooks.Open(path)
+                app.WindowState = 3

     def get_time(self):
         now = datetime.datetime.now()
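One mismatch worth flagging: the bRes2 connection above passes 'docx', while open_file tests for 'docs', so a Word result currently falls through both branches and nothing opens. A sketch of a variant that accepts either spelling (an assumed fix, not part of the commit):

    def open_file(self, path, type='xlsx'):
        if not path:
            return
        if type in ('docx', 'docs', 'doc'):
            app = win32.Dispatch("Word.Application")
            app.Visible = True
            app.Documents.Open(path)
        else:
            # Treat everything else, including 'xlsx', as an Excel workbook.
            app = win32.Dispatch("Excel.Application")
            app.Visible = True
            app.Workbooks.Open(path)
        app.WindowState = 3  # same window-state value the original handlers use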
|
@@ -276,6 +265,14 @@ class MainWindow(QMainWindow):
         self.ana_thread.update_signal.connect(self.update_log)
         self.ana_thread.start()

+    def cbma_cal(self):
+        now_year = datetime.datetime.now().year
+        self.update_log({'msg': '正在分析本年总院官微数据...'})
+        output_path, output_path2 = get_cbma_info_from_db_and_ana(now_year)
+        self.ui.lCalRes1.setText(output_path)
+        self.ui.lCalRes2.setText(output_path2)
+        self.update_log({'msg': '分析完毕!'})
+
     def update_log(self, rdict):
         if isinstance(rdict, str):
             self.log(f'{self.get_time()}-{rdict}', False)
@@ -316,7 +313,9 @@ class MainWindow(QMainWindow):

 if __name__ == "__main__":
     # gen_doc()
+    print('正在启动程序...')
     app = MyApplication(sys.argv)
     main_window = app.createMainWindow()
     main_window.show()
+    print('启动成功')
     sys.exit(app.exec())
@@ -0,0 +1,8 @@
+Set objShell = CreateObject("WScript.Shell")
+strFolder = objShell.CurrentDirectory
+
+strPythonPath = strFolder & "\runtime\python.exe"
+strScriptPath = strFolder & "\start.py"
+
+objShell.Run """" & strPythonPath & """ """ & strScriptPath & """"
+' Set objExec = objShell.Exec("""" & strPythonPath & """ """ & strScriptPath & """")
Binary file not shown.
@@ -242,7 +242,7 @@ class Ui_MainWindow(object):
         self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
         self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
         self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
-        self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
+        self.lSize.setText(QCoreApplication.translate("MainWindow", u"5", None))
         self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u722c\u53d6", None))
         self.groupBox_7.setTitle(QCoreApplication.translate("MainWindow", u"\u603b\u9662\u5b98\u5fae", None))
         self.label_10.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u7ed3\u679cExcel:", None))
web3.py (104 changed lines)
@@ -33,40 +33,40 @@ def fix_url_scheme(url, default_scheme='http'):
         url = f'{default_scheme}://{url}'
     return url

 if __name__ == '__main__':
-    print('巡查任务开始。。。')
-    now = datetime.datetime.now()
-    month = now.month
+    # print('巡查任务开始。。。')
+    # now = datetime.datetime.now()
+    # month = now.month

-    print('正在组合微信公众号爬取内容。。。')
-    make_simple_csv_from_db()
-    make_wechat_articles_full()
-    print('公众号爬取内容组装完毕!')
+    # print('正在组合微信公众号爬取内容。。。')
+    # make_simple_csv_from_db()
+    # make_wechat_articles_full()
+    # print('公众号爬取内容组装完毕!')

-    print('开始进行网站爬取。。。')
+    # print('开始进行网站爬取。。。')

     df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
-    processes = []
+    # processes = []

-    # Register the SIGINT signal handler
-    signal.signal(signal.SIGINT, sigint_handler)
+    # # Register the SIGINT signal handler
+    # signal.signal(signal.SIGINT, sigint_handler)

-    ind = 0
-    for ind, row in df.iterrows():
-        group = row['单位']
-        name = row['主办']
-        url = fix_url_scheme(row['地址'].strip())
-        domain = urlparse(url).netloc.replace('www.', '')
-        if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # skip these sites
-            continue
-        output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
-        # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
-        cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
-        process = subprocess.Popen(cmd)
-        processes.append(process)
+    # ind = 0
+    # for ind, row in df.iterrows():
+    #     group = row['单位']
+    #     name = row['主办']
+    #     url = fix_url_scheme(row['地址'].strip())
+    #     domain = urlparse(url).netloc.replace('www.', '')
+    #     if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # skip these sites
+    #         continue
+    #     output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+    #     # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
+    #     cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
+    #     process = subprocess.Popen(cmd)
+    #     processes.append(process)

-    # Wait for all processes to finish
-    for process in processes:
-        process.wait()
+    # # Wait for all processes to finish
+    # for process in processes:
+    #     process.wait()

     print('网站爬取结束,校验中。。。')
     # Check output file sizes and save information if size is less than 20KB
@@ -90,30 +90,30 @@ if __name__ == '__main__':

     print('网站爬取完毕!')

-    print('开始对比分析所有内容。。。')
-    # Run WeChat Analysis
-    wechat_results = ana_wechat()
-    # Run Web Content Analysis
-    web_results = ana_web()
+    # print('开始对比分析所有内容。。。')
+    # # Run WeChat Analysis
+    # wechat_results = ana_wechat()
+    # # Run Web Content Analysis
+    # web_results = ana_web()

-    # Save results in an Excel file with two sheets
-    output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
-    # with pd.ExcelWriter(output_excel_path) as writer:
-    #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
-    #     df.to_excel(writer, sheet_name='公众号', index=False)
-    #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
-    #     df2.to_excel(writer, sheet_name='网站', index=False)
-    template_path = os.path.join(output_dir, 'template.xlsx')
-    workbook = load_workbook(template_path)
+    # # Save results in an Excel file with two sheets
+    # output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
+    # # with pd.ExcelWriter(output_excel_path) as writer:
+    # # df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # # df.to_excel(writer, sheet_name='公众号', index=False)
+    # # df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # # df2.to_excel(writer, sheet_name='网站', index=False)
+    # template_path = os.path.join(output_dir, 'template.xlsx')
+    # workbook = load_workbook(template_path)

-    # Select the worksheets to work on
-    wechat_sheet = workbook['公众号']
-    web_sheet = workbook['网站']
-    for row in wechat_results:
-        wechat_sheet.append(row)
-    for row in web_results:
-        web_sheet.append(row)
-    workbook.save(output_excel_path)
-    workbook.close()
-    print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
-    os.system("pause")
+    # # Select the worksheets to work on
+    # wechat_sheet = workbook['公众号']
+    # web_sheet = workbook['网站']
+    # for row in wechat_results:
+    #     wechat_sheet.append(row)
+    # for row in web_results:
+    #     web_sheet.append(row)
+    # workbook.save(output_excel_path)
+    # workbook.close()
+    # print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
+    # os.system("pause")
@@ -4,6 +4,7 @@
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 import os.path
 from openpyxl import Workbook, load_workbook
+from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

 # useful for handling different item types with a single interface
 from scrapy.exceptions import IgnoreRequest
@@ -58,6 +59,7 @@ class ZcspiderPipeline:
         # except:
         #     self.conn.rollback()
         #     raise
+        item['text'] = ILLEGAL_CHARACTERS_RE.sub(r'', item['text'])
         line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
         self.ws.append(line)
         return item
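The added line strips the control characters that openpyxl refuses to write; without it, ws.append() raises IllegalCharacterError as soon as a scraped page contains one. A small standalone illustration of the same sanitizing step (file name and sample text are made up):

from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

wb = Workbook()
ws = wb.active
raw = 'page text with a stray control character \x08 scraped from HTML'
# Without the sub(), ws.append() would raise openpyxl's IllegalCharacterError.
ws.append(['group', 'name', 'example.com', 'http://example.com', ILLEGAL_CHARACTERS_RE.sub('', raw)])
wb.save('clean_sample.xlsx')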
@@ -96,7 +96,7 @@ DEFAULT_REQUEST_HEADERS = {
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = 'gb18030'
-LOG_LEVEL = 'WARNING'
+LOG_LEVEL = 'DEBUG'
 DOWNLOAD_TIMEOUT = 10

 ITEM_PIPELINES = {
@@ -110,4 +110,7 @@ FEED_EXPORTERS = {
 DOWNLOADER_MIDDLEWARES = {
     'zcspider.middlewares.FilterHTMLMiddleware': 200,
     # other downloader middlewares...
 }
+EXTENSIONS = {
+    'scrapy.extensions.telnet.TelnetConsole': None
+}