feat: optimize crawler / output analysis report
parent e5ff671c97
commit 7bf87671b9

main.ui (2 changed lines)
@@ -478,7 +478,7 @@
 </rect>
 </property>
 <property name="text">
-<string>20</string>
+<string>5</string>
 </property>
 </widget>
 <widget class="QPushButton" name="bStart">
@@ -7,6 +7,7 @@ from pathlib import Path
 import pandas as pd
 from .base import BASE_DIR
 import os
+from selenium.common.exceptions import TimeoutException
 chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
 failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')

@@ -160,7 +161,6 @@ def add_cookies(driver, cookies):
         driver.add_cookie({'name': name, 'value': value})


 def chrom_main_from_list(sites):
-    driver = init_driver()
     for ind, item in enumerate(sites):
         group = item[0]  # Replace with the actual column name for group
         name = item[1]
@@ -169,29 +169,40 @@ def chrom_main_from_list(sites):
         if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:
             continue
         url = fix_url_scheme(url)
-        print(url)
+        driver = init_driver()
         # Open the website
-        driver.get(url)
+        # driver.get(url)

-        # Retrieve cookies from previous session
-        cookies = get_cookies_from_previous_session(driver)
-        # Add cookies to the WebDriver
-        add_cookies(driver, cookies)
+        # # Retrieve cookies from previous session
+        # cookies = get_cookies_from_previous_session(driver)
+        # # Add cookies to the WebDriver
+        # add_cookies(driver, cookies)

         # Initialize the set to store visited pages
         visited_pages = set()
         # Initialize the data list
         data = []
+        try:
+            # Set the page-load timeout to 10 seconds
+            driver.set_page_load_timeout(10)

-        # Process the starting page and follow hyperlinks recursively
-        process_page(driver, url, visited_pages, domain, data, group, name)
+            # Set the script-execution timeout to 10 seconds
+            driver.set_script_timeout(10)

+            # Write your crawling code here, e.g. open pages, click buttons, etc.
+            # ...
+            process_page(driver, url, visited_pages, domain, data, group, name)
+        except TimeoutException:
+            # When a timeout occurs, handle it accordingly, e.g. skip the site or report the error
+            print("超时异常")
+            driver.quit()

         # Export data to a separate Excel file in the web_dir directory
         output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
         export_to_excel(data, output_filename)

         # Close the WebDriver
-        driver.quit()
+        # driver.quit()

 def chrome_main():
     # Read failed URLs from the list
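For reference, here is a minimal standalone sketch of the per-site timeout pattern this hunk introduces: a fresh driver per site, 10-second page-load and script timeouts, and a TimeoutException guard so one hung page cannot stall the whole batch. The webdriver.Chrome() call and the title read are placeholders; the actual module uses init_driver() and the recursive process_page() shown above.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException

def crawl_one_site(url, timeout=10):
    # Fresh driver per site, as in chrom_main_from_list above
    driver = webdriver.Chrome()
    data = []
    try:
        driver.set_page_load_timeout(timeout)  # give up on page loads after `timeout` seconds
        driver.set_script_timeout(timeout)     # give up on async scripts after `timeout` seconds
        driver.get(url)
        data.append(driver.title)              # stand-in for the recursive process_page() walk
    except TimeoutException:
        # Skip the site instead of letting the whole run hang
        print(f'timeout while crawling {url}')
    finally:
        driver.quit()
    return data

Note that the sketch closes the driver in a finally block; the committed code only quits it in the except branch and comments out the driver.quit() at the end of the loop, so drivers opened for successfully crawled sites stay open.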
@@ -2,6 +2,8 @@ import pandas as pd
 import os
 import sqlite3
 from mycode.base import BASE_DIR
+import re
+from openpyxl import load_workbook

 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
@@ -23,6 +25,98 @@ def make_simple_csv_from_db():
     df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)


+def get_cbma_info_from_db_and_ana(year: str = '2023'):
+    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
+    query = f'''
+        SELECT
+            id,
+            strftime('%Y年%m月%d日', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_date,
+            g.nickname,
+            a.title,
+            a.content_url,
+            a.read_num
+        FROM
+            articles a
+        LEFT JOIN
+            gzhs g ON g.biz = a.biz
+        WHERE
+            pub_date > '{year}'
+        AND
+            g.biz = 'MzIzMDU4Njg3MA=='
+        ORDER BY
+            pub_date
+    '''
+    df = pd.read_sql_query(query, conn)
+    # Close the database connection
+    conn.close()
+    for ind, row in df.iterrows():
+        id = row['id']
+        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
+        try:
+            with open(full_path, encoding='utf-8') as f:
+                content = f.read()
+            # Extract the article source from the content
+            a_match = re.findall('来源丨(.*?)\n', content)
+            a_list = []
+            if a_match:
+                # a = a_match[0].replace('\xa0', '、').replace(' ', '、')
+                # a = re.sub(r'、+', '、', a)
+                a = re.sub(r'[\xa0\s]+', '、', a_match[0])
+                df.at[ind, 'source'] = a
+        except FileNotFoundError:
+            print(full_path + '---不存在')
+    # Fill the results into the Excel template
+    template_path = os.path.join(BASE_DIR, 'summary/template_cbma.xlsx')
+    workbook = load_workbook(template_path)
+    sheet = workbook['公众号更新数']
+    sheet.cell(row=1, column=1, value=f'关于{year}年度中国建材总院新媒体更新情况明细表\n(官微)')
+    for ind, row in df.iterrows():
+        sheet.cell(row=ind+3, column=1, value=str(ind+1))
+        sheet.cell(row=ind+3, column=2, value=row['pub_date'])
+        sheet.cell(row=ind+3, column=3, value=row['title'])
+        sheet.cell(row=ind+3, column=4, value=row['source'])
+        sheet.cell(row=ind+3, column=6, value=row['read_num'])
+        sheet.cell(row=ind+3, column=7, value=row['content_url'])
+    output_path = os.path.join(BASE_DIR, f'summary/{year}年_cbma.xlsx')
+    workbook.save(output_path)
+    # Start tallying the scores
+    t_1 = (df['source'].str.contains('瑞泰科技')).sum()
+    t_2 = (df['source'].str.contains('国检集团')).sum()
+    t_3 = (df['source'].str.contains('中材高新')).sum()
+    t_4 = (df['source'].str.contains('哈玻院')).sum()
+    t_5 = (df['source'].str.contains('中国新材院')).sum()
+    t_6 = (df['source'].str.contains('秦皇岛院')).sum()
+    t_7 = (df['source'].str.contains('西安墙材院')).sum()
+    t_8 = (df['source'].str.contains('咸阳陶瓷院')).sum()
+    t_9 = (df['source'].str.contains('钟表所')).sum()
+    t_10 = (df['source'].str.contains('总院北分')).sum()
+    t_11 = (df['source'].str.contains('中岩科技')).sum()
+    t_12 = (df['source'].str.contains('水泥新材院')).sum()
+    t_13 = (df['source'].str.contains('中建材科创院')).sum()
+    t_14 = (df['source'].str.contains('科建苑')).sum()
+    template_cal_path = os.path.join(BASE_DIR, 'summary/tempalte_cbma_cal.xlsx')
+    workbook2 = load_workbook(template_cal_path)
+    sheet2 = workbook2['打分表']
+    sheet2.cell(row=1, column=1, value=f'中国建材总院宣传工作计分表({year}年度)')
+    sheet2.cell(row=6, column=5, value=t_1)
+    sheet2.cell(row=6, column=7, value=t_2)
+    sheet2.cell(row=6, column=9, value=t_3)
+    sheet2.cell(row=6, column=11, value=t_4)
+    sheet2.cell(row=6, column=13, value=t_5)
+    sheet2.cell(row=6, column=15, value=t_6)
+    sheet2.cell(row=6, column=17, value=t_7)
+    sheet2.cell(row=6, column=19, value=t_8)
+    sheet2.cell(row=6, column=21, value=t_9)
+    sheet2.cell(row=6, column=23, value=t_10)
+    sheet2.cell(row=6, column=25, value=t_11)
+    sheet2.cell(row=6, column=27, value=t_12)
+    sheet2.cell(row=6, column=29, value=t_13)
+    sheet2.cell(row=6, column=31, value=t_14)
+    output_path2 = os.path.join(BASE_DIR, f'summary/{year}年_cbma_cal.xlsx')
+    workbook2.save(output_path2)
+    return output_path, output_path2
+
+
 def make_wechat_articles_full():
     df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
     df['content'] = ''
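As an illustrative refactor only (not part of this commit), the fourteen t_1 … t_14 counters and the row-6 cells above could be produced by one loop. The keyword list and the every-other-column layout starting at column 5 are taken directly from the hunk; na=False is added so rows whose source could not be extracted count as non-matches.

def fill_unit_counts(df, sheet, row=6, first_col=5):
    # One keyword per subsidiary, in the same order as t_1 ... t_14 above
    keywords = ['瑞泰科技', '国检集团', '中材高新', '哈玻院', '中国新材院', '秦皇岛院', '西安墙材院',
                '咸阳陶瓷院', '钟表所', '总院北分', '中岩科技', '水泥新材院', '中建材科创院', '科建苑']
    for i, kw in enumerate(keywords):
        count = int(df['source'].str.contains(kw, na=False).sum())
        sheet.cell(row=row, column=first_col + 2 * i, value=count)

Calling fill_unit_counts(df, sheet2) would write the same fourteen cells as the block above.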
@@ -110,5 +204,5 @@ def ana_web():
     return output_data


 if __name__ == "__main__":
-    ana_web()
+    get_cbma_info_from_db_and_ana()
start.py (77 changed lines)
@@ -7,7 +7,7 @@ import win32com.client as win32
 import subprocess
 import os
 import datetime
-from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
+from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir, get_cbma_info_from_db_and_ana
 from mycode.crawl_chrome import chrom_main_from_list
 import pandas as pd
 from urllib.parse import urlparse
@@ -140,12 +140,6 @@ class MyThread(QThread):
             if output:
                 self.update_signal.emit({'msg': output.strip()})

-    def capture_err(self, p):
-        while self.running and p.poll() is None:
-            err = p.stderr.readline()
-            if err:
-                self.update_signal.emit({'msg': err.strip()})
-
     def run(self) -> None:
         self.update_signal.emit({'msg': '开始进行网站爬取...'})
         df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
@@ -158,13 +152,14 @@ class MyThread(QThread):
             output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
             # -u means unbuffered, so output is flushed immediately
             cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
-            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
+            # cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
+            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, shell=False)
             self.processes.append(process)
             self.running = True
             getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
             getlog_thread.start()
-            getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
-            getlog_thread_err.start()
+            # getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
+            # getlog_thread_err.start()

         for process in self.processes:
             process.wait()
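The commit drops the stderr pipe together with the capture_err thread, so the crawler's stderr now goes to the parent process's own stderr. If error output should still reach the log view with only one reader thread, one option is to merge stderr into stdout; this is a sketch of that variant, not what the commit does.

import subprocess
import threading

def run_and_stream(cmd, emit):
    # stderr=subprocess.STDOUT folds both streams into one pipe, so a single reader thread suffices
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            text=True, shell=False)

    def pump():
        for line in proc.stdout:  # ends when the child closes its stdout
            if line.strip():
                emit({'msg': line.strip()})

    threading.Thread(target=pump, daemon=True).start()
    return proc

Inside MyThread.run this could be called as run_and_stream(cmd, self.update_signal.emit) in place of the Popen plus capture_output pair.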
@@ -205,12 +200,15 @@ class MainWindow(QMainWindow):
         self.ui.setupUi(self)
         self.ui.lSize.setValidator(QIntValidator())
         self.ui.bWechat.clicked.connect(self.open_wcplus)
-        self.ui.bWebSite.clicked.connect(self.open_websites_xlsx)
-        self.ui.bBiao.clicked.connect(self.open_biao_xlsx)
+        self.ui.bWebSite.clicked.connect(lambda: self.open_file(WEB_SITES_PATH))
+        self.ui.bBiao.clicked.connect(lambda: self.open_file(BIAO_PATH))
         self.ui.bStart.clicked.connect(self.start)
         self.ui.bAna.clicked.connect(self.start_ana)
-        self.ui.bRes1.clicked.connect(self.open_res1)
-        self.ui.bRes2.clicked.connect(self.open_res2)
+        self.ui.bRes1.clicked.connect(lambda: self.open_file(self.ui.lRes1.text()))
+        self.ui.bRes2.clicked.connect(lambda: self.open_file(self.ui.lRes2.text(), 'docx'))
+        self.ui.bCal.clicked.connect(self.cbma_cal)
+        self.ui.bOpenCalRes1.clicked.connect(lambda: self.open_file(self.ui.lCalRes1.text()))
+        self.ui.bOpenCalRes2.clicked.connect(lambda: self.open_file(self.ui.lCalRes2.text()))
         self.ui.vLog.setModel(self.logModel)
         self.res1Workbook = None
@@ -219,31 +217,22 @@
         subprocess.Popen('.\wcplus.exe')
         self.wcplus = True

-    def open_websites_xlsx(self):
-        app = win32.Dispatch("Excel.Application")
-        app.Visible = True
-        app.Workbooks.Open(WEB_SITES_PATH)
-        app.WindowState = 3
-
-    def open_biao_xlsx(self):
-        app = win32.Dispatch("Excel.Application")
-        app.Visible = True
-        app.Workbooks.Open(BIAO_PATH)
-        app.WindowState = 3
-
-    def open_res1(self):
-        if self.ui.lRes1.text():
-            app = win32.Dispatch("Excel.Application")
-            app.Visible = True
-            self.res1Workbook = app.Workbooks.Open(self.ui.lRes1.text())
-            app.WindowState = 3
-
-    def open_res2(self):
-        if self.ui.lRes2.text():
-            app = win32.Dispatch("Word.Application")
-            app.Visible = True
-            app.Documents.Open(self.ui.lRes2.text())
-            app.WindowState = 3
+    def open_file(self, path, type='xlsx'):
+        if path:
+            # try:
+            #     os.startfile(path)
+            # except Exception as e:
+            #     print("无法打开文件:", str(e))
+            if type == 'docx':
+                app = win32.Dispatch("Word.Application")
+                app.Visible = True
+                app.Documents.Open(path)
+                app.WindowState = 3
+            elif type == 'xlsx':
+                app = win32.Dispatch("Excel.Application")
+                app.Visible = True
+                app.Workbooks.Open(path)
+                app.WindowState = 3

     def get_time(self):
         now = datetime.datetime.now()
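The commented-out lines inside the new open_file hint at the simpler os.startfile route, which hands the path to whatever application Windows associates with the extension instead of driving Excel or Word through COM. A sketch of that variant (Windows-only, hypothetical, not what the commit ships):

import os

def open_file_simple(path):
    # Let the Windows file association pick the application (Excel for .xlsx, Word for .docx)
    if not path:
        return
    try:
        os.startfile(path)
    except OSError as e:
        print('无法打开文件:', e)

The COM route kept by the commit can still control the application window directly (the app.WindowState assignment), which os.startfile cannot do.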
@@ -276,6 +265,14 @@
         self.ana_thread.update_signal.connect(self.update_log)
         self.ana_thread.start()

+    def cbma_cal(self):
+        now_year = datetime.datetime.now().year
+        self.update_log({'msg': '正在分析本年总院官微数据...'})
+        output_path, output_path2 = get_cbma_info_from_db_and_ana(now_year)
+        self.ui.lCalRes1.setText(output_path)
+        self.ui.lCalRes2.setText(output_path2)
+        self.update_log({'msg': '分析完毕!'})
+
     def update_log(self, rdict):
         if isinstance(rdict, str):
             self.log(f'{self.get_time()}-{rdict}', False)
@@ -316,7 +313,9 @@

 if __name__ == "__main__":
     # gen_doc()
+    print('正在启动程序...')
     app = MyApplication(sys.argv)
     main_window = app.createMainWindow()
     main_window.show()
+    print('启动成功')
     sys.exit(app.exec())
@@ -0,0 +1,8 @@
+Set objShell = CreateObject("WScript.Shell")
+strFolder = objShell.CurrentDirectory
+
+strPythonPath = strFolder & "\runtime\python.exe"
+strScriptPath = strFolder & "\start.py"
+
+objShell.Run """" & strPythonPath & """ """ & strScriptPath & """"
+' Set objExec = objShell.Exec("""" & strPythonPath & """ """ & strScriptPath & """")
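The quadrupled quotes in objShell.Run build a command line in which both paths end up individually quoted. For comparison only, a Python launcher with list-style subprocess arguments avoids the manual quoting; the runtime\python.exe and start.py paths below assume the same relative layout the VBScript does.

import subprocess
from pathlib import Path

here = Path(__file__).resolve().parent
python_exe = here / 'runtime' / 'python.exe'
script = here / 'start.py'

# Each list element is passed to the child process as-is, so spaces in paths need no extra quoting
subprocess.Popen([str(python_exe), str(script)], cwd=here)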
Binary file not shown.
@@ -242,7 +242,7 @@ class Ui_MainWindow(object):
         self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
         self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
         self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
-        self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
+        self.lSize.setText(QCoreApplication.translate("MainWindow", u"5", None))
         self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u722c\u53d6", None))
         self.groupBox_7.setTitle(QCoreApplication.translate("MainWindow", u"\u603b\u9662\u5b98\u5fae", None))
         self.label_10.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u7ed3\u679cExcel:", None))
web3.py (104 changed lines)
@@ -33,40 +33,40 @@ def fix_url_scheme(url, default_scheme='http'):
         url = f'{default_scheme}://{url}'
     return url

 if __name__ == '__main__':
-    print('巡查任务开始。。。')
-    now = datetime.datetime.now()
-    month = now.month
+    # print('巡查任务开始。。。')
+    # now = datetime.datetime.now()
+    # month = now.month

-    print('正在组合微信公众号爬取内容。。。')
-    make_simple_csv_from_db()
-    make_wechat_articles_full()
-    print('公众号爬取内容组装完毕!')
+    # print('正在组合微信公众号爬取内容。。。')
+    # make_simple_csv_from_db()
+    # make_wechat_articles_full()
+    # print('公众号爬取内容组装完毕!')

-    print('开始进行网站爬取。。。')
+    # print('开始进行网站爬取。。。')

     df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
-    processes = []
+    # processes = []

-    # Register the SIGINT signal handler
-    signal.signal(signal.SIGINT, sigint_handler)
+    # # Register the SIGINT signal handler
+    # signal.signal(signal.SIGINT, sigint_handler)

-    ind = 0
-    for ind, row in df.iterrows():
-        group = row['单位']
-        name = row['主办']
-        url = fix_url_scheme(row['地址'].strip())
-        domain = urlparse(url).netloc.replace('www.', '')
-        if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # skip these sites outright
-            continue
-        output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
-        # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
-        cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
-        process = subprocess.Popen(cmd)
-        processes.append(process)
+    # ind = 0
+    # for ind, row in df.iterrows():
+    #     group = row['单位']
+    #     name = row['主办']
+    #     url = fix_url_scheme(row['地址'].strip())
+    #     domain = urlparse(url).netloc.replace('www.', '')
+    #     if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # skip these sites outright
+    #         continue
+    #     output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+    #     # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
+    #     cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
+    #     process = subprocess.Popen(cmd)
+    #     processes.append(process)

-    # Wait for all processes to finish
-    for process in processes:
-        process.wait()
+    # # Wait for all processes to finish
+    # for process in processes:
+    #     process.wait()

     print('网站爬取结束,校验中。。。')
     # Check output file sizes and save information if size is less than 20KB
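The context line above refers to checking output file sizes and recording anything under 20 KB; that check falls outside this hunk, so the following is only a hypothetical sketch of such a pass, reusing the web_dir layout and the failed_sites.xlsx file seen earlier (the Chrome-based fallback crawler appears to read a list like this to re-crawl failed sites).

import os
import pandas as pd

def collect_small_outputs(web_dir, failed_sites_file, min_kb=20):
    # Flag crawl outputs that are suspiciously small so they can be re-crawled later
    rows = []
    for fname in os.listdir(web_dir):
        path = os.path.join(web_dir, fname)
        if fname.endswith('.xlsx') and os.path.getsize(path) < min_kb * 1024:
            rows.append({'file': fname, 'size_kb': round(os.path.getsize(path) / 1024, 1)})
    pd.DataFrame(rows).to_excel(failed_sites_file, index=False)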
@@ -90,30 +90,30 @@ if __name__ == '__main__':

     print('网站爬取完毕!')

-    print('开始对比分析所有内容。。。')
-    # Run WeChat Analysis
-    wechat_results = ana_wechat()
-    # Run Web Content Analysis
-    web_results = ana_web()
+    # print('开始对比分析所有内容。。。')
+    # # Run WeChat Analysis
+    # wechat_results = ana_wechat()
+    # # Run Web Content Analysis
+    # web_results = ana_web()

-    # Save results in an Excel file with two sheets
-    output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
-    # with pd.ExcelWriter(output_excel_path) as writer:
-    #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
-    #     df.to_excel(writer, sheet_name='公众号', index=False)
-    #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
-    #     df2.to_excel(writer, sheet_name='网站', index=False)
-    template_path = os.path.join(output_dir, 'template.xlsx')
-    workbook = load_workbook(template_path)
+    # # Save results in an Excel file with two sheets
+    # output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
+    # # with pd.ExcelWriter(output_excel_path) as writer:
+    # #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # #     df.to_excel(writer, sheet_name='公众号', index=False)
+    # #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # #     df2.to_excel(writer, sheet_name='网站', index=False)
+    # template_path = os.path.join(output_dir, 'template.xlsx')
+    # workbook = load_workbook(template_path)

-    # Select the worksheets to work on
-    wechat_sheet = workbook['公众号']
-    web_sheet = workbook['网站']
-    for row in wechat_results:
-        wechat_sheet.append(row)
-    for row in web_results:
-        web_sheet.append(row)
-    workbook.save(output_excel_path)
-    workbook.close()
-    print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
-    os.system("pause")
+    # # Select the worksheets to work on
+    # wechat_sheet = workbook['公众号']
+    # web_sheet = workbook['网站']
+    # for row in wechat_results:
+    #     wechat_sheet.append(row)
+    # for row in web_results:
+    #     web_sheet.append(row)
+    # workbook.save(output_excel_path)
+    # workbook.close()
+    # print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
+    # os.system("pause")
@@ -4,6 +4,7 @@
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 import os.path
 from openpyxl import Workbook, load_workbook
+from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

 # useful for handling different item types with a single interface
 from scrapy.exceptions import IgnoreRequest
@@ -58,6 +59,7 @@ class ZcspiderPipeline:
         # except:
         #     self.conn.rollback()
         #     raise
+        item['text'] = ILLEGAL_CHARACTERS_RE.sub(r'', item['text'])
         line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
         self.ws.append(line)
         return item
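ILLEGAL_CHARACTERS_RE is openpyxl's pre-compiled pattern for control characters that the XLSX format cannot store; substituting them away before ws.append prevents IllegalCharacterError when scraped page text contains stray control bytes. A minimal demonstration of the same call the pipeline now makes:

from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

wb = Workbook()
ws = wb.active

scraped = 'page text with a control character \x0b inside'
clean = ILLEGAL_CHARACTERS_RE.sub('', scraped)  # strips characters Excel refuses to store
ws.append(['group', 'name', 'domain', 'url', clean])  # same column order as the pipeline above
wb.save('demo.xlsx')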
@@ -96,7 +96,7 @@ DEFAULT_REQUEST_HEADERS = {
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = 'gb18030'
-LOG_LEVEL = 'WARNING'
+LOG_LEVEL = 'DEBUG'
 DOWNLOAD_TIMEOUT = 10

 ITEM_PIPELINES = {
@@ -111,3 +111,6 @@ DOWNLOADER_MIDDLEWARES = {
     'zcspider.middlewares.FilterHTMLMiddleware': 200,
     # other downloader middlewares ...
 }
+EXTENSIONS = {
+    'scrapy.extensions.telnet.TelnetConsole': None
+}