diff --git a/main.ui b/main.ui
index 00dc8f7..94d160a 100644
--- a/main.ui
+++ b/main.ui
@@ -478,7 +478,7 @@
-          20
+          5
diff --git a/mycode/crawl_chrome.py b/mycode/crawl_chrome.py
index d5a6603..34d06e0 100644
--- a/mycode/crawl_chrome.py
+++ b/mycode/crawl_chrome.py
@@ -7,6 +7,7 @@ from pathlib import Path
 import pandas as pd
 from .base import BASE_DIR
 import os
+from selenium.common.exceptions import TimeoutException
 
 chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
 failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
@@ -160,7 +161,6 @@ def add_cookies(driver, cookies):
         driver.add_cookie({'name': name, 'value': value})
 
 def chrom_main_from_list(sites):
-    driver = init_driver()
     for ind, item in enumerate(sites):
         group = item[0]  # Replace with the actual column name for group
         name = item[1]
@@ -169,29 +169,40 @@ def chrom_main_from_list(sites):
         if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:
             continue
         url = fix_url_scheme(url)
-        print(url)
+        driver = init_driver()
         # Open the website
-        driver.get(url)
+        # driver.get(url)
 
-        # Retrieve cookies from previous session
-        cookies = get_cookies_from_previous_session(driver)
-        # Add cookies to the WebDriver
-        add_cookies(driver, cookies)
+        # # Retrieve cookies from previous session
+        # cookies = get_cookies_from_previous_session(driver)
+        # # Add cookies to the WebDriver
+        # add_cookies(driver, cookies)
 
         # Initialize the set to store visited pages
         visited_pages = set()
         # Initialize the data list
         data = []
+        try:
+            # Set the page-load timeout to 10 seconds
+            driver.set_page_load_timeout(10)
+            # Set the script-execution timeout to 10 seconds
+            driver.set_script_timeout(10)
 
-        # Process the starting page and follow hyperlinks recursively
-        process_page(driver, url, visited_pages, domain, data, group, name)
+            # Process the starting page and follow hyperlinks recursively
+            process_page(driver, url, visited_pages, domain, data, group, name)
+        except TimeoutException:
+            # On timeout, log it and fall through so this site is skipped
+            print("超时异常")
+        # Quit the per-site driver whether or not the crawl timed out,
+        # otherwise every successful site would leak a Chrome process
+        driver.quit()
 
         # Export data to a separate Excel file in the web_dir directory
         output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
         export_to_excel(data, output_filename)
 
     # Close the WebDriver
-    driver.quit()
+    # driver.quit()
 
 def chrome_main():
     # Read failed URLs from the list
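Note on the crawl_chrome.py hunk: the driver is now created per site and each crawl is bounded by Selenium's page-load and script timeouts. A minimal sketch of the same pattern (the URL and the 10-second budget are placeholders; try/finally is used here so the browser is released on any failure, not only a timeout):

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException

    driver = webdriver.Chrome()
    try:
        driver.set_page_load_timeout(10)  # driver.get() raises TimeoutException after 10 s
        driver.set_script_timeout(10)     # same budget for injected scripts
        driver.get('http://example.com')  # placeholder URL
    except TimeoutException:
        print('page load timed out, skipping this site')
    finally:
        driver.quit()  # always release the browser process

Quitting inside the loop, rather than once after it as before, is what keeps one hung site from poisoning the driver for every site that follows.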
diff --git a/mycode/main.py b/mycode/main.py
index b3ecc6c..4345c97 100644
--- a/mycode/main.py
+++ b/mycode/main.py
@@ -2,6 +2,8 @@ import pandas as pd
 import os
 import sqlite3
 from mycode.base import BASE_DIR
+import re
+from openpyxl import load_workbook
 
 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
@@ -23,6 +25,98 @@ def make_simple_csv_from_db():
     df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)
 
 
+def get_cbma_info_from_db_and_ana(year: str = '2023'):
+    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
+    query = f'''
+        SELECT
+            id,
+            strftime('%Y年%m月%d日', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_date,
+            g.nickname,
+            a.title,
+            a.content_url,
+            a.read_num
+        FROM
+            articles a
+        LEFT JOIN
+            gzhs g ON g.biz = a.biz
+        WHERE
+            pub_date > '{year}'
+        AND
+            g.biz = 'MzIzMDU4Njg3MA=='
+        ORDER BY
+            pub_date
+    '''
+    df = pd.read_sql_query(query, conn)
+    # Close the database connection
+    conn.close()
+    # Pre-create the column so rows without a match hold '' instead of NaN
+    df['source'] = ''
+    for ind, row in df.iterrows():
+        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
+        try:
+            with open(full_path, encoding='utf-8') as f:
+                content = f.read()
+            # Extract the source line ("来源丨...") from the article body
+            a_match = re.findall('来源丨(.*?)\n', content)
+            if a_match:
+                # Collapse NBSP/whitespace runs into a single '、' separator
+                a = re.sub(r'[\xa0\s]+', '、', a_match[0])
+                df.at[ind, 'source'] = a
+        except FileNotFoundError:
+            print(full_path + '---不存在')
+    # Fill the results into the Excel template
+    template_path = os.path.join(BASE_DIR, 'summary/template_cbma.xlsx')
+    workbook = load_workbook(template_path)
+    sheet = workbook['公众号更新数']
+    sheet.cell(row=1, column=1, value=f'关于{year}年度中国建材总院新媒体更新情况明细表\n(官微)')
+    for ind, row in df.iterrows():
+        sheet.cell(row=ind+3, column=1, value=str(ind+1))
+        sheet.cell(row=ind+3, column=2, value=row['pub_date'])
+        sheet.cell(row=ind+3, column=3, value=row['title'])
+        sheet.cell(row=ind+3, column=4, value=row['source'])
+        sheet.cell(row=ind+3, column=6, value=row['read_num'])
+        sheet.cell(row=ind+3, column=7, value=row['content_url'])
+    output_path = os.path.join(BASE_DIR, f'summary/{year}年_cbma.xlsx')
+    workbook.save(output_path)
+    # Tally how many articles each subsidiary contributed
+    institutes = ['瑞泰科技', '国检集团', '中材高新', '哈玻院', '中国新材院',
+                  '秦皇岛院', '西安墙材院', '咸阳陶瓷院', '钟表所', '总院北分',
+                  '中岩科技', '水泥新材院', '中建材科创院', '科建苑']
+    template_cal_path = os.path.join(BASE_DIR, 'summary/tempalte_cbma_cal.xlsx')
+    workbook2 = load_workbook(template_cal_path)
+    sheet2 = workbook2['打分表']
+    sheet2.cell(row=1, column=1, value=f'中国建材总院宣传工作计分表({year}年度)')
+    # Counts go into row 6, every other column starting at column 5;
+    # int() is needed because openpyxl rejects numpy integer types
+    for i, inst in enumerate(institutes):
+        count = int(df['source'].str.contains(inst, na=False).sum())
+        sheet2.cell(row=6, column=5 + 2 * i, value=count)
+    output_path2 = os.path.join(BASE_DIR, f'summary/{year}年_cbma_cal.xlsx')
+    workbook2.save(output_path2)
+    return output_path, output_path2
+
+
 def make_wechat_articles_full():
     df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
     df['content'] = ''
@@ -110,5 +204,5 @@ def ana_web():
     return output_data
 
 if __name__ == "__main__":
-    ana_web()
+    get_cbma_info_from_db_and_ana()
diff --git a/网络巡查.bat b/start.bat
similarity index 100%
rename from 网络巡查.bat
rename to start.bat
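Note on get_cbma_info_from_db_and_ana: the per-institute counts rely on pandas' vectorised substring test. A toy example of the idiom (made-up data; na=False makes articles without a recognised source count as non-matches instead of propagating NaN into the sum):

    import pandas as pd

    df = pd.DataFrame({'source': ['瑞泰科技、国检集团', None, '国检集团']})
    count = int(df['source'].str.contains('国检集团', na=False).sum())
    print(count)  # 2

The int() cast matters when the count is written back with openpyxl, which refuses numpy integers.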
diff --git a/start.py b/start.py
index 847f0aa..f089e71 100644
--- a/start.py
+++ b/start.py
@@ -7,7 +7,7 @@ import win32com.client as win32
 import subprocess
 import os
 import datetime
-from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
+from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir, get_cbma_info_from_db_and_ana
 from mycode.crawl_chrome import chrom_main_from_list
 import pandas as pd
 from urllib.parse import urlparse
@@ -139,12 +139,6 @@ class MyThread(QThread):
             output = p.stdout.readline()
             if output:
                 self.update_signal.emit({'msg': output.strip()})
-
-    def capture_err(self, p):
-        while self.running and p.poll() is None:
-            err = p.stderr.readline()
-            if err:
-                self.update_signal.emit({'msg': err.strip()})
 
     def run(self) -> None:
         self.update_signal.emit({'msg': '开始进行网站爬取...'})
@@ -158,13 +152,14 @@ class MyThread(QThread):
             output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
             # -u 代表不缓冲,直接输出
             cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
-            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
+            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, shell=False)
             self.processes.append(process)
             self.running = True
             getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
             getlog_thread.start()
-            getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
-            getlog_thread_err.start()
 
         for process in self.processes:
             process.wait()
@@ -205,12 +200,15 @@ class MainWindow(QMainWindow):
         self.ui.setupUi(self)
         self.ui.lSize.setValidator(QIntValidator())
         self.ui.bWechat.clicked.connect(self.open_wcplus)
-        self.ui.bWebSite.clicked.connect(self.open_websites_xlsx)
-        self.ui.bBiao.clicked.connect(self.open_biao_xlsx)
+        self.ui.bWebSite.clicked.connect(lambda: self.open_file(WEB_SITES_PATH))
+        self.ui.bBiao.clicked.connect(lambda: self.open_file(BIAO_PATH))
         self.ui.bStart.clicked.connect(self.start)
         self.ui.bAna.clicked.connect(self.start_ana)
-        self.ui.bRes1.clicked.connect(self.open_res1)
-        self.ui.bRes2.clicked.connect(self.open_res2)
+        self.ui.bRes1.clicked.connect(lambda: self.open_file(self.ui.lRes1.text()))
+        self.ui.bRes2.clicked.connect(lambda: self.open_file(self.ui.lRes2.text(), 'docx'))
+        self.ui.bCal.clicked.connect(self.cbma_cal)
+        self.ui.bOpenCalRes1.clicked.connect(lambda: self.open_file(self.ui.lCalRes1.text()))
+        self.ui.bOpenCalRes2.clicked.connect(lambda: self.open_file(self.ui.lCalRes2.text()))
 
         self.ui.vLog.setModel(self.logModel)
         self.res1Workbook = None
@@ -219,31 +217,22 @@ class MainWindow(QMainWindow):
         subprocess.Popen('.\wcplus.exe')
         self.wcplus = True
 
-    def open_websites_xlsx(self):
-        app = win32.Dispatch("Excel.Application")
-        app.Visible = True
-        app.Workbooks.Open(WEB_SITES_PATH)
-        app.WindowState = 3
-
-    def open_biao_xlsx(self):
-        app = win32.Dispatch("Excel.Application")
-        app.Visible = True
-        app.Workbooks.Open(BIAO_PATH)
-        app.WindowState = 3
-
-    def open_res1(self):
-        if self.ui.lRes1.text():
-            app = win32.Dispatch("Excel.Application")
-            app.Visible = True
-            self.res1Workbook = app.Workbooks.Open(self.ui.lRes1.text())
-            app.WindowState = 3
-
-    def open_res2(self):
-        if self.ui.lRes2.text():
-            app = win32.Dispatch("Word.Application")
-            app.Visible = True
-            app.Documents.Open(self.ui.lRes2.text())
-            app.WindowState = 3
+    def open_file(self, path, ftype='xlsx'):
+        # Open path with Excel or Word via COM; ftype selects the application
+        if path:
+            if ftype == 'docx':
+                app = win32.Dispatch("Word.Application")
+                app.Visible = True
+                app.Documents.Open(path)
+                app.WindowState = 3
+            elif ftype == 'xlsx':
+                app = win32.Dispatch("Excel.Application")
+                app.Visible = True
+                app.Workbooks.Open(path)
+                app.WindowState = 3
 
     def get_time(self):
         now = datetime.datetime.now()
@@ -276,6 +265,14 @@ class MainWindow(QMainWindow):
         self.ana_thread.update_signal.connect(self.update_log)
         self.ana_thread.start()
 
+    def cbma_cal(self):
+        now_year = datetime.datetime.now().year
+        self.update_log({'msg': '正在分析本年总院官微数据...'})
+        # get_cbma_info_from_db_and_ana expects the year as a string
+        output_path, output_path2 = get_cbma_info_from_db_and_ana(str(now_year))
+        self.ui.lCalRes1.setText(output_path)
+        self.ui.lCalRes2.setText(output_path2)
+        self.update_log({'msg': '分析完毕!'})
+
     def update_log(self, rdict):
         if isinstance(rdict, str):
             self.log(f'{self.get_time()}-{rdict}', False)
@@ -316,7 +313,9 @@ class MainWindow(QMainWindow):
 
 if __name__ == "__main__":
     # gen_doc()
+    print('正在启动程序...')
    app = MyApplication(sys.argv)
     main_window = app.createMainWindow()
     main_window.show()
+    print('启动成功')
     sys.exit(app.exec())
\ No newline at end of file
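Note on the run() change in start.py: stderr is no longer piped, so the children's error output goes straight to the parent console and the dedicated capture_err thread becomes unnecessary; a piped stream that nothing drains can fill the OS pipe buffer and block the child. If the GUI log should still show errors, one alternative (a sketch, not what this patch does) is to merge stderr into the single stream that is already being read:

    import subprocess
    import threading

    # hypothetical command; mirrors the scrapy invocation above
    cmd = ['python', '-u', '-m', 'scrapy', 'crawl', 'basespider']
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,  # fold stderr into stdout
                            text=True)

    def pump(p):
        # one reader thread drains the merged stream so the child never blocks
        for line in p.stdout:
            print(line.strip())

    threading.Thread(target=pump, args=(proc,), daemon=True).start()
    proc.wait()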
diff --git a/start.vbs b/start.vbs
new file mode 100644
index 0000000..f33cbe9
--- /dev/null
+++ b/start.vbs
@@ -0,0 +1,8 @@
+Set objShell = CreateObject("WScript.Shell")
+strFolder = objShell.CurrentDirectory
+
+strPythonPath = strFolder & "\runtime\python.exe"
+strScriptPath = strFolder & "\start.py"
+
+objShell.Run """" & strPythonPath & """ """ & strScriptPath & """"
+' Set objExec = objShell.Exec("""" & strPythonPath & """ """ & strScriptPath & """")
diff --git a/summary/template_cbma.xlsx b/summary/template_cbma.xlsx
new file mode 100644
index 0000000..9bd26a7
Binary files /dev/null and b/summary/template_cbma.xlsx differ
diff --git a/ui_mainwindow.py b/ui_mainwindow.py
index 38d6bb9..72fa742 100644
--- a/ui_mainwindow.py
+++ b/ui_mainwindow.py
@@ -242,7 +242,7 @@ class Ui_MainWindow(object):
         self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
         self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
         self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
-        self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
+        self.lSize.setText(QCoreApplication.translate("MainWindow", u"5", None))
         self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u722c\u53d6", None))
         self.groupBox_7.setTitle(QCoreApplication.translate("MainWindow", u"\u603b\u9662\u5b98\u5fae", None))
         self.label_10.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u7ed3\u679cExcel:", None))
diff --git a/web3.py b/web3.py
index beeebf8..7580272 100644
--- a/web3.py
+++ b/web3.py
@@ -33,40 +33,40 @@ def fix_url_scheme(url, default_scheme='http'):
         url = f'{default_scheme}://{url}'
     return url
 
 if __name__ == '__main__':
-    print('巡查任务开始。。。')
-    now = datetime.datetime.now()
-    month = now.month
+    # print('巡查任务开始。。。')
+    # now = datetime.datetime.now()
+    # month = now.month
 
-    print('正在组合微信公众号爬取内容。。。')
-    make_simple_csv_from_db()
-    make_wechat_articles_full()
-    print('公众号爬取内容组装完毕!')
+    # print('正在组合微信公众号爬取内容。。。')
+    # make_simple_csv_from_db()
+    # make_wechat_articles_full()
+    # print('公众号爬取内容组装完毕!')
 
-    print('开始进行网站爬取。。。')
+    # print('开始进行网站爬取。。。')
     df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
-    processes = []
+    # processes = []
 
-    # 注册 SIGINT 信号处理函数
-    signal.signal(signal.SIGINT, sigint_handler)
+    # # 注册 SIGINT 信号处理函数
+    # signal.signal(signal.SIGINT, sigint_handler)
 
-    ind = 0
-    for ind, row in df.iterrows():
-        group = row['单位']
-        name = row['主办']
-        url = fix_url_scheme(row['地址'].strip())
-        domain = urlparse(url).netloc.replace('www.', '')
-        if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # 这几个网站直接跳过
-            continue
-        output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
-        # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
-        cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
-        process = subprocess.Popen(cmd)
-        processes.append(process)
+    # ind = 0
+    # for ind, row in df.iterrows():
+    #     group = row['单位']
+    #     name = row['主办']
+    #     url = fix_url_scheme(row['地址'].strip())
+    #     domain = urlparse(url).netloc.replace('www.', '')
+    #     if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # 这几个网站直接跳过
+    #         continue
+    #     output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+    #     # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
+    #     cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
+    #     process = subprocess.Popen(cmd)
+    #     processes.append(process)
 
-    # Wait for all processes to finish
-    for process in processes:
-        process.wait()
+    # # Wait for all processes to finish
+    # for process in processes:
+    #     process.wait()
 
     print('网站爬取结束,校验中。。。')
     # Check output file sizes and save information if size is less than 20KB
@@ -90,30 +90,30 @@ if __name__ == '__main__':
     print('网站爬取完毕!')
 
-    print('开始对比分析所有内容。。。')
-    # Run WeChat Analysis
-    wechat_results = ana_wechat()
-    # Run Web Content Analysis
-    web_results = ana_web()
+    # print('开始对比分析所有内容。。。')
+    # # Run WeChat Analysis
+    # wechat_results = ana_wechat()
+    # # Run Web Content Analysis
+    # web_results = ana_web()
 
-    # Save results in an Excel file with two sheets
-    output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
-    # with pd.ExcelWriter(output_excel_path) as writer:
-    #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
-    #     df.to_excel(writer, sheet_name='公众号', index=False)
-    #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
-    #     df2.to_excel(writer, sheet_name='网站', index=False)
-    template_path = os.path.join(output_dir, 'template.xlsx')
-    workbook = load_workbook(template_path)
+    # # Save results in an Excel file with two sheets
+    # output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
+    # # with pd.ExcelWriter(output_excel_path) as writer:
+    # #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # #     df.to_excel(writer, sheet_name='公众号', index=False)
+    # #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # #     df2.to_excel(writer, sheet_name='网站', index=False)
+    # template_path = os.path.join(output_dir, 'template.xlsx')
+    # workbook = load_workbook(template_path)
 
-    # 选择要操作的工作表
-    wechat_sheet = workbook['公众号']
-    web_sheet = workbook['网站']
-    for row in wechat_results:
-        wechat_sheet.append(row)
-    for row in web_results:
-        web_sheet.append(row)
-    workbook.save(output_excel_path)
-    workbook.close()
-    print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
-    os.system("pause")
\ No newline at end of file
+    # # 选择要操作的工作表
+    # wechat_sheet = workbook['公众号']
+    # web_sheet = workbook['网站']
+    # for row in wechat_results:
+    #     wechat_sheet.append(row)
+    # for row in web_results:
+    #     web_sheet.append(row)
+    # workbook.save(output_excel_path)
+    # workbook.close()
+    # print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
+    # os.system("pause")
\ No newline at end of file
diff --git a/zcspider/pipelines.py b/zcspider/pipelines.py
index 31db3fa..cdc6b24 100644
--- a/zcspider/pipelines.py
+++ b/zcspider/pipelines.py
@@ -4,6 +4,7 @@
 #   See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 import os.path
 from openpyxl import Workbook, load_workbook
+from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
 
 # useful for handling different item types with a single interface
 from scrapy.exceptions import IgnoreRequest
@@ -58,6 +59,7 @@ class ZcspiderPipeline:
         #     except:
         #         self.conn.rollback()
         #         raise
+        item['text'] = ILLEGAL_CHARACTERS_RE.sub(r'', item['text'])
         line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
         self.ws.append(line)
         return item
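Note on the pipelines.py hunk: openpyxl refuses to write ASCII control characters and raises IllegalCharacterError, which scraped page text can easily trigger; ILLEGAL_CHARACTERS_RE is the regex openpyxl itself uses for that check. A self-contained illustration (the filename is a placeholder):

    from openpyxl import Workbook
    from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

    wb = Workbook()
    ws = wb.active
    dirty = 'scraped text with a control char \x08 inside'
    ws.append([ILLEGAL_CHARACTERS_RE.sub('', dirty)])  # strip before writing
    wb.save('demo.xlsx')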
diff --git a/zcspider/settings.py b/zcspider/settings.py
index 1736130..9c604b8 100644
--- a/zcspider/settings.py
+++ b/zcspider/settings.py
@@ -96,7 +96,7 @@ DEFAULT_REQUEST_HEADERS = {
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = 'gb18030'
-LOG_LEVEL = 'WARNING'
+LOG_LEVEL = 'DEBUG'
 
 DOWNLOAD_TIMEOUT = 10
 ITEM_PIPELINES = {
@@ -110,4 +110,7 @@ FEED_EXPORTERS = {
 
 DOWNLOADER_MIDDLEWARES = {
     'zcspider.middlewares.FilterHTMLMiddleware': 200,
     # 其他下载中间件...
+}
+EXTENSIONS = {
+    'scrapy.extensions.telnet.TelnetConsole': None
 }
\ No newline at end of file
diff --git a/宣传巡查.exe b/宣传巡查.exe
new file mode 100644
index 0000000..dcb6ee2
Binary files /dev/null and b/宣传巡查.exe differ
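Note on the settings.py hunk: Scrapy's telnet console binds a local port in every crawler process (TELNETCONSOLE_PORT defaults to the 6023-6073 range), so disabling it is a sensible precaution when the GUI launches many scrapy subprocesses in parallel. Setting the extension entry to None, as the patch does, is equivalent to the dedicated flag:

    # in settings.py -- either form disables the telnet console
    TELNETCONSOLE_ENABLED = False

LOG_LEVEL = 'DEBUG' reads like a temporary debugging aid; WARNING is the quieter choice once the GUI's log view is only meant to surface problems.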