From 901e91d36624858747779393787991786f2961dd Mon Sep 17 00:00:00 2001
From: caoqianming
Date: Fri, 25 Aug 2023 16:24:56 +0800
Subject: [PATCH] feat: fix incorrect file paths in web3.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 readme.md | 1 +
 web3.py   | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/readme.md b/readme.md
index bcf2e2d..eb7cb65 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,6 @@
 ## Installation notice
 Be sure to uninstall the locally installed Chrome browser first, then install chrome117.exe from this folder
+biao.xlsx is the standard comparison file; it must be kept up to date
 
 ## Usage instructions
 Used for content inspection; run the following steps in order
diff --git a/web3.py b/web3.py
index 07d0dde..bad4f66 100644
--- a/web3.py
+++ b/web3.py
@@ -11,7 +11,8 @@
 from mycode.base import BASE_DIR
 from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
 from mycode.crawl_chrome import chrome_main, failed_sites_file
-
+python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
+scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')
 
 def save_info_to_excel(info_list, output_filename):
     df = pd.DataFrame(info_list, columns=['单位', '主办' , '地址'])
@@ -50,7 +51,7 @@ if __name__ == '__main__':
         url = row['地址']
         domain = urlparse(url).netloc.replace('www.', '')
         # output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
-        cmd = ['./runtime/Scripts/scrapy.exe', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
+        cmd = [scrapy_exe, 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
         # cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
         process = subprocess.Popen(cmd)
         processes.append(process)
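
Note on the fix: the old command resolved './runtime/Scripts/scrapy.exe' against the current working directory, so the crawl failed whenever web3.py was launched from anywhere other than the project root; joining the path onto BASE_DIR makes it absolute. Below is a minimal sketch of the corrected launch pattern, assuming BASE_DIR is the project root as imported from mycode.base in web3.py. The sites list and the trailing wait() loop are illustrative stand-ins, not the repository's exact code.

    import os
    import subprocess

    # Assumption: BASE_DIR is the absolute project root, as imported from
    # mycode.base in web3.py. Resolving scrapy.exe against it keeps the
    # command valid regardless of the caller's working directory.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')

    # Hypothetical site rows standing in for the entries read from biao.xlsx.
    sites = [('example-unit', 'example.com', 'https://example.com')]

    processes = []
    for name, domain, url in sites:
        # Launch one scrapy crawl per site, exactly as the patched cmd does,
        # using the absolute executable path instead of a relative one.
        cmd = [scrapy_exe, 'crawl', 'basespider',
               '-a', f'domain={domain}', '-a', f'start_url={url}',
               '-a', f'name={name}',
               '-o', f'web_dir/{name}_{domain}.xlsx']
        processes.append(subprocess.Popen(cmd))

    # Wait for every crawl subprocess to finish before post-processing.
    for p in processes:
        p.wait()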