feat: add chrom_main_from_list; do not save failed_sites
parent 8888e19fc3
commit 54019e9e5d
@@ -133,6 +133,36 @@ def add_cookies(driver, cookies):
     for name, value in cookies.items():
         driver.add_cookie({'name': name, 'value': value})


+def chrom_main_from_list(sites):
+    for ind, item in enumerate(sites):
+        group = item[0]  # Replace with the actual column name for group
+        name = item[1]
+        url = item[2]
+        domain = urlparse(url).netloc.replace("www.", "")
+
+        # Open the website
+        driver = open_website(url)
+
+        # Retrieve cookies from previous session
+        cookies = get_cookies_from_previous_session(driver)
+        # Add cookies to the WebDriver
+        add_cookies(driver, cookies)
+
+        # Initialize the set to store visited pages
+        visited_pages = set()
+        # Initialize the data list
+        data = []
+
+        # Process the starting page and follow hyperlinks recursively
+        process_page(driver, url, visited_pages, domain, data)
+
+        # Export data to a separate Excel file in the web_dir directory
+        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        export_to_excel(data, output_filename)
+
+        # Close the WebDriver
+        driver.quit()
+
+
 def chrome_main():
     # Read failed URLs from the list
     df = pd.read_excel(failed_sites_file)
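chrom_main_from_list expects each entry of sites to be an indexable row of the form (group, name, url), matching the item[0], item[1], item[2] reads above; web3.py further down now passes info_to_save to it directly. A minimal usage sketch with hypothetical rows:

    # Hypothetical rows of (group, name, url); group is read but not otherwise used in the snippet above
    sites = [
        ('gov', 'example-site', 'https://www.example.com'),
        ('edu', 'another-site', 'https://sub.example.org'),
    ]
    chrom_main_from_list(sites)  # each site is crawled and exported to web_dir/<name>_<domain>.xlsx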
@@ -1,12 +1,19 @@
 ## Installation notes
 Be sure to uninstall the locally installed Chrome browser first, then install chrome117.exe from this folder.
+
+Make sure Chrome cannot auto-update. Steps:<br/>
+Edit the file "C:\Windows\System32\drivers\etc\hosts" and append at the end
+127.0.0.1 update.googleapis.com<br/>
+After saving, press Win+R, type cmd to open a command prompt, and run "ipconfig /flushdns" to flush the DNS cache.
+
 biao.xlsx is the reference comparison file and must be updated regularly.


 ## Usage instructions
 Used for content inspection; follow the steps below in order.

 ### 1. Crawl official account content
-Run wechat.exe, then follow the steps after the page opens; see the video tutorial [https://www.bilibili.com/video/BV1kh411c7GX/]
+Run wechat.exe, then follow the steps after the page opens; see the video tutorial https://www.bilibili.com/video/BV1kh411c7GX/

 ### 2. Organize the official websites to crawl
 Open web_sites.xlsx, edit it, then save and close.
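The auto-update block described above can also be scripted instead of edited by hand; a minimal sketch in Python, assuming it runs in an elevated (Administrator) prompt on Windows:

    import subprocess

    hosts_path = r'C:\Windows\System32\drivers\etc\hosts'

    # Point Chrome's update host at localhost so the update check cannot resolve
    with open(hosts_path, 'a', encoding='ascii') as f:
        f.write('\n127.0.0.1 update.googleapis.com\n')

    # Flush the DNS cache, the same as running "ipconfig /flushdns" manually
    subprocess.run(['ipconfig', '/flushdns'], check=True)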
web3.py (7 changed lines)

@@ -9,7 +9,7 @@ from openpyxl import load_workbook

 from mycode.base import BASE_DIR
 from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
-from mycode.crawl_chrome import chrome_main, failed_sites_file
+from mycode.crawl_chrome import chrom_main_from_list, failed_sites_file

 python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
 # scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')

@@ -76,9 +76,8 @@ if __name__ == '__main__':

     if info_to_save:
         print('存在未爬取站点,正在调用Chrome继续爬取。。。')
-        save_info_to_excel(info_to_save, failed_sites_file)
-        chrome_main()
-        os.remove(failed_sites_file)
+        chrom_main_from_list(info_to_save)
+        # os.remove(failed_sites_file)

     print('网站爬取完毕!')
@@ -26,7 +26,7 @@ class BaseSpider(scrapy.Spider):
         self.name = name
         self.group = group
         self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
-        print(f"爬取开始: {name}-{domain}")
+        print(f"爬取开始: {name}_{domain}")

     def start_requests(self):
         for url in self.start_urls:

@@ -83,4 +83,4 @@ class BaseSpider(scrapy.Spider):

     def closed(self, reason):
         # This method will be called when the Spider is about to close
-        print(f'爬取完成: {self.name}-{self.domain}')
+        print(f'爬取完成: {self.name}_{self.domain}')