From a877b198f6c4b3df008749ad513084f4a5d75ada Mon Sep 17 00:00:00 2001
From: xiaobulu27
Date: Thu, 24 Aug 2023 10:55:58 +0800
Subject: [PATCH] web.py changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scrape_nj.py => scrape.py | 45 ++++++++++++++++++++++-----------------
 summary/summary juin.md   | 23 --------------------
 2 files changed, 26 insertions(+), 42 deletions(-)
 rename scrape_nj.py => scrape.py (81%)
 delete mode 100644 summary/summary juin.md

diff --git a/scrape_nj.py b/scrape.py
similarity index 81%
rename from scrape_nj.py
rename to scrape.py
index ed7628c..574263f 100644
--- a/scrape_nj.py
+++ b/scrape.py
@@ -10,6 +10,8 @@ def open_website(url):
     # Set up Chrome WebDriver with custom User-Agent
     options = Options()
     options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
+    prefs = {"profile.managed_default_content_settings.images": 2, 'notifications':2}
+    options.add_experimental_option("prefs", prefs)
     driver = webdriver.Chrome("./chromedriver.exe", options=options)
     driver.get(url)
     return driver
@@ -84,26 +86,26 @@ def check_href(href, original_url, visited_pages):
     parsed_href = urlparse(href)
     parsed_original_url = urlparse(original_url)
     # Check if the href leads back to the original page
-    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path and parsed_href.fragment == parsed_original_url.fragment:
+    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
         return True
     # Check if the href has already been visited
     if href in visited_pages:
         return True
     return False
 
 
-def export_to_excel(data):
-    # Create separate lists for URL, Domain, and Content
-    domains = [item[0] for item in data]
-    urls = [item[1] for item in data]
-    texts = [item[2] for item in data]
+def export_to_excel(data, output_filename):
+    # Create separate lists for each column
+    groups = [item[0] for item in data]
+    names = [item[1] for item in data]
+    domains = [item[2] for item in data]
+    urls = [item[3] for item in data]
+    texts = [item[4] for item in data]
     # Create a DataFrame from the data lists
-    df = pd.DataFrame({'domain': domains, 'url': urls, 'text': texts})
+    df = pd.DataFrame({'group': groups, 'name': names, 'domain': domains, 'url': urls, 'text': texts})
     # Export the DataFrame to an Excel file
-    df.to_excel('output.xlsx', index=False)
-
-
+    df.to_excel(output_filename, index=False)
 
 
 def get_cookies_from_previous_session(driver):
     cookies = {}
@@ -128,13 +130,16 @@ def add_cookies(driver, cookies):
         driver.add_cookie({'name': name, 'value': value})
 
 def main():
-    # Starting URL
-    start_url = 'https://www.cbma.com.cn/'
-    # Parse the domain from the starting URL
-    parsed_start_url = urlparse(start_url)
-    start_domain = parsed_start_url.netloc
+    # Read failed URLs from the list
+    df = pd.read_excel('failed_files.xlsx')
+
+    for ind, row in df.iterrows():
+        group = row['单位']  # Replace with the actual column name for group
+        name = row['主办']
+        url = row['地址']
+        domain = urlparse(url).netloc
     # Open the website
-    driver = open_website(start_url)
+        driver = open_website(url)
     # Retrieve cookies from previous session
     cookies = get_cookies_from_previous_session(driver)
     # Add cookies to the WebDriver
@@ -144,9 +149,11 @@ def main():
     # Initialize the data list
     data = []
     # Process the starting page and follow hyperlinks recursively
-    process_page(driver, start_url, visited_pages, start_domain, data)
-    # Export the data to an Excel file
-    export_to_excel(data)
+        process_page(driver, url, visited_pages, domain, data)
+        # Export data to a separate Excel file for each URL
+        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        export_to_excel(data, output_filename)
+
     # Close the WebDriver
     driver.quit()
 
diff --git a/summary/summary juin.md b/summary/summary juin.md
deleted file mode 100644
index dd6b02b..0000000
--- a/summary/summary juin.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Project summary 2023.6
-
-Crawling ran June 5-9; analysis ran June 12-13.
-
-## Organizations covered
-
-Data was crawled and analyzed for two outlet types, official websites and WeChat official accounts: 69 official websites (63 crawled successfully) and 102 WeChat official accounts.
-For official websites, every link under the same domain was crawled; for WeChat official accounts, all historical articles were crawled.
-
-## Analysis results
-
-Analysis was carried out per the analysis requirements: 187 errors were found on the official websites and 39 on the WeChat official accounts. See the results table for details.
-
-## Outstanding issues
-
-Some websites could not be crawled for various reasons, listed below.
-| Organization | Possible cause |
-| ---- | ---- |
-| 中国建筑材料科学研究总院有限公司_http://www.cbma.com | Not accessible |
-| 中国建材检验认证集团江苏有限公司_http://www.ctcjs.com | Not accessible |
-| 乌鲁木齐京诚检测技术有限公司_http://www.wlmqjc.cn/ | Website domain expired |
-| 中材江西电瓷电气有限公司_http://www.sinoma-insulator.com | Not accessible |
-| 中国新型建材设计研究院有限公司_http://www.cnhdi.com/ | Not accessible |
\ No newline at end of file
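
Note on the new prefs block in open_website(): setting profile.managed_default_content_settings.images to 2 tells Chrome to block image downloads, which speeds up a text-only crawl. Below is a minimal standalone sketch of the same option wiring, assuming the Selenium 3-style constructor and the ./chromedriver.exe path the script already uses; make_driver is a hypothetical helper, not part of the patch.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def make_driver(chromedriver_path='./chromedriver.exe'):
    options = Options()
    # Spoof a desktop Chrome User-Agent, mirroring the script.
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
    # 2 = block; image downloads are skipped while HTML and text still load.
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # Selenium 3-style positional executable path, as in the patched script.
    return webdriver.Chrome(chromedriver_path, options=options)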
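
Note on the reworked main() and export_to_excel(): each row of failed_files.xlsx (columns 单位, 主办, 地址) is crawled separately and its records are written to web_dir/{name}_{domain}.xlsx with columns group, name, domain, url, text. Below is a rough sketch of that per-row read-and-export flow under the same assumptions; the crawl itself is stubbed out since process_page() is unchanged, and the os.makedirs guard is an extra step the patch does not add (it assumes web_dir/ already exists).

import os
from urllib.parse import urlparse

import pandas as pd

def export_failed_list(input_xlsx='failed_files.xlsx', out_dir='web_dir'):
    os.makedirs(out_dir, exist_ok=True)  # not in the patch; avoids failing on a missing folder
    df = pd.read_excel(input_xlsx)
    for _, row in df.iterrows():
        group, name, url = row['单位'], row['主办'], row['地址']
        domain = urlparse(url).netloc
        # Stand-in for the (group, name, domain, url, text) tuples that
        # process_page() would append while crawling.
        data = [(group, name, domain, url, '')]
        out = pd.DataFrame(data, columns=['group', 'name', 'domain', 'url', 'text'])
        out.to_excel(os.path.join(out_dir, f'{name}_{domain}.xlsx'), index=False)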