web.py changes
Commit a877b198f6 (parent a395ab6867)

This commit switches the crawler from a single hard-coded start URL to a batch re-crawl of previously failed URLs, extends the Excel export from three columns to five, and deletes the June 2023 project summary.
```diff
@@ -10,6 +10,8 @@ def open_website(url):
     # Set up Chrome WebDriver with custom User-Agent
     options = Options()
     options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
+    prefs = {"profile.managed_default_content_settings.images": 2, 'notifications':2}
+    options.add_experimental_option("prefs", prefs)
     driver = webdriver.Chrome("./chromedriver.exe", options=options)
     driver.get(url)
     return driver
```
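For context, a minimal standalone sketch of the updated `open_website` with the new `prefs` wired in. Two hedged notes: Chrome typically reads the notification setting from the full pref path `profile.default_content_setting_values.notifications`, so the bare `'notifications'` key in the diff may be silently ignored; and the positional chromedriver path is the Selenium 3 style call the diff already uses.

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def open_website(url):
    # Spoof a desktop Chrome User-Agent
    options = Options()
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
    prefs = {
        # 2 = block; skipping image downloads speeds up crawling
        "profile.managed_default_content_settings.images": 2,
        # assumed full pref path; the diff's bare 'notifications' key is likely a no-op
        "profile.default_content_setting_values.notifications": 2,
    }
    options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome("./chromedriver.exe", options=options)  # Selenium 3 style
    driver.get(url)
    return driver
```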
```diff
@@ -84,26 +86,26 @@ def check_href(href, original_url, visited_pages):
     parsed_href = urlparse(href)
     parsed_original_url = urlparse(original_url)
     # Check if the href leads back to the original page
-    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path and parsed_href.fragment == parsed_original_url.fragment:
+    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
         return True
     # Check if the href has already been visited
     if href in visited_pages:
         return True
     return False
 
-def export_to_excel(data):
-    # Create separate lists for URL, Domain, and Content
-    domains = [item[0] for item in data]
-    urls = [item[1] for item in data]
-    texts = [item[2] for item in data]
+def export_to_excel(data, output_filename):
+    # Create separate lists for each column
+    groups = [item[0] for item in data]
+    names = [item[1] for item in data]
+    domains = [item[2] for item in data]
+    urls = [item[3] for item in data]
+    texts = [item[4] for item in data]
     # Create a DataFrame from the data lists
-    df = pd.DataFrame({'domain': domains, 'url': urls, 'text': texts})
+    df = pd.DataFrame({'group': groups, 'name': names, 'domain': domains, 'url': urls, 'text': texts})
     # Export the DataFrame to an Excel file
-    df.to_excel('output.xlsx', index=False)
+    df.to_excel(output_filename, index=False)
 
-
-
 def get_cookies_from_previous_session(driver):
     cookies = {}
```
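Callers of the reworked `export_to_excel` must now pass 5-tuples in `(group, name, domain, url, text)` order plus an output path; a row in the old 3-tuple shape would raise an `IndexError` at `item[3]`. A small usage sketch with invented sample values:

```python
# Hypothetical sample row; the field order must match what
# export_to_excel unpacks by index: (group, name, domain, url, text).
data = [
    ("example-group", "example-name", "www.example.com",
     "https://www.example.com/page.html", "page text ..."),
]
export_to_excel(data, "example_output.xlsx")
```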
```diff
@@ -128,13 +130,16 @@ def add_cookies(driver, cookies):
         driver.add_cookie({'name': name, 'value': value})
 
 def main():
-    # Starting URL
-    start_url = 'https://www.cbma.com.cn/'
-    # Parse the domain from the starting URL
-    parsed_start_url = urlparse(start_url)
-    start_domain = parsed_start_url.netloc
-    # Open the website
-    driver = open_website(start_url)
-    # Retrieve cookies from previous session
-    cookies = get_cookies_from_previous_session(driver)
-    # Add cookies to the WebDriver
+    # Read failed URLs from the list
+    df = pd.read_excel('failed_files.xlsx')
+
+    for ind, row in df.iterrows():
+        group = row['单位']  # Replace with the actual column name for group
+        name = row['主办']
+        url = row['地址']
+        domain = urlparse(url).netloc
+        # Open the website
+        driver = open_website(url)
+        # Retrieve cookies from previous session
+        cookies = get_cookies_from_previous_session(driver)
+        # Add cookies to the WebDriver
```
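The new loop assumes `failed_files.xlsx` carries at least the columns 单位, 主办, and 地址; a missing column only surfaces as a `KeyError` on the first row. A quick pre-flight check, sketched here and not part of the commit:

```python
import pandas as pd

df = pd.read_excel('failed_files.xlsx')
required = ['单位', '主办', '地址']  # group, organizer, and URL columns read by main()
missing = [col for col in required if col not in df.columns]
if missing:
    raise KeyError(f"failed_files.xlsx is missing columns: {missing}")
```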
```diff
@@ -144,9 +149,11 @@ def main():
-    # Initialize the data list
-    data = []
-    # Process the starting page and follow hyperlinks recursively
-    process_page(driver, start_url, visited_pages, start_domain, data)
-    # Export the data to an Excel file
-    export_to_excel(data)
+        # Initialize the data list
+        data = []
+        # Process the starting page and follow hyperlinks recursively
+        process_page(driver, url, visited_pages, domain, data)
+        # Export data to a separate Excel file for each URL
+        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        export_to_excel(data, output_filename)
+
 
-    # Close the WebDriver
-    driver.quit()
+        # Close the WebDriver
+        driver.quit()
```
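Two practical caveats on the per-URL export: `pandas.DataFrame.to_excel` does not create the `web_dir` directory, and `name` values read from the spreadsheet may contain characters that are invalid in Windows filenames. A defensive sketch; the `re`-based sanitizer is an assumption, not part of the commit:

```python
import os
import re

os.makedirs('web_dir', exist_ok=True)                # ensure the output directory exists
safe_name = re.sub(r'[\\/:*?"<>|]', '_', str(name))  # replace characters Windows forbids
output_filename = f'web_dir/{safe_name}_{domain}.xlsx'
export_to_excel(data, output_filename)
```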
@ -1,23 +0,0 @@
|
||||||
# 项目总结2023.6
|
|
||||||
|
|
||||||
爬取时间为6月5-9日, 分析时间为6月12-13 日
|
|
||||||
|
|
||||||
## 涉及单位
|
|
||||||
|
|
||||||
目前爬取并分析了阵地类型为官方网站、微信公众号两种类型的数据,其中官网网站69个(成功爬取63个),微信公众号102个。
|
|
||||||
其中官方网站爬取了同域名下所有链接地址,微信公众号爬取了历史所有文章。
|
|
||||||
|
|
||||||
## 分析结果
|
|
||||||
|
|
||||||
根据分析要求进行得到分析结果,官方网站共发现错误187处,公众号共发现错误39处。具体见结果表
|
|
||||||
|
|
||||||
## 存在问题
|
|
||||||
|
|
||||||
目前存在部分网站因各种原因,未获取到数据,见下表
|
|
||||||
| 单位 | 可能原因 |
|
|
||||||
| ---- | ---- |
|
|
||||||
| 中国建筑材料科学研究总院有限公司_http://www.cbma.com | 不能访问 |
|
|
||||||
| 中国建材检验认证集团江苏有限公司_http://www.ctcjs.com | 不能访问 |
|
|
||||||
| 乌鲁木齐京诚检测技术有限公司_http://www.wlmqjc.cn/ | 网站域名过期 |
|
|
||||||
| 中材江西电瓷电气有限公司_http://www.sinoma-insulator.com | 不能访问 |
|
|
||||||
| 中国新型建材设计研究院有限公司_http://www.cnhdi.com/ | 不能访问 |
|
|