feat: 确保pdf下载完整
This commit is contained in:
parent
b9b469f917
commit
352966946e
|
|
@ -45,7 +45,12 @@ async def download_from_url_playwright(url: str, save_path: str) -> tuple[bool,
|
|||
# Response listener: when a response's "content-type" header contains
# "application/pdf", read its body into the enclosing `pdf_content` variable
# and print the captured size. An error raised while reading is printed
# instead of being propagated to the caller.
async def on_response(response):
|
||||
nonlocal pdf_content
|
||||
if "application/pdf" in response.headers.get("content-type", ""):
|
||||
try:
|
||||
# Await the complete response body before storing it
|
||||
pdf_content = await response.body()
|
||||
print(f"✓ 成功捕获 PDF,大小: {len(pdf_content)} bytes")
|
||||
except Exception as e:
|
||||
print(f"⚠ 读取 PDF 响应体失败: {e}")
|
||||
|
||||
page.on("response", on_response)
|
||||
|
||||
|
|
@ -92,16 +97,26 @@ async def download_from_url_playwright(url: str, save_path: str) -> tuple[bool,
|
|||
if not pdf_content:
|
||||
print("等待 PDF 响应...")
|
||||
try:
|
||||
await page.wait_for_response(
|
||||
response = await page.wait_for_response(
|
||||
lambda response: "application/pdf" in response.headers.get("content-type", ""),
|
||||
timeout=15000
|
||||
)
|
||||
# 确保响应体完全加载
|
||||
pdf_content = await response.body()
|
||||
print(f"✓ 通过 wait_for_response 获取 PDF,大小: {len(pdf_content)} bytes")
|
||||
except Exception as e:
|
||||
print(f"⚠ 等待 PDF 响应超时: {e}")
|
||||
|
||||
if pdf_content:
|
||||
# 验证文件大小(PDF 通常大于 10KB)
|
||||
pdf_size = len(pdf_content)
|
||||
if pdf_size < 10240:
|
||||
await browser.close()
|
||||
return False, f"PDF 文件过小: {pdf_size} bytes,可能下载不完整"
|
||||
|
||||
with open(save_path, "wb") as f:
|
||||
f.write(pdf_content)
|
||||
print(f"✓ PDF 已保存到: {save_path},大小: {pdf_size} bytes")
|
||||
await browser.close()
|
||||
return True, ""
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in New Issue