feat: 确保pdf下载完整

This commit is contained in:
caoqianming 2026-02-10 11:14:30 +08:00
parent b9b469f917
commit 352966946e
1 changed file with 17 additions and 2 deletions

View File

@@ -45,7 +45,12 @@ async def download_from_url_playwright(url: str, save_path: str) -> tuple[bool,
async def on_response(response): async def on_response(response):
nonlocal pdf_content nonlocal pdf_content
if "application/pdf" in response.headers.get("content-type", ""): if "application/pdf" in response.headers.get("content-type", ""):
try:
# 确保完全读取响应体
pdf_content = await response.body() pdf_content = await response.body()
print(f"✓ 成功捕获 PDF大小: {len(pdf_content)} bytes")
except Exception as e:
print(f"⚠ 读取 PDF 响应体失败: {e}")
page.on("response", on_response) page.on("response", on_response)
@@ -92,16 +97,26 @@ async def download_from_url_playwright(url: str, save_path: str) -> tuple[bool,
if not pdf_content: if not pdf_content:
print("等待 PDF 响应...") print("等待 PDF 响应...")
try: try:
await page.wait_for_response( response = await page.wait_for_response(
lambda response: "application/pdf" in response.headers.get("content-type", ""), lambda response: "application/pdf" in response.headers.get("content-type", ""),
timeout=15000 timeout=15000
) )
# 确保响应体完全加载
pdf_content = await response.body()
print(f"✓ 通过 wait_for_response 获取 PDF大小: {len(pdf_content)} bytes")
except Exception as e: except Exception as e:
print(f"⚠ 等待 PDF 响应超时: {e}") print(f"⚠ 等待 PDF 响应超时: {e}")
if pdf_content: if pdf_content:
# 验证文件大小PDF 通常大于 10KB
pdf_size = len(pdf_content)
if pdf_size < 10240:
await browser.close()
return False, f"PDF 文件过小: {pdf_size} bytes可能下载不完整"
with open(save_path, "wb") as f: with open(save_path, "wb") as f:
f.write(pdf_content) f.write(pdf_content)
print(f"✓ PDF 已保存到: {save_path},大小: {pdf_size} bytes")
await browser.close() await browser.close()
return True, "" return True, ""
else: else: