更新爬虫方案文档,增加服务与支持-详细页面的输出信息;优化基础爬虫类,增强标题提取和内容去重逻辑;根据doc2md.py调整图片处理逻辑以改善Word文档生成效果。

This commit is contained in:
oy2020
2026-01-31 09:30:33 +08:00
parent 3670129972
commit 3c625d1c3a
5 changed files with 332 additions and 58 deletions

View File

@@ -77,16 +77,22 @@ class ProductCrawler(BaseCrawler):
if os.path.exists(local_path):
try:
# 图片前添加空行
doc.add_paragraph()
doc.add_picture(local_path, width=Inches(4.5))
doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
# 图片后添加空行
doc.add_paragraph()
except Exception as e:
print(f" Word插入图片失败: {local_path} - {e}")
elif element.name.startswith('h'):
text = element.get_text(strip=True)
if text and '零差云控' not in text:
# HTML h1-h6 直接映射到 Word Heading 1-6
# 限制在 1-9 范围内(Word 支持的最大标题级别)
level = int(element.name[1])
doc.add_heading(text, level=min(level + 1, 9))
doc.add_heading(text, level=min(level, 9))
elif element.name == 'table':
# 处理表格