更新爬虫方案文档,增加服务与支持-详细页面的输出信息;优化基础爬虫类,增强标题提取和内容去重逻辑;根据doc2md.py调整图片处理逻辑以改善Word文档生成效果。
This commit is contained in:
@@ -77,16 +77,22 @@ class ProductCrawler(BaseCrawler):
|
||||
|
||||
if os.path.exists(local_path):
|
||||
try:
|
||||
# 图片前添加空行
|
||||
doc.add_paragraph()
|
||||
doc.add_picture(local_path, width=Inches(4.5))
|
||||
doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
# 图片后添加空行
|
||||
doc.add_paragraph()
|
||||
except Exception as e:
|
||||
print(f" Word插入图片失败: {local_path} - {e}")
|
||||
|
||||
elif element.name.startswith('h'):
|
||||
text = element.get_text(strip=True)
|
||||
if text and '零差云控' not in text:
|
||||
# HTML h1-h6 直接映射到 Word Heading 1-6
|
||||
# 限制在 1-9 范围内(Word 支持的最大标题级别)
|
||||
level = int(element.name[1])
|
||||
- doc.add_heading(text, level=min(level + 1, 9))
|
||||
+ doc.add_heading(text, level=min(level, 9))
|
||||
|
||||
elif element.name == 'table':
|
||||
# 处理表格
|
||||
|
||||
Reference in New Issue
Block a user