优化文档导出层级与链接保真，统一正文标题映射并增强 Word 段落超链接处理。

同时移除不再使用的文档后处理依赖，减少汇总导出流程中的冗余步骤。

Made-with: Cursor
This commit is contained in:
Oo
2026-03-30 10:32:34 +08:00
parent d257cbaed3
commit 9e14b56275
3 changed files with 142 additions and 93 deletions

View File

@@ -136,6 +136,11 @@ class ProductCrawler(BaseCrawler):
h2.decompose()
break # 只移除第一个匹配的
# 计算正文标题映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
heading_elements = content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else []
level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)}
for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
# 跳过嵌套元素
if element.find_parent(['table', 'li']):
@@ -162,16 +167,11 @@ class ProductCrawler(BaseCrawler):
elif element.name.startswith('h'):
text = element.get_text(strip=True)
if text and '零差云控' not in text:
# 对于页面内容中的标题：h1 转换为 Heading 2，h2-h6 保持原层级
# 因为页面标题已经是 Heading 1，所以内容中的 h1 应该降级为 Heading 2
# 正文标题统一映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
original_level = int(element.name[1])
if original_level == 1:
# 页面内容中的 h1 转换为 Heading 2
word_level = 2
print(f" 标题层级转换: h1 '{text}' → Heading 2")
else:
# h2-h6 保持原层级：h2→Heading 2, h3→Heading 3, ...
word_level = original_level
word_level = level_map.get(original_level, 3)
if word_level != original_level:
print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}")
doc.add_heading(text, level=min(word_level, 9))
elif element.name == 'table':
@@ -179,21 +179,15 @@ class ProductCrawler(BaseCrawler):
self._add_table_to_docx(doc, element)
elif element.name == 'li':
text = element.get_text(strip=True)
if text:
doc.add_paragraph(f"{text}")
self._add_paragraph_with_links(doc, element, prefix="")
elif element.name == 'p':
text = element.get_text(strip=True)
if text:
doc.add_paragraph(text)
self._add_paragraph_with_links(doc, element)
elif element.name == 'div':
# 处理特殊的 div 内容块
if element.get('class') and any('param' in c for c in element.get('class', [])):
text = element.get_text(strip=True)
if text:
doc.add_paragraph(text)
self._add_paragraph_with_links(doc, element)
def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
"""