优化文档导出层级与链接保真,统一正文标题映射并增强 Word 段落超链接处理。
同时移除不再使用的文档后处理依赖,减少汇总导出流程中的冗余步骤。

Made-with: Cursor
This commit is contained in:
@@ -136,6 +136,11 @@ class ProductCrawler(BaseCrawler):
|
||||
h2.decompose()
|
||||
break # 只移除第一个匹配的
|
||||
|
||||
# 计算正文标题映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
heading_elements = content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
|
||||
unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else []
|
||||
level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)}
|
||||
|
||||
for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
|
||||
# 跳过嵌套元素
|
||||
if element.find_parent(['table', 'li']):
|
||||
@@ -162,16 +167,11 @@ class ProductCrawler(BaseCrawler):
|
||||
elif element.name.startswith('h'):
|
||||
text = element.get_text(strip=True)
|
||||
if text and '零差云控' not in text:
|
||||
# 对于页面内容中的标题,h1 转换为 Heading 2,h2-h6 保持原层级
|
||||
# 因为页面标题已经是 Heading 1,所以内容中的 h1 应该降级为 Heading 2
|
||||
# 正文标题统一映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
original_level = int(element.name[1])
|
||||
if original_level == 1:
|
||||
# 页面内容中的 h1 转换为 Heading 2
|
||||
word_level = 2
|
||||
print(f" 标题层级转换: h1 '{text}' → Heading 2")
|
||||
else:
|
||||
# h2-h6 保持原层级(h2→Heading 2, h3→Heading 3, ...)
|
||||
word_level = original_level
|
||||
word_level = level_map.get(original_level, 3)
|
||||
if word_level != original_level:
|
||||
print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}")
|
||||
doc.add_heading(text, level=min(word_level, 9))
|
||||
|
||||
elif element.name == 'table':
|
||||
@@ -179,21 +179,15 @@ class ProductCrawler(BaseCrawler):
|
||||
self._add_table_to_docx(doc, element)
|
||||
|
||||
elif element.name == 'li':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(f"• {text}")
|
||||
self._add_paragraph_with_links(doc, element, prefix="• ")
|
||||
|
||||
elif element.name == 'p':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(text)
|
||||
self._add_paragraph_with_links(doc, element)
|
||||
|
||||
elif element.name == 'div':
|
||||
# 处理特殊的 div 内容块
|
||||
if element.get('class') and any('param' in c for c in element.get('class', [])):
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(text)
|
||||
self._add_paragraph_with_links(doc, element)
|
||||
|
||||
def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user