Improve document-export heading hierarchy and link fidelity: unify the body-heading mapping and strengthen hyperlink handling in Word paragraphs.

Also drop the now-unused document post-processing dependency, removing a redundant step from the combined export flow.
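A minimal sketch of the compression rule (the function name is illustrative, not part of the crawler's API): the distinct heading levels found in one page body are sorted and re-based at 3, so gaps collapse instead of propagating.

def compress_heading_levels(levels: list[int], base: int = 3, cap: int = 9) -> dict[int, int]:
    """Map the distinct heading levels of one page body onto a gap-free
    sequence starting at `base`, clamped at `cap` (6 for Markdown, 9 for Word)."""
    unique = sorted(set(levels))
    return {level: min(i + base, cap) for i, level in enumerate(unique)}

# A body using h1/h3/h6 maps to h3/h4/h5 (not h3/h5/h8):
assert compress_heading_levels([1, 3, 6]) == {1: 3, 3: 4, 6: 5}
# A body using h2/h4 maps to h3/h4 rather than h3/h5:
assert compress_heading_levels([2, 4]) == {2: 3, 4: 4}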

Made-with: Cursor
Oo
2026-03-30 10:32:34 +08:00
parent d257cbaed3
commit 9e14b56275
3 changed files with 142 additions and 93 deletions


@@ -1,6 +1,7 @@
# Dependencies for the 零差云控 official-site crawler
requests>=2.28.0
beautifulsoup4>=4.11.0
markdown>=3.6
markdownify>=0.11.0
python-docx>=0.8.11
lxml>=4.9.0


@@ -8,19 +8,20 @@ import time
import copy
import re
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString, Tag
import markdownify
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from urllib.parse import urljoin
from abc import ABC, abstractmethod
from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR
from .utils import ensure_dir, download_image, safe_filename, make_absolute_url
from .extract_abstract import generate_abstract
from .post_process import post_process_docx_headings
def _new_doc() -> Document:
@@ -354,6 +355,88 @@ class BaseCrawler(ABC):
img['src'] = full_url
return images_info
def process_links(self, content: BeautifulSoup, page_url: str):
"""
处理内容中的链接,将相对链接转换为绝对链接
Args:
content: 内容区域
page_url: 页面URL用于解析相对路径
"""
for a in content.find_all('a', href=True):
href = a.get('href', '').strip()
if not href:
continue
# Keep special links such as mailto/tel/anchor links
if href.startswith(('mailto:', 'tel:', '#', 'javascript:')):
continue
a['href'] = make_absolute_url(page_url, href)
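# For example (hypothetical URL; assuming make_absolute_url follows urljoin
# semantics): make_absolute_url('https://example.com/a/b.html', '../img/x.png')
# -> 'https://example.com/img/x.png'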
def _add_hyperlink(self, paragraph, text: str, url: str):
"""向段落添加可点击的超链接。"""
if not text or not url:
return
part = paragraph.part
r_id = part.relate_to(url, RT.HYPERLINK, is_external=True)
hyperlink = OxmlElement('w:hyperlink')
hyperlink.set(qn('r:id'), r_id)
run = OxmlElement('w:r')
r_pr = OxmlElement('w:rPr')
r_style = OxmlElement('w:rStyle')
r_style.set(qn('w:val'), 'Hyperlink')
r_pr.append(r_style)
run.append(r_pr)
text_element = OxmlElement('w:t')
text_element.text = text
run.append(text_element)
hyperlink.append(run)
paragraph._p.append(hyperlink)
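# For reference, the fragment appended above has roughly this shape
# ('rId7' is illustrative; the real id comes from part.relate_to):
#   <w:hyperlink r:id="rId7">
#     <w:r>
#       <w:rPr><w:rStyle w:val="Hyperlink"/></w:rPr>
#       <w:t>link text</w:t>
#     </w:r>
#   </w:hyperlink>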
def _append_inline_nodes_to_paragraph(self, paragraph, node):
"""递归写入段落内联内容,保留 a 标签为超链接。"""
if isinstance(node, NavigableString):
text = re.sub(r'\s+', ' ', str(node))
if text:
paragraph.add_run(text)
return
if not isinstance(node, Tag):
return
if node.name == 'br':
paragraph.add_run('\n')
return
if node.name == 'a':
link_text = node.get_text(' ', strip=True)
href = (node.get('href') or '').strip()
if link_text:
display_text = f"{link_text} ({href})" if href else link_text
if paragraph.text.strip():
paragraph.add_run(" ")
if href:
self._add_hyperlink(paragraph, display_text, href)
else:
paragraph.add_run(display_text)
return
for child in node.children:
self._append_inline_nodes_to_paragraph(paragraph, child)
def _add_paragraph_with_links(self, doc: Document, element: Tag, style: str | None = None, prefix: str = ""):
"""添加段落并保留其中超链接。"""
if not element.get_text(strip=True):
return
if style:
paragraph = doc.add_paragraph(style=style)
else:
paragraph = doc.add_paragraph()
if prefix:
paragraph.add_run(prefix)
if element.find('a'):
for child in element.children:
self._append_inline_nodes_to_paragraph(paragraph, child)
else:
paragraph.add_run(element.get_text(strip=True))
def content_to_markdown(self, content: BeautifulSoup, page_title: str = None) -> str:
"""
@@ -397,10 +480,18 @@ class BaseCrawler(ABC):
h2.decompose()
break  # only remove the first match
# Demote h1 in the page content to h2, consistent with the Word document handling
# The page title is already a level-2 heading (##), so h1 in the body should drop to level 2
for h1 in content_copy.find_all('h1'):
h1.name = 'h2'
# Unified body-heading mapping: each page's body starts at h3, compressed to consecutive levels (no gaps)
# Examples:
# - a body with h2/h4 maps to h3/h4, not h3/h5
# - a body with h1/h3/h6 maps to h3/h4/h5
body_headings = content_copy.find_all(re.compile(r'^h[1-6]$'))
if body_headings:
unique_levels = sorted({int(h.name[1]) for h in body_headings})
level_map = {level: min(i + 3, 6) for i, level in enumerate(unique_levels)}
for heading in body_headings:
original_level = int(heading.name[1])
new_level = level_map[original_level]
heading.name = f'h{new_level}'
return markdownify.markdownify(str(content_copy), heading_style="ATX")
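On the Markdown side the cap is h6; a hypothetical body using h1/h3/h6 comes out as a gap-free ATX outline:

import re
import markdownify
from bs4 import BeautifulSoup

content = BeautifulSoup('<div><h1>Intro</h1><h3>Setup</h3><h6>Notes</h6></div>', 'lxml').div
headings = content.find_all(re.compile(r'^h[1-6]$'))
level_map = {lv: min(i + 3, 6) for i, lv in enumerate(sorted({int(h.name[1]) for h in headings}))}
for h in headings:
    h.name = f"h{level_map[int(h.name[1])]}"
print(markdownify.markdownify(str(content), heading_style="ATX"))
# prints "### Intro", "#### Setup", "##### Notes" (h1/h3/h6 → h3/h4/h5)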
@@ -423,6 +514,11 @@ class BaseCrawler(ABC):
if h1_text == page_title:
first_h1.decompose() # 移除该标签
# Compute the body-heading mapping: each page starts at Heading 3, compressed to consecutive levels (no gaps)
heading_elements = content.find_all(re.compile(r'^h[1-6]$'))
unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else []
level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)}
# Process elements in document order to keep lists contiguous
for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'ul', 'ol', 'table']):
if element.name == 'img':
@@ -447,16 +543,11 @@ class BaseCrawler(ABC):
elif element.name.startswith('h'):
text = element.get_text(strip=True)
if text:
# For headings in the page content: h1 becomes Heading 2, h2-h6 keep their original level
# The page title is already Heading 1, so h1 in the body should drop to Heading 2
# Unified body-heading mapping: each page starts at Heading 3, compressed to consecutive levels (no gaps)
original_level = int(element.name[1])
if original_level == 1:
# h1 in the page content becomes Heading 2
word_level = 2
print(f" 标题层级转换: h1 '{text}' → Heading 2")
else:
# h2-h6 keep their original level: h2→Heading 2, h3→Heading 3, ...
word_level = original_level
word_level = level_map.get(original_level, 3)
if word_level != original_level:
print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}")
doc.add_heading(text, level=min(word_level, 9))
elif element.name in ['ul', 'ol']:
@@ -464,37 +555,29 @@ class BaseCrawler(ABC):
continue
elif element.name == 'li':
text = element.get_text(strip=True)
if text:
# Check whether the parent element is a ul or an ol
parent = element.find_parent(['ul', 'ol'])
is_ordered = parent and parent.name == 'ol'
# Use list styles
if is_ordered:
doc.add_paragraph(text, style='List Number')
else:
doc.add_paragraph(text, style='List Bullet')
# Check whether the parent element is a ul or an ol
parent = element.find_parent(['ul', 'ol'])
is_ordered = parent and parent.name == 'ol'
if is_ordered:
self._add_paragraph_with_links(doc, element, style='List Number')
else:
self._add_paragraph_with_links(doc, element, style='List Bullet')
elif element.name == 'table':
# Handle tables: build a real Word table structure (easier for doc2md.py to parse)
self._add_table_to_docx(doc, element)
elif element.name == 'p':
text = element.get_text(strip=True)
if text:
# Skip empty and whitespace-only paragraphs
if text.strip():
# Check for list items (some sites wrap list items in p tags)
parent = element.find_parent(['ul', 'ol'])
if parent:
is_ordered = parent.name == 'ol'
if is_ordered:
doc.add_paragraph(text, style='List Number')
else:
doc.add_paragraph(text, style='List Bullet')
else:
doc.add_paragraph(text)
# Check for list items (some sites wrap list items in p tags)
parent = element.find_parent(['ul', 'ol'])
if parent:
is_ordered = parent.name == 'ol'
if is_ordered:
self._add_paragraph_with_links(doc, element, style='List Number')
else:
self._add_paragraph_with_links(doc, element, style='List Bullet')
else:
self._add_paragraph_with_links(doc, element)
def crawl_page(self, url: str) -> dict | None:
"""
@@ -524,6 +607,8 @@ class BaseCrawler(ABC):
# Process images
images = self.process_images(content, url)
# Process links (relative → absolute)
self.process_links(content, url)
# Convert to Markdown (pass the title so the duplicate h1 tag can be removed)
markdown = self.content_to_markdown(content, title)
@@ -536,35 +621,6 @@ class BaseCrawler(ABC):
"images": images,
}
def save_single_page(self, page_data: dict):
"""
Save a single page as standalone md and docx files
Args:
page_data: page data dict
"""
title = page_data["title"]
safe_title = safe_filename(title)
# Save Markdown
md_path = os.path.join(self.output_dir, f"{safe_title}.md")
md_content = f"# {title}\n\n"
md_content += f"**原文链接**: {page_data['url']}\n\n"
md_content += page_data["markdown"]
with open(md_path, "w", encoding="utf-8") as f:
f.write(md_content)
# Save Word
docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
doc = _new_doc()
doc.add_heading(title, 0)
p = doc.add_paragraph()
p.add_run(f"原文链接: {page_data['url']}").italic = True
self.add_content_to_docx(doc, page_data["content"], self.output_dir, title)
doc.save(docx_path)
def save_combined_documents(self, all_pages: list[dict]):
"""
Combine all pages into a single md and docx file
@@ -718,15 +774,14 @@ class BaseCrawler(ABC):
if new_pages_for_doc:
# Append the new content
for page in new_pages_for_doc:
doc.add_heading(page["title"], level=1)
# Match the combined Markdown: each page title is a level-2 heading (##)
doc.add_heading(page["title"], level=2)
p = doc.add_paragraph()
p.add_run(f"原文链接: {page['url']}").italic = True
self.add_content_to_docx(doc, page["content"], self.output_dir, page["title"])
doc.add_page_break()
doc.save(docx_path)
print(f" 追加 {len(new_pages_for_doc)} 篇新内容到 Word 文档")
# 后处理:优化连续标题
post_process_docx_headings(docx_path)
else:
print(f" Word 文档无需更新: {docx_path}")
else:
@@ -752,7 +807,8 @@ class BaseCrawler(ABC):
doc.add_paragraph()  # blank line
for page in all_pages:
doc.add_heading(page["title"], level=1)
# Match the combined Markdown: each page title is a level-2 heading (##)
doc.add_heading(page["title"], level=2)
p = doc.add_paragraph()
p.add_run(f"原文链接: {page['url']}").italic = True
self.add_content_to_docx(doc, page["content"], self.output_dir, page["title"])
@@ -760,8 +816,6 @@ class BaseCrawler(ABC):
doc.save(docx_path)
print(f" 汇总 Word: {docx_path}")
# 后处理:优化连续标题
post_process_docx_headings(docx_path)
def run(self):
"""

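Net effect on the combined export (a sketch with illustrative titles, not actual crawler output): the document title sits at the Title level, each page title at Heading 2 to match the "##" used in the combined Markdown, and body headings are re-based at Heading 3.

from docx import Document

doc = Document()
doc.add_heading('Combined site export', 0)       # document title
doc.add_heading('Some page title', level=2)      # one per page, matches "##" in the combined Markdown
doc.add_heading('First body heading', level=3)   # body headings start at Heading 3
doc.add_heading('Deeper body heading', level=4)  # ...and stay gap-free
doc.save('combined.docx')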

@@ -136,6 +136,11 @@ class ProductCrawler(BaseCrawler):
h2.decompose()
break  # only remove the first match
# Compute the body-heading mapping: each page starts at Heading 3, compressed to consecutive levels (no gaps)
heading_elements = content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else []
level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)}
for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
# Skip nested elements
if element.find_parent(['table', 'li']):
@@ -162,16 +167,11 @@ class ProductCrawler(BaseCrawler):
elif element.name.startswith('h'):
text = element.get_text(strip=True)
if text and '零差云控' not in text:
# For headings in the page content: h1 becomes Heading 2, h2-h6 keep their original level
# The page title is already Heading 1, so h1 in the body should drop to Heading 2
# Unified body-heading mapping: each page starts at Heading 3, compressed to consecutive levels (no gaps)
original_level = int(element.name[1])
if original_level == 1:
# h1 in the page content becomes Heading 2
word_level = 2
print(f" 标题层级转换: h1 '{text}' → Heading 2")
else:
# h2-h6 keep their original level: h2→Heading 2, h3→Heading 3, ...
word_level = original_level
word_level = level_map.get(original_level, 3)
if word_level != original_level:
print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}")
doc.add_heading(text, level=min(word_level, 9))
elif element.name == 'table':
@@ -179,21 +179,15 @@ class ProductCrawler(BaseCrawler):
self._add_table_to_docx(doc, element)
elif element.name == 'li':
text = element.get_text(strip=True)
if text:
doc.add_paragraph(f"{text}")
self._add_paragraph_with_links(doc, element, prefix="")
elif element.name == 'p':
text = element.get_text(strip=True)
if text:
doc.add_paragraph(text)
self._add_paragraph_with_links(doc, element)
elif element.name == 'div':
# Handle special div content blocks
if element.get('class') and any('param' in c for c in element.get('class', [])):
text = element.get_text(strip=True)
if text:
doc.add_paragraph(text)
self._add_paragraph_with_links(doc, element)
def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
"""