优化文档导出层级与链接保真,统一正文标题映射并增强 Word 段落超链接处理。
同时移除不再使用的文档后处理依赖,减少汇总导出流程中的冗余步骤。 Made-with: Cursor
This commit is contained in:
@@ -8,19 +8,20 @@ import time
|
||||
import copy
|
||||
import re
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
import markdownify
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
from docx.opc.constants import RELATIONSHIP_TYPE as RT
|
||||
from urllib.parse import urljoin
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR
|
||||
from .utils import ensure_dir, download_image, safe_filename, make_absolute_url
|
||||
from .extract_abstract import generate_abstract
|
||||
from .post_process import post_process_docx_headings
|
||||
|
||||
|
||||
def _new_doc() -> Document:
|
||||
@@ -354,6 +355,88 @@ class BaseCrawler(ABC):
|
||||
img['src'] = full_url
|
||||
|
||||
return images_info
|
||||
|
||||
def process_links(self, content: BeautifulSoup, page_url: str):
    """Rewrite relative hrefs in *content* to absolute URLs, in place.

    Args:
        content: parsed content region whose <a> tags are mutated.
        page_url: URL of the page, used as the base for resolution.
    """
    # Link kinds that must be left untouched (non-HTTP or in-page).
    skip_prefixes = ('mailto:', 'tel:', '#', 'javascript:')
    for anchor in content.find_all('a', href=True):
        target = anchor.get('href', '').strip()
        if not target or target.startswith(skip_prefixes):
            continue
        anchor['href'] = make_absolute_url(page_url, target)
|
||||
|
||||
def _add_hyperlink(self, paragraph, text: str, url: str):
    """Append a clickable external hyperlink run to *paragraph*.

    Builds the ``w:hyperlink`` element manually because python-docx has
    no public paragraph-level hyperlink API.

    Args:
        paragraph: python-docx Paragraph to append to.
        text: visible link text; the call is a no-op when empty.
        url: external link target; the call is a no-op when empty.
    """
    if not text or not url:
        return
    # Register the external target on the document part; the XML only
    # stores the relationship id, not the URL itself.
    part = paragraph.part
    r_id = part.relate_to(url, RT.HYPERLINK, is_external=True)
    hyperlink = OxmlElement('w:hyperlink')
    hyperlink.set(qn('r:id'), r_id)

    run = OxmlElement('w:r')
    r_pr = OxmlElement('w:rPr')
    r_style = OxmlElement('w:rStyle')
    r_style.set(qn('w:val'), 'Hyperlink')  # built-in character style
    r_pr.append(r_style)
    run.append(r_pr)

    text_element = OxmlElement('w:t')
    # Fix: without xml:space="preserve", Word collapses significant
    # leading/trailing whitespace in the link text on load.
    text_element.set(qn('xml:space'), 'preserve')
    text_element.text = text
    run.append(text_element)
    hyperlink.append(run)
    # NOTE: relies on the private _p element (no public alternative).
    paragraph._p.append(hyperlink)
|
||||
|
||||
def _append_inline_nodes_to_paragraph(self, paragraph, node):
    """Recursively write inline HTML content into a Word paragraph,
    rendering <a> tags as clickable hyperlinks.

    Args:
        paragraph: python-docx Paragraph receiving the runs.
        node: a BeautifulSoup NavigableString or Tag to serialize.
    """
    # Text node: collapse runs of whitespace and emit as a plain run.
    if isinstance(node, NavigableString):
        collapsed = re.sub(r'\s+', ' ', str(node))
        if collapsed:
            paragraph.add_run(collapsed)
        return
    if not isinstance(node, Tag):
        return  # comments, doctypes, etc.
    if node.name == 'br':
        paragraph.add_run('\n')
        return
    if node.name == 'a':
        link_text = node.get_text(' ', strip=True)
        href = (node.get('href') or '').strip()
        if link_text:
            # Show the URL after the text so it survives plain printing.
            display = f"{link_text} ({href})" if href else link_text
            if paragraph.text.strip():
                paragraph.add_run(" ")  # separator from preceding text
            if href:
                self._add_hyperlink(paragraph, display, href)
            else:
                paragraph.add_run(display)
        return
    # Any other tag: descend into its children.
    for child in node.children:
        self._append_inline_nodes_to_paragraph(paragraph, child)
|
||||
|
||||
def _add_paragraph_with_links(self, doc: Document, element: Tag, style: str | None = None, prefix: str = ""):
    """Add *element*'s text as one paragraph, keeping hyperlinks clickable.

    Args:
        doc: target python-docx Document.
        element: source HTML element whose text is emitted.
        style: optional paragraph style name (e.g. 'List Bullet').
        prefix: literal text prepended before the content.
    """
    if not element.get_text(strip=True):
        return  # nothing visible to emit
    paragraph = doc.add_paragraph(style=style) if style else doc.add_paragraph()
    if prefix:
        paragraph.add_run(prefix)
    if element.find('a') is not None:
        # Walk the children so <a> tags become real hyperlinks.
        for child in element.children:
            self._append_inline_nodes_to_paragraph(paragraph, child)
    else:
        paragraph.add_run(element.get_text(strip=True))
|
||||
|
||||
def content_to_markdown(self, content: BeautifulSoup, page_title: str = None) -> str:
|
||||
"""
|
||||
@@ -397,10 +480,18 @@ class BaseCrawler(ABC):
|
||||
h2.decompose()
|
||||
break # 只移除第一个匹配的
|
||||
|
||||
# 页面内容中的 h1 降级为 h2(与 Word 文档处理一致)
|
||||
# 因为页面标题已经是二级标题(##),所以内容中的 h1 应该降级为二级标题
|
||||
for h1 in content_copy.find_all('h1'):
|
||||
h1.name = 'h2'
|
||||
# 正文标题统一映射:每页正文从 h3 起步,并压缩为连续层级(不跳级)
|
||||
# 例如:
|
||||
# - 若正文有 h2/h4:映射为 h3/h4(而不是 h3/h5)
|
||||
# - 若正文有 h1/h3/h6:映射为 h3/h4/h5
|
||||
body_headings = content_copy.find_all(re.compile(r'^h[1-6]$'))
|
||||
if body_headings:
|
||||
unique_levels = sorted({int(h.name[1]) for h in body_headings})
|
||||
level_map = {level: min(i + 3, 6) for i, level in enumerate(unique_levels)}
|
||||
for heading in body_headings:
|
||||
original_level = int(heading.name[1])
|
||||
new_level = level_map[original_level]
|
||||
heading.name = f'h{new_level}'
|
||||
|
||||
return markdownify.markdownify(str(content_copy), heading_style="ATX")
|
||||
|
||||
@@ -423,6 +514,11 @@ class BaseCrawler(ABC):
|
||||
if h1_text == page_title:
|
||||
first_h1.decompose() # 移除该标签
|
||||
|
||||
# 计算正文标题映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
heading_elements = content.find_all(re.compile(r'^h[1-6]$'))
|
||||
unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else []
|
||||
level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)}
|
||||
|
||||
# 按文档顺序处理元素,保持列表的连续性
|
||||
for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'ul', 'ol', 'table']):
|
||||
if element.name == 'img':
|
||||
@@ -447,16 +543,11 @@ class BaseCrawler(ABC):
|
||||
elif element.name.startswith('h'):
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
# 对于页面内容中的标题,h1 转换为 Heading 2,h2-h6 保持原层级
|
||||
# 因为页面标题已经是 Heading 1,所以内容中的 h1 应该降级为 Heading 2
|
||||
# 正文标题统一映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
original_level = int(element.name[1])
|
||||
if original_level == 1:
|
||||
# 页面内容中的 h1 转换为 Heading 2
|
||||
word_level = 2
|
||||
print(f" 标题层级转换: h1 '{text}' → Heading 2")
|
||||
else:
|
||||
# h2-h6 保持原层级(h2→Heading 2, h3→Heading 3, ...)
|
||||
word_level = original_level
|
||||
word_level = level_map.get(original_level, 3)
|
||||
if word_level != original_level:
|
||||
print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}")
|
||||
doc.add_heading(text, level=min(word_level, 9))
|
||||
|
||||
elif element.name in ['ul', 'ol']:
|
||||
@@ -464,37 +555,29 @@ class BaseCrawler(ABC):
|
||||
continue
|
||||
|
||||
elif element.name == 'li':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
# 检查父元素是 ul 还是 ol
|
||||
parent = element.find_parent(['ul', 'ol'])
|
||||
is_ordered = parent and parent.name == 'ol'
|
||||
|
||||
# 使用列表样式
|
||||
if is_ordered:
|
||||
doc.add_paragraph(text, style='List Number')
|
||||
else:
|
||||
doc.add_paragraph(text, style='List Bullet')
|
||||
# 检查父元素是 ul 还是 ol
|
||||
parent = element.find_parent(['ul', 'ol'])
|
||||
is_ordered = parent and parent.name == 'ol'
|
||||
if is_ordered:
|
||||
self._add_paragraph_with_links(doc, element, style='List Number')
|
||||
else:
|
||||
self._add_paragraph_with_links(doc, element, style='List Bullet')
|
||||
|
||||
elif element.name == 'table':
|
||||
# 处理表格,创建 Word 表格结构(便于 doc2md.py 解析)
|
||||
self._add_table_to_docx(doc, element)
|
||||
|
||||
elif element.name == 'p':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
# 跳过空段落和只包含空白字符的段落
|
||||
if text.strip():
|
||||
# 检查是否是列表项(某些网站用 p 标签包裹列表项)
|
||||
parent = element.find_parent(['ul', 'ol'])
|
||||
if parent:
|
||||
is_ordered = parent.name == 'ol'
|
||||
if is_ordered:
|
||||
doc.add_paragraph(text, style='List Number')
|
||||
else:
|
||||
doc.add_paragraph(text, style='List Bullet')
|
||||
else:
|
||||
doc.add_paragraph(text)
|
||||
# 检查是否是列表项(某些网站用 p 标签包裹列表项)
|
||||
parent = element.find_parent(['ul', 'ol'])
|
||||
if parent:
|
||||
is_ordered = parent.name == 'ol'
|
||||
if is_ordered:
|
||||
self._add_paragraph_with_links(doc, element, style='List Number')
|
||||
else:
|
||||
self._add_paragraph_with_links(doc, element, style='List Bullet')
|
||||
else:
|
||||
self._add_paragraph_with_links(doc, element)
|
||||
|
||||
def crawl_page(self, url: str) -> dict | None:
|
||||
"""
|
||||
@@ -524,6 +607,8 @@ class BaseCrawler(ABC):
|
||||
|
||||
# 处理图片
|
||||
images = self.process_images(content, url)
|
||||
# 处理链接(相对链接转绝对链接)
|
||||
self.process_links(content, url)
|
||||
|
||||
# 转换为 Markdown(传入标题,用于去除重复的h1标签)
|
||||
markdown = self.content_to_markdown(content, title)
|
||||
@@ -536,35 +621,6 @@ class BaseCrawler(ABC):
|
||||
"images": images,
|
||||
}
|
||||
|
||||
def save_single_page(self, page_data: dict):
    """Persist one crawled page as standalone .md and .docx files.

    Args:
        page_data: page dict with "title", "url", "markdown" and
            "content" keys, as produced by crawl_page().
    """
    title = page_data["title"]
    safe_title = safe_filename(title)

    # Markdown export: title heading, source link, then the body.
    md_path = os.path.join(self.output_dir, f"{safe_title}.md")
    md_parts = [
        f"# {title}\n\n",
        f"**原文链接**: {page_data['url']}\n\n",
        page_data["markdown"],
    ]
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("".join(md_parts))

    # Word export: title, italic source link, then converted content.
    docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
    doc = _new_doc()
    doc.add_heading(title, 0)
    link_para = doc.add_paragraph()
    link_para.add_run(f"原文链接: {page_data['url']}").italic = True

    self.add_content_to_docx(doc, page_data["content"], self.output_dir, title)
    doc.save(docx_path)
|
||||
|
||||
def save_combined_documents(self, all_pages: list[dict]):
|
||||
"""
|
||||
将所有页面汇总保存为一个 md 和 docx 文件
|
||||
@@ -718,15 +774,14 @@ class BaseCrawler(ABC):
|
||||
if new_pages_for_doc:
|
||||
# 添加新内容
|
||||
for page in new_pages_for_doc:
|
||||
doc.add_heading(page["title"], level=1)
|
||||
# 与汇总 Markdown 保持一致:每页标题使用二级标题(##)
|
||||
doc.add_heading(page["title"], level=2)
|
||||
p = doc.add_paragraph()
|
||||
p.add_run(f"原文链接: {page['url']}").italic = True
|
||||
self.add_content_to_docx(doc, page["content"], self.output_dir, page["title"])
|
||||
doc.add_page_break()
|
||||
doc.save(docx_path)
|
||||
print(f" 追加 {len(new_pages_for_doc)} 篇新内容到 Word 文档")
|
||||
# 后处理:优化连续标题
|
||||
post_process_docx_headings(docx_path)
|
||||
else:
|
||||
print(f" Word 文档无需更新: {docx_path}")
|
||||
else:
|
||||
@@ -752,7 +807,8 @@ class BaseCrawler(ABC):
|
||||
doc.add_paragraph() # 空行
|
||||
|
||||
for page in all_pages:
|
||||
doc.add_heading(page["title"], level=1)
|
||||
# 与汇总 Markdown 保持一致:每页标题使用二级标题(##)
|
||||
doc.add_heading(page["title"], level=2)
|
||||
p = doc.add_paragraph()
|
||||
p.add_run(f"原文链接: {page['url']}").italic = True
|
||||
self.add_content_to_docx(doc, page["content"], self.output_dir, page["title"])
|
||||
@@ -760,8 +816,6 @@ class BaseCrawler(ABC):
|
||||
|
||||
doc.save(docx_path)
|
||||
print(f" 汇总 Word: {docx_path}")
|
||||
# 后处理:优化连续标题
|
||||
post_process_docx_headings(docx_path)
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
|
||||
@@ -136,6 +136,11 @@ class ProductCrawler(BaseCrawler):
|
||||
h2.decompose()
|
||||
break # 只移除第一个匹配的
|
||||
|
||||
# 计算正文标题映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
heading_elements = content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
|
||||
unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else []
|
||||
level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)}
|
||||
|
||||
for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
|
||||
# 跳过嵌套元素
|
||||
if element.find_parent(['table', 'li']):
|
||||
@@ -162,16 +167,11 @@ class ProductCrawler(BaseCrawler):
|
||||
elif element.name.startswith('h'):
|
||||
text = element.get_text(strip=True)
|
||||
if text and '零差云控' not in text:
|
||||
# 对于页面内容中的标题,h1 转换为 Heading 2,h2-h6 保持原层级
|
||||
# 因为页面标题已经是 Heading 1,所以内容中的 h1 应该降级为 Heading 2
|
||||
# 正文标题统一映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
original_level = int(element.name[1])
|
||||
if original_level == 1:
|
||||
# 页面内容中的 h1 转换为 Heading 2
|
||||
word_level = 2
|
||||
print(f" 标题层级转换: h1 '{text}' → Heading 2")
|
||||
else:
|
||||
# h2-h6 保持原层级(h2→Heading 2, h3→Heading 3, ...)
|
||||
word_level = original_level
|
||||
word_level = level_map.get(original_level, 3)
|
||||
if word_level != original_level:
|
||||
print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}")
|
||||
doc.add_heading(text, level=min(word_level, 9))
|
||||
|
||||
elif element.name == 'table':
|
||||
@@ -179,21 +179,15 @@ class ProductCrawler(BaseCrawler):
|
||||
self._add_table_to_docx(doc, element)
|
||||
|
||||
elif element.name == 'li':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(f"• {text}")
|
||||
self._add_paragraph_with_links(doc, element, prefix="• ")
|
||||
|
||||
elif element.name == 'p':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(text)
|
||||
self._add_paragraph_with_links(doc, element)
|
||||
|
||||
elif element.name == 'div':
|
||||
# 处理特殊的 div 内容块
|
||||
if element.get('class') and any('param' in c for c in element.get('class', [])):
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(text)
|
||||
self._add_paragraph_with_links(doc, element)
|
||||
|
||||
def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user