新增标题层级处理规则：1. 新增主页链接；2. 新增docx后处理，合并同一层级的标题；3. 优化层级，h1不重复

2026-02-09 18:53:32 +08:00
parent c707704d80
commit dbe9ba3629
5 changed files with 224 additions and 19 deletions
--- a/zeroerr_crawler/base_crawler.py
+++ b/zeroerr_crawler/base_crawler.py
@@ -19,6 +19,7 @@ from abc import ABC, abstractmethod
 from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR
 from .utils import ensure_dir, download_image, safe_filename, make_absolute_url
 from .extract_abstract import generate_abstract
+from .post_process import post_process_docx_headings


 class BaseCrawler(ABC):
@@ -354,11 +355,11 @@ class BaseCrawler(ABC):
        Returns:
            Markdown 文本
        """
+        # 创建内容的副本，避免修改原始内容
+        content_copy = BeautifulSoup(str(content), 'html.parser')
+        
        # 如果提供了页面标题，检查并移除内容中与标题重复的标签
        if page_title:
-            # 创建内容的副本，避免修改原始内容
-            content_copy = BeautifulSoup(str(content), 'html.parser')
-            
            # 移除与标题完全相同的第一个h1
            first_h1 = content_copy.find('h1')
            if first_h1:
@@ -384,9 +385,13 @@ class BaseCrawler(ABC):
                        if h2_text == product_name:
                            h2.decompose()
                            break  # 只移除第一个匹配的
-            
-            return markdownify.markdownify(str(content_copy), heading_style="ATX")
-        return markdownify.markdownify(str(content), heading_style="ATX")
+        
+        # 页面内容中的 h1 降级为 h2（与 Word 文档处理一致）
+        # 因为页面标题已经是二级标题（##），所以内容中的 h1 应该降级为二级标题
+        for h1 in content_copy.find_all('h1'):
+            h1.name = 'h2'
+        
+        return markdownify.markdownify(str(content_copy), heading_style="ATX")
    
    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str, page_title: str = None):
        """
@@ -431,10 +436,17 @@ class BaseCrawler(ABC):
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text:
-                    # HTML h1-h6 直接映射到 Word Heading 1-6
-                    # 限制在 1-9 范围内（Word 支持的最大标题级别）
-                    level = int(element.name[1])
-                    doc.add_heading(text, level=min(level, 9))
+                    # 对于页面内容中的标题，h1 转换为 Heading 2，h2-h6 保持原层级
+                    # 因为页面标题已经是 Heading 1，所以内容中的 h1 应该降级为 Heading 2
+                    original_level = int(element.name[1])
+                    if original_level == 1:
+                        # 页面内容中的 h1 转换为 Heading 2
+                        word_level = 2
+                        print(f"    标题层级转换: h1 '{text}' → Heading 2")
+                    else:
+                        # h2-h6 保持原层级（h2→Heading 2, h3→Heading 3, ...）
+                        word_level = original_level
+                    doc.add_heading(text, level=min(word_level, 9))
                
            elif element.name in ['ul', 'ol']:
                # 列表容器，跳过（列表项会单独处理）
@@ -613,16 +625,21 @@ class BaseCrawler(ABC):
        # 合并所有页面（已存在的 + 新添加的），用于生成摘要
        all_pages_for_abstract = existing_pages + all_pages
        
+        # 获取索引页URL（如果存在）
+        index_url_full = None
+        if "index_url" in self.config:
+            index_url_full = make_absolute_url(BASE_URL, self.config["index_url"])
+        
        # 生成摘要（新建文档时生成，追加新内容时也重新生成，确保包含所有URL）
        abstract = None
        if not existing_content:
            # 新建文档：使用当前爬取的页面生成摘要
            print(f"  正在生成文档摘要...")
-            abstract = generate_abstract(all_pages, output_dir_name)
+            abstract = generate_abstract(all_pages, output_dir_name, index_url_full)
        else:
            # 追加模式：重新生成摘要，包含所有页面（已存在的 + 新添加的）
            print(f"  正在重新生成文档摘要（包含所有 {len(all_pages_for_abstract)} 篇）...")
-            abstract = generate_abstract(all_pages_for_abstract, output_dir_name)
+            abstract = generate_abstract(all_pages_for_abstract, output_dir_name, index_url_full)
        
        # 追加或创建文件
        if existing_content:
@@ -697,6 +714,8 @@ class BaseCrawler(ABC):
                    doc.add_page_break()
                doc.save(docx_path)
                print(f"  追加 {len(new_pages_for_doc)} 篇新内容到 Word 文档")
+                # 后处理：优化连续标题
+                post_process_docx_headings(docx_path)
            else:
                print(f"  Word 文档无需更新: {docx_path}")
        else:
@@ -730,6 +749,8 @@ class BaseCrawler(ABC):
            
            doc.save(docx_path)
            print(f"  汇总 Word: {docx_path}")
+            # 后处理：优化连续标题
+            post_process_docx_headings(docx_path)
    
    def run(self):
        """