更新爬虫方案文档，增加摘要提取模块以生成文档摘要；优化基础爬虫类的标题提取逻辑，支持多个选择器，调整内容处理逻辑以去除重复标题。

2026-01-31 16:34:13 +08:00
parent 3c625d1c3a
commit c707704d80
5 changed files with 355 additions and 31 deletions
--- a/zeroerr_crawler/product_crawler.py
+++ b/zeroerr_crawler/product_crawler.py
@@ -45,24 +45,97 @@ class ProductCrawler(BaseCrawler):
        提取产品页面标题
        产品页面标题可能在不同位置
        """
-        # 尝试从面包屑导航后的第一个 h1
-        h1_tags = soup.find_all('h1')
-        for h1 in h1_tags:
-            text = h1.get_text(strip=True)
-            # 跳过网站名称
-            if '零差云控' in text or '零误差' in text:
-                continue
-            if text:
-                return text
+        # 优先使用配置中的选择器（支持 h1, h2 等）
+        selector = self.config.get("title_selector", "h1")
+        index = self.config.get("title_index", 0)
        
-        # 从 URL 提取
-        return url.split('/')[-1].replace('.html', '')
+        # 支持多个选择器，用逗号分隔
+        selectors = [s.strip() for s in selector.split(',')]
+        
+        # 收集所有匹配的标签
+        all_tags = []
+        for sel in selectors:
+            # 对于简单的标签名（如 "h1", "h2"），直接查找
+            if sel in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
+                found_tags = soup.find_all(sel)
+                all_tags.extend(found_tags)
+            else:
+                # 对于其他选择器，尝试查找
+                found_tags = soup.find_all(sel)
+                all_tags.extend(found_tags)
+        
+        # 优先从配置的选择器提取
+        if all_tags and len(all_tags) > index:
+            title = all_tags[index].get_text(strip=True)
+            # 跳过网站名称
+            if title and '零差云控' not in title and '零误差' not in title:
+                return title
+        elif all_tags:
+            # 仅当 index 超出匹配标签数量时进入此分支：依次尝试所有匹配的标签
+            for tag in all_tags:
+                title = tag.get_text(strip=True)
+                # 跳过网站名称
+                if title and '零差云控' not in title and '零误差' not in title:
+                    return title
+        
+        # 尝试从页面 title 标签提取
+        title_tag = soup.find('title')
+        if title_tag:
+            title = title_tag.get_text(strip=True)
+            # 移除网站名称后缀（如 " - 零差云控"）
+            if ' - ' in title:
+                title = title.split(' - ')[0].strip()
+            if title and title.lower() not in ['about-us', 'contact-us', 'join-us']:
+                return title
+        
+        # 最后从 URL 提取
+        url_part = url.split('/')[-1].replace('.html', '')
+        # 将连字符替换为空格，并首字母大写
+        if '-' in url_part:
+            url_part = ' '.join(word.capitalize() for word in url_part.split('-'))
+        return url_part
    
-    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
+    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str, page_title: str = None):
        """
        将产品内容添加到 Word 文档
        针对产品页面的表格等进行优化处理
+        
+        Args:
+            doc: Document 对象
+            content: 内容区域
+            output_dir: 输出目录（用于解析图片路径）
+            page_title: 页面标题（如果提供，会跳过内容中与标题重复的h1/h2标签或包含标题的段落）
        """
+        # 如果提供了页面标题，创建内容副本并移除重复的标题元素
+        if page_title:
+            content = BeautifulSoup(str(content), 'html.parser')
+            
+            # 仅检查内容中的第一个h1：若其文本与标题完全相同则移除
+            first_h1 = content.find('h1')
+            if first_h1:
+                h1_text = first_h1.get_text(strip=True)
+                if h1_text == page_title:
+                    first_h1.decompose()
+            
+            # 仅检查内容中的第一个h2：若其文本与标题完全相同则移除
+            first_h2 = content.find('h2')
+            if first_h2:
+                h2_text = first_h2.get_text(strip=True)
+                if h2_text == page_title:
+                    first_h2.decompose()
+            
+            # 检查标题是否包含"型号："前缀，如果是，也移除内容中只包含产品名称的h2
+            # 例如：标题是"型号：eCoder11"，内容中有"eCoder11"的h2
+            if '型号：' in page_title or '型号:' in page_title:
+                product_name = page_title.replace('型号：', '').replace('型号:', '').strip()
+                if product_name:
+                    # 查找第一个只包含产品名称的h2
+                    for h2 in content.find_all('h2'):
+                        h2_text = h2.get_text(strip=True)
+                        if h2_text == product_name:
+                            h2.decompose()
+                            break  # 只移除第一个匹配的
+        
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
            # 跳过嵌套元素
            if element.find_parent(['table', 'li']):