Initial commit: 零差云控 official-site crawler project

This commit is contained in:
oy2020
2026-01-29 17:29:14 +08:00
commit 51b67b9e68
406 changed files with 14247 additions and 0 deletions


@@ -0,0 +1,152 @@
"""
产品页面爬虫
专门处理 eRob 机器人关节和 eCoder 编码器的产品详情页
"""
import os

from bs4 import BeautifulSoup, Tag
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches

from .base_crawler import BaseCrawler
from .utils import safe_filename


class ProductCrawler(BaseCrawler):
"""
产品页面爬虫
针对 eRob 和 eCoder 产品页面的特殊处理
"""

    def extract_content(self, soup: BeautifulSoup) -> Tag | None:
        """
        Extract the main content of a product page.
        Product pages have a fairly complex structure and need special handling.
        """
        # Try several known selectors, most specific first
        selectors = [
            ('div', 'eRob_page_right'),    # right-hand content on eRob pages
            ('div', 'eCoder_page_main'),   # main content on eCoder pages
            ('div', 'product_page_main'),  # generic product pages
            ('div', 'news_text_p'),        # news-style layout
        ]
        for tag, class_name in selectors:
            content = soup.find(tag, class_=class_name)
            if content:
                return content
        # If none matched, fall back to the selectors configured on the base crawler
        return super().extract_content(soup)

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the product page title.
        The title may live in different places depending on the page.
        """
        # Prefer the first <h1> that is not just the site name
        h1_tags = soup.find_all('h1')
        for h1 in h1_tags:
            text = h1.get_text(strip=True)
            # Skip headings that are only the site name
            if '零差云控' in text or '零误差' in text:
                continue
            if text:
                return text
        # Fall back to the last URL segment, e.g. ".../eRob110.html" -> "eRob110"
        return url.split('/')[-1].replace('.html', '')

    def add_content_to_docx(self, doc: Document, content: Tag, output_dir: str):
        """
        Append the product content to a Word document.
        Tables and other product-page elements get optimized handling.
        """
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                         'img', 'li', 'table', 'div']):
            # Skip elements nested inside tables or list items; they are
            # covered when the parent element is processed
            if element.find_parent(['table', 'li']):
                continue
            if element.name == 'img':
                src = element.get('src', '')
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src
                if os.path.exists(local_path):
                    try:
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                    except Exception as e:
                        print(f"  Failed to insert image into Word doc: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level + 1, 9))
            elif element.name == 'table':
                # Convert the HTML table into a Word table
                self._add_table_to_docx(doc, element)
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)
            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)
            elif element.name == 'div':
                # Handle special parameter blocks: divs whose class contains "param"
                if element.get('class') and any('param' in c for c in element.get('class', [])):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: Tag):
        """
        Add an HTML table to the Word document.

        Args:
            doc: the Document object
            table_element: the <table> element
        """
        rows = table_element.find_all('tr')
        if not rows:
            return
        # Determine the maximum column count across all rows
        max_cols = 0
        for row in rows:
            cells = row.find_all(['td', 'th'])
            max_cols = max(max_cols, len(cells))
        if max_cols == 0:
            return
        # Build the Word table
        try:
            word_table = doc.add_table(rows=len(rows), cols=max_cols)
            word_table.style = 'Table Grid'
            for i, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                for j, cell in enumerate(cells):
                    if j < max_cols:
                        text = cell.get_text(strip=True)
                        word_table.rows[i].cells[j].text = text
        except Exception as e:
            # If table creation fails, fall back to plain text rows
            print(f"  Table creation failed; falling back to text: {e}")
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_text = ' | '.join(cell.get_text(strip=True) for cell in cells)
                if row_text.strip():
                    doc.add_paragraph(row_text)
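
For orientation, a minimal usage sketch follows. It is illustrative only and not part of this commit: the import paths, the no-argument BaseCrawler constructor, and the target URL are assumptions, since base_crawler.py and utils.py are not shown in this diff.

# Minimal usage sketch -- illustrative only, not part of this commit.
# Assumes ProductCrawler (via BaseCrawler) can be constructed without
# arguments; the package paths and the URL below are hypothetical.
import requests
from bs4 import BeautifulSoup
from docx import Document

from crawlers.product_crawler import ProductCrawler  # hypothetical path
from crawlers.utils import safe_filename             # hypothetical path

url = "https://example.com/product/eRob110.html"     # hypothetical URL
soup = BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")

crawler = ProductCrawler()
title = crawler.extract_title(soup, url)
content = crawler.extract_content(soup)

if content is not None:
    doc = Document()
    doc.add_heading(title, level=1)
    crawler.add_content_to_docx(doc, content, output_dir="output")
    doc.save(f"{safe_filename(title)}.docx")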