# crawl4zeroerr/zeroerr_crawler/product_crawler.py
"""
产品页面爬虫
专门处理 eRob 机器人关节和 eCoder 编码器的产品详情页
"""
import os

from bs4 import BeautifulSoup
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches

from .base_crawler import BaseCrawler
from .utils import safe_filename


class ProductCrawler(BaseCrawler):
    """
    Product page crawler.

    Special handling for eRob and eCoder product pages.
    """
    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main content of a product page.

        Product pages have a fairly complex structure and need special handling.
        """
        # Try several selectors in turn
        selectors = [
            ('div', 'eRob_page_right'),    # right-hand content on eRob pages
            ('div', 'eCoder_page_main'),   # main content on eCoder pages
            ('div', 'product_page_main'),  # generic product pages
            ('div', 'news_text_p'),        # news-style layout
        ]
        for tag, class_name in selectors:
            content = soup.find(tag, class_=class_name)
            if content:
                return content
        # Nothing matched; fall back to the selector from the config
        return super().extract_content(soup)

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the product page title.

        The title can live in different places depending on the page.
        """
        # Prefer the selector from the config (supports h1, h2, etc.)
        selector = self.config.get("title_selector", "h1")
        index = self.config.get("title_index", 0)
        # Multiple selectors may be given, separated by commas
        selectors = [s.strip() for s in selector.split(',')]
        # Collect every matching tag
        all_tags = []
        for sel in selectors:
            if sel in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
                # Plain tag names (e.g. "h1", "h2") can be looked up directly
                found_tags = soup.find_all(sel)
            else:
                # Anything else is treated as a CSS selector
                found_tags = soup.select(sel)
            all_tags.extend(found_tags)
        if all_tags:
            # Try the configured index first
            if len(all_tags) > index:
                title = all_tags[index].get_text(strip=True)
                # Skip the site name
                if title and '零差云控' not in title and '零误差' not in title:
                    return title
            # The indexed tag was missing or skipped; try the others
            for tag in all_tags:
                title = tag.get_text(strip=True)
                # Skip the site name
                if title and '零差云控' not in title and '零误差' not in title:
                    return title
        # Try the page's <title> tag
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            # Strip the site-name suffix (e.g. " - 零差云控")
            if ' - ' in title:
                title = title.split(' - ')[0].strip()
            if title and title.lower() not in ['about-us', 'contact-us', 'join-us']:
                return title
        # Finally, fall back to the URL
        url_part = url.split('/')[-1].replace('.html', '')
        # Replace hyphens with spaces and capitalize each word
        if '-' in url_part:
            url_part = ' '.join(word.capitalize() for word in url_part.split('-'))
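            # e.g. a hypothetical slug "erob-70h" becomes "Erob 70h"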
        return url_part

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str, page_title: str = None):
        """
        Add product content to the Word document.

        Tables and other product-page structures get special treatment.

        Args:
            doc: Document object
            content: the content region
            output_dir: output directory (used to resolve image paths)
            page_title: page title; when given, h1/h2 tags (or paragraphs)
                that duplicate the title are skipped
        """
        # When a page title is given, work on a copy of the content and
        # remove elements that duplicate the title
        if page_title:
            content = BeautifulSoup(str(content), 'html.parser')
            # Remove the first h1 that exactly matches the title
            first_h1 = content.find('h1')
            if first_h1:
                h1_text = first_h1.get_text(strip=True)
                if h1_text == page_title:
                    first_h1.decompose()
            # Remove the first h2 that exactly matches the title
            first_h2 = content.find('h2')
            if first_h2:
                h2_text = first_h2.get_text(strip=True)
                if h2_text == page_title:
                    first_h2.decompose()
            # If the title carries a "型号:" (model number) prefix, in either
            # full-width or ASCII colon form, also remove an h2 holding just
            # the bare product name.
            # e.g. title "型号:eCoder11" with an <h2>eCoder11</h2> in the body
            if '型号:' in page_title or '型号:' in page_title:
                product_name = page_title.replace('型号:', '').replace('型号:', '').strip()
                if product_name:
                    # Find the first h2 that contains only the product name
                    for h2 in content.find_all('h2'):
                        h2_text = h2.get_text(strip=True)
                        if h2_text == product_name:
                            h2.decompose()
                            break  # only remove the first match
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
            # Skip elements nested inside tables or list items
            if element.find_parent(['table', 'li']):
                continue
            if element.name == 'img':
                src = element.get('src', '')
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src
                if os.path.exists(local_path):
                    try:
                        # Blank line before the image
                        doc.add_paragraph()
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                        # Blank line after the image
                        doc.add_paragraph()
                    except Exception as e:
                        print(f"  Failed to insert image into Word: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    # For headings inside the content: h1 becomes Heading 2,
                    # h2-h6 keep their level. The page title is already
                    # Heading 1, so an h1 in the body must be demoted.
                    original_level = int(element.name[1])
                    if original_level == 1:
                        word_level = 2
                        print(f"  Heading level conversion: h1 '{text}' → Heading 2")
                    else:
                        # h2-h6 keep their level: h2→Heading 2, h3→Heading 3, ...
                        word_level = original_level
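                    # Word's built-in heading styles only go up to "Heading 9",
                    # hence the cap below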
                    doc.add_heading(text, level=min(word_level, 9))
            elif element.name == 'table':
                # Handle tables
                self._add_table_to_docx(doc, element)
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(f"• {text}")
            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)
            elif element.name == 'div':
                # Handle special div content blocks (parameter blocks)
                if element.get('class') and any('param' in c for c in element.get('class', [])):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
        """
        Add an HTML table to the Word document.

        Args:
            doc: Document object
            table_element: the table element
        """
        rows = table_element.find_all('tr')
        if not rows:
            return
        # Find the maximum column count
        max_cols = 0
        for row in rows:
            cells = row.find_all(['td', 'th'])
            max_cols = max(max_cols, len(cells))
        if max_cols == 0:
            return
        # Build the Word table
        try:
            word_table = doc.add_table(rows=len(rows), cols=max_cols)
            word_table.style = 'Table Grid'
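            # Note: colspan/rowspan are not expanded, so merged cells shift
            # left within their row instead of keeping the original grid.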
            for i, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                for j, cell in enumerate(cells):
                    if j < max_cols:
                        text = cell.get_text(strip=True)
                        word_table.rows[i].cells[j].text = text
        except Exception as e:
            # If table creation fails, fall back to plain text
            print(f"  Table creation failed, falling back to text: {e}")
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                if row_text.strip():
                    doc.add_paragraph(row_text)
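

# Minimal usage sketch (hypothetical: assumes BaseCrawler accepts a config
# dict and that the page HTML has already been fetched):
#
#   from zeroerr_crawler.product_crawler import ProductCrawler
#
#   crawler = ProductCrawler(config={
#       "title_selector": "h1, h2",  # comma-separated selectors tried in order
#       "title_index": 0,
#   })
#   soup = BeautifulSoup(html, "html.parser")
#   content = crawler.extract_content(soup)
#   title = crawler.extract_title(soup, url)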