""" 产品页面爬虫 专门处理 eRob 机器人关节和 eCoder 编码器的产品详情页 """ from bs4 import BeautifulSoup from docx import Document from docx.shared import Inches from docx.enum.text import WD_ALIGN_PARAGRAPH import os from .base_crawler import BaseCrawler from .utils import safe_filename class ProductCrawler(BaseCrawler): """ 产品页面爬虫 针对 eRob 和 eCoder 产品页面的特殊处理 """ def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None: """ 提取产品页面主内容 产品页面结构较复杂,需要特殊处理 """ # 尝试多种选择器 selectors = [ ('div', 'eRob_page_right'), # eRob 页面右侧内容 ('div', 'eCoder_page_main'), # eCoder 页面主内容 ('div', 'product_page_main'), # 通用产品页面 ('div', 'news_text_p'), # 新闻类布局 ] for tag, class_name in selectors: content = soup.find(tag, class_=class_name) if content: return content # 如果都没找到,尝试从配置的选择器 return super().extract_content(soup) def extract_title(self, soup: BeautifulSoup, url: str) -> str: """ 提取产品页面标题 产品页面标题可能在不同位置 """ # 尝试从面包屑导航后的第一个 h1 h1_tags = soup.find_all('h1') for h1 in h1_tags: text = h1.get_text(strip=True) # 跳过网站名称 if '零差云控' in text or '零误差' in text: continue if text: return text # 从 URL 提取 return url.split('/')[-1].replace('.html', '') def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str): """ 将产品内容添加到 Word 文档 针对产品页面的表格等进行优化处理 """ for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']): # 跳过嵌套元素 if element.find_parent(['table', 'li']): continue if element.name == 'img': src = element.get('src', '') if not src.startswith('http'): local_path = os.path.join(output_dir, src) else: local_path = src if os.path.exists(local_path): try: # 图片前添加空行 doc.add_paragraph() doc.add_picture(local_path, width=Inches(4.5)) doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER # 图片后添加空行 doc.add_paragraph() except Exception as e: print(f" Word插入图片失败: {local_path} - {e}") elif element.name.startswith('h'): text = element.get_text(strip=True) if text and '零差云控' not in text: # HTML h1-h6 直接映射到 Word Heading 1-6 # 限制在 1-9 范围内(Word 支持的最大标题级别) level = int(element.name[1]) doc.add_heading(text, level=min(level, 9)) elif element.name == 'table': # 处理表格 self._add_table_to_docx(doc, element) elif element.name == 'li': text = element.get_text(strip=True) if text: doc.add_paragraph(f"• {text}") elif element.name == 'p': text = element.get_text(strip=True) if text: doc.add_paragraph(text) elif element.name == 'div': # 处理特殊的 div 内容块 if element.get('class') and any('param' in c for c in element.get('class', [])): text = element.get_text(strip=True) if text: doc.add_paragraph(text) def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup): """ 将 HTML 表格添加到 Word 文档 Args: doc: Document 对象 table_element: 表格元素 """ rows = table_element.find_all('tr') if not rows: return # 获取最大列数 max_cols = 0 for row in rows: cells = row.find_all(['td', 'th']) max_cols = max(max_cols, len(cells)) if max_cols == 0: return # 创建 Word 表格 try: word_table = doc.add_table(rows=len(rows), cols=max_cols) word_table.style = 'Table Grid' for i, row in enumerate(rows): cells = row.find_all(['td', 'th']) for j, cell in enumerate(cells): if j < max_cols: text = cell.get_text(strip=True) word_table.rows[i].cells[j].text = text except Exception as e: # 如果表格创建失败,降级为文本 print(f" 表格创建失败,降级为文本: {e}") for row in rows: cells = row.find_all(['td', 'th']) row_text = ' | '.join([cell.get_text(strip=True) for cell in cells]) if row_text.strip(): doc.add_paragraph(row_text)