""" 产品页面爬虫 专门处理 eRob 机器人关节和 eCoder 编码器的产品详情页 """ from bs4 import BeautifulSoup from docx import Document from docx.shared import Inches from docx.enum.text import WD_ALIGN_PARAGRAPH import os from .base_crawler import BaseCrawler from .utils import safe_filename class ProductCrawler(BaseCrawler): """ 产品页面爬虫 针对 eRob 和 eCoder 产品页面的特殊处理 """ def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None: """ 提取产品页面主内容 产品页面结构较复杂,需要特殊处理 """ # 尝试多种选择器 selectors = [ ('div', 'eRob_page_right'), # eRob 页面右侧内容 ('div', 'eCoder_page_main'), # eCoder 页面主内容 ('div', 'product_page_main'), # 通用产品页面 ('div', 'news_text_p'), # 新闻类布局 ] for tag, class_name in selectors: content = soup.find(tag, class_=class_name) if content: return content # 如果都没找到,尝试从配置的选择器 return super().extract_content(soup) def extract_title(self, soup: BeautifulSoup, url: str) -> str: """ 提取产品页面标题 产品页面标题可能在不同位置 """ # 优先使用配置中的选择器(支持 h1, h2 等) selector = self.config.get("title_selector", "h1") index = self.config.get("title_index", 0) # 支持多个选择器,用逗号分隔 selectors = [s.strip() for s in selector.split(',')] # 收集所有匹配的标签 all_tags = [] for sel in selectors: # 对于简单的标签名(如 "h1", "h2"),直接查找 if sel in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']: found_tags = soup.find_all(sel) all_tags.extend(found_tags) else: # 对于其他选择器,尝试查找 found_tags = soup.find_all(sel) all_tags.extend(found_tags) # 优先从配置的选择器提取 if all_tags and len(all_tags) > index: title = all_tags[index].get_text(strip=True) # 跳过网站名称 if title and '零差云控' not in title and '零误差' not in title: return title elif all_tags: # 如果指定索引的标签被跳过,尝试其他标签 for tag in all_tags: title = tag.get_text(strip=True) # 跳过网站名称 if title and '零差云控' not in title and '零误差' not in title: return title # 尝试从页面 title 标签提取 title_tag = soup.find('title') if title_tag: title = title_tag.get_text(strip=True) # 移除网站名称后缀(如 " - 零差云控") if ' - ' in title: title = title.split(' - ')[0].strip() if title and title.lower() not in ['about-us', 'contact-us', 'join-us']: return title # 最后从 URL 提取 url_part = url.split('/')[-1].replace('.html', '') # 将连字符替换为空格,并首字母大写 if '-' in url_part: url_part = ' '.join(word.capitalize() for word in url_part.split('-')) return url_part def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str, page_title: str = None): """ 将产品内容添加到 Word 文档 针对产品页面的表格等进行优化处理 Args: doc: Document 对象 content: 内容区域 output_dir: 输出目录(用于解析图片路径) page_title: 页面标题(如果提供,会跳过内容中与标题重复的h1/h2标签或包含标题的段落) """ # 如果提供了页面标题,创建内容副本并移除重复的标题元素 if page_title: content = BeautifulSoup(str(content), 'html.parser') # 移除与标题完全相同的第一个h1 first_h1 = content.find('h1') if first_h1: h1_text = first_h1.get_text(strip=True) if h1_text == page_title: first_h1.decompose() # 移除与标题完全相同的第一个h2 first_h2 = content.find('h2') if first_h2: h2_text = first_h2.get_text(strip=True) if h2_text == page_title: first_h2.decompose() # 检查标题是否包含"型号:"前缀,如果是,也移除内容中只包含产品名称的h2 # 例如:标题是"型号:eCoder11",内容中有"eCoder11"的h2 if '型号:' in page_title or '型号:' in page_title: product_name = page_title.replace('型号:', '').replace('型号:', '').strip() if product_name: # 查找第一个只包含产品名称的h2 for h2 in content.find_all('h2'): h2_text = h2.get_text(strip=True) if h2_text == product_name: h2.decompose() break # 只移除第一个匹配的 for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']): # 跳过嵌套元素 if element.find_parent(['table', 'li']): continue if element.name == 'img': src = element.get('src', '') if not src.startswith('http'): local_path = os.path.join(output_dir, src) else: local_path = src if os.path.exists(local_path): try: # 图片前添加空行 
                        # Blank paragraph before the picture for spacing
                        doc.add_paragraph()
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                        # Blank paragraph after the picture
                        doc.add_paragraph()
                    except Exception as e:
                        print(f"    Failed to insert image into Word: {local_path} - {e}")

            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    # HTML h1-h6 map directly onto Word heading levels 1-6;
                    # python-docx accepts levels 0-9, so the cap is a safety net
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level, 9))

            elif element.name == 'table':
                self._add_table_to_docx(doc, element)

            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(f"• {text}")

            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

            elif element.name == 'div':
                # Special parameter blocks (class names containing "param")
                classes = element.get('class') or []
                if any('param' in c for c in classes):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: Tag):
        """
        Convert an HTML table into a Word table.

        Args:
            doc: Document object to append to
            table_element: the <table> element
        """
        rows = table_element.find_all('tr')
        if not rows:
            return

        # HTML rows may have uneven cell counts, so size the table to the widest row
        max_cols = max(len(row.find_all(['td', 'th'])) for row in rows)
        if max_cols == 0:
            return

        try:
            word_table = doc.add_table(rows=len(rows), cols=max_cols)
            word_table.style = 'Table Grid'
            for i, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                for j, cell in enumerate(cells):
                    if j < max_cols:
                        word_table.rows[i].cells[j].text = cell.get_text(strip=True)
        except Exception as e:
            # Table creation failed; degrade to plain "|"-separated text rows
            print(f"  Table creation failed, falling back to text: {e}")
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_text = ' | '.join(cell.get_text(strip=True) for cell in cells)
                if row_text.strip():
                    doc.add_paragraph(row_text)
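

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the class above). The config
# keys shown are the ones this module actually reads; the constructor is
# inherited from BaseCrawler and is assumed here to accept a config dict.
# `html_text` and `url` stand for a fetched page and its address, and
# safe_filename(title) is assumed to return a filesystem-safe name.
#
#   crawler = ProductCrawler(config={"title_selector": "h1,h2", "title_index": 0})
#   soup = BeautifulSoup(html_text, "html.parser")
#   title = crawler.extract_title(soup, url)
#   body = crawler.extract_content(soup)
#   if body is not None:
#       doc = Document()
#       doc.add_heading(title, level=1)
#       crawler.add_content_to_docx(doc, body, output_dir="output", page_title=title)
#       doc.save(os.path.join("output", f"{safe_filename(title)}.docx"))
# ---------------------------------------------------------------------------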