"""
|
||
产品页面爬虫
|
||
专门处理 eRob 机器人关节和 eCoder 编码器的产品详情页
|
||
"""
|
||
|
||
from bs4 import BeautifulSoup
|
||
from docx import Document
|
||
from docx.shared import Inches
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
import os
|
||
|
||
from .base_crawler import BaseCrawler
|
||
from .utils import safe_filename
|
||
|
||
|
||
class ProductCrawler(BaseCrawler):
    """
    Product page crawler.

    Special handling for eRob and eCoder product pages.
    """

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main content area of a product page.

        Product pages have a more complex structure and need special handling.
        """
        # Try the known content containers in priority order
        selectors = [
            ('div', 'eRob_page_right'),    # right-hand content on eRob pages
            ('div', 'eCoder_page_main'),   # main content on eCoder pages
            ('div', 'product_page_main'),  # generic product page layout
            ('div', 'news_text_p'),        # news-style layout
        ]

        for tag, class_name in selectors:
            content = soup.find(tag, class_=class_name)
            if content:
                return content

        # If none matched, fall back to the selector configured on the base crawler
        return super().extract_content(soup)

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the title of a product page.

        The title may live in different places depending on the page layout.
        """
        # Prefer the selector from the config (supports h1, h2, etc.)
        selector = self.config.get("title_selector", "h1")
        index = self.config.get("title_index", 0)

        # Multiple selectors may be given, separated by commas
        selectors = [s.strip() for s in selector.split(',')]

        # Collect every matching tag
        all_tags = []
        for sel in selectors:
            # Plain tag names (e.g. "h1", "h2") can be looked up directly
            if sel in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
                found_tags = soup.find_all(sel)
                all_tags.extend(found_tags)
            else:
                # Anything else is treated as a CSS selector
                found_tags = soup.select(sel)
                all_tags.extend(found_tags)

        if all_tags:
            # Prefer the tag at the configured index
            if len(all_tags) > index:
                title = all_tags[index].get_text(strip=True)
                # Skip titles that are just the site name
                if title and '零差云控' not in title and '零误差' not in title:
                    return title
            # If the indexed tag is missing or was skipped, try the other tags
            for tag in all_tags:
                title = tag.get_text(strip=True)
                # Skip titles that are just the site name
                if title and '零差云控' not in title and '零误差' not in title:
                    return title

        # Try the page's <title> tag
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            # Strip the site-name suffix (e.g. " - 零差云控")
            if ' - ' in title:
                title = title.split(' - ')[0].strip()
            if title and title.lower() not in ['about-us', 'contact-us', 'join-us']:
                return title

        # As a last resort, derive a title from the URL
        url_part = url.split('/')[-1].replace('.html', '')
        # Replace hyphens with spaces and capitalize each word
        if '-' in url_part:
            url_part = ' '.join(word.capitalize() for word in url_part.split('-'))
        return url_part

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str, page_title: str = None):
        """
        Add product content to a Word document.

        Tables and other product-page elements get optimised handling.

        Args:
            doc: Document object
            content: content region
            output_dir: output directory (used to resolve image paths)
            page_title: page title (if given, h1/h2 tags or paragraphs that
                duplicate the title are skipped)
        """
        # If a page title is given, work on a copy of the content and remove
        # elements that duplicate the title
        if page_title:
            content = BeautifulSoup(str(content), 'html.parser')

            # Remove the first h1 that exactly matches the title
            first_h1 = content.find('h1')
            if first_h1:
                h1_text = first_h1.get_text(strip=True)
                if h1_text == page_title:
                    first_h1.decompose()

            # Remove the first h2 that exactly matches the title
            first_h2 = content.find('h2')
            if first_h2:
                h2_text = first_h2.get_text(strip=True)
                if h2_text == page_title:
                    first_h2.decompose()

            # If the title carries a "型号:" (model) prefix, also remove an h2
            # that contains only the product name.
            # Example: title "型号:eCoder11" with an h2 containing "eCoder11".
            # Both full-width and ASCII colons are handled.
            if '型号:' in page_title or '型号:' in page_title:
                product_name = page_title.replace('型号:', '').replace('型号:', '').strip()
                if product_name:
                    # Find the first h2 that contains only the product name
                    for h2 in content.find_all('h2'):
                        h2_text = h2.get_text(strip=True)
                        if h2_text == product_name:
                            h2.decompose()
                            break  # only remove the first match

        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
            # Skip elements nested inside tables or list items
            if element.find_parent(['table', 'li']):
                continue

            if element.name == 'img':
                src = element.get('src', '')
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src

                if os.path.exists(local_path):
                    try:
                        # Blank paragraph before the image
                        doc.add_paragraph()
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                        # Blank paragraph after the image
                        doc.add_paragraph()
                    except Exception as e:
                        print(f"  Failed to insert image into Word doc: {local_path} - {e}")

            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    # Map HTML h1-h6 directly to Word Heading 1-6,
                    # capped at 9 (the highest heading level Word supports)
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level, 9))

            elif element.name == 'table':
                # Handle tables
                self._add_table_to_docx(doc, element)

            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(f"• {text}")

            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

            elif element.name == 'div':
                # Handle special parameter div blocks
                if element.get('class') and any('param' in c for c in element.get('class', [])):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
        """
        Add an HTML table to the Word document.

        Args:
            doc: Document object
            table_element: the table element
        """
        rows = table_element.find_all('tr')
        if not rows:
            return

        # Determine the maximum column count
        max_cols = 0
        for row in rows:
            cells = row.find_all(['td', 'th'])
            max_cols = max(max_cols, len(cells))

        if max_cols == 0:
            return

        # Create the Word table
        try:
            word_table = doc.add_table(rows=len(rows), cols=max_cols)
            word_table.style = 'Table Grid'

            for i, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                for j, cell in enumerate(cells):
                    if j < max_cols:
                        text = cell.get_text(strip=True)
                        word_table.rows[i].cells[j].text = text
        except Exception as e:
            # If table creation fails, fall back to plain text
            print(f"  Table creation failed, falling back to plain text: {e}")
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                if row_text.strip():
                    doc.add_paragraph(row_text)
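
# ---------------------------------------------------------------------------
# Minimal usage sketch. This assumes BaseCrawler is constructed with a config
# dict and exposes a `fetch_page(url)` helper returning a BeautifulSoup object;
# adjust to the actual BaseCrawler interface.
#
#   crawler = ProductCrawler(config={"title_selector": "h1, h2", "title_index": 0})
#   url = "https://example.com/erob-70.html"
#   soup = crawler.fetch_page(url)
#   content = crawler.extract_content(soup)
#   title = crawler.extract_title(soup, url)
#   doc = Document()
#   doc.add_heading(title, level=0)
#   crawler.add_content_to_docx(doc, content, output_dir="output", page_title=title)
#   doc.save(os.path.join("output", safe_filename(title) + ".docx"))
# ---------------------------------------------------------------------------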