# crawl4zeroerr/zeroerr_crawler/product_crawler.py
"""
产品页面爬虫
专门处理 eRob 机器人关节和 eCoder 编码器的产品详情页
"""
import os

from bs4 import BeautifulSoup
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches

from .base_crawler import BaseCrawler
from .utils import safe_filename


class ProductCrawler(BaseCrawler):
    """
    Product page crawler.

    Special handling for eRob and eCoder product pages.
    """
    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main content of a product page.

        Product pages have a fairly complex structure and need special handling.
        """
        # Try several selectors in turn
        selectors = [
            ('div', 'eRob_page_right'),    # right-hand content on eRob pages
            ('div', 'eCoder_page_main'),   # main content on eCoder pages
            ('div', 'product_page_main'),  # generic product pages
            ('div', 'news_text_p'),        # news-style layout
        ]
        for tag, class_name in selectors:
            content = soup.find(tag, class_=class_name)
            if content:
                return content
        # Nothing matched; fall back to the selector from the config
        return super().extract_content(soup)

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the product page title.

        The title can live in different places depending on the page.
        """
        # Prefer the selector from the config (supports h1, h2, etc.)
        selector = self.config.get("title_selector", "h1")
        index = self.config.get("title_index", 0)
        # Multiple selectors may be given, separated by commas
        selectors = [s.strip() for s in selector.split(',')]
        # Collect every matching tag
        all_tags = []
        for sel in selectors:
            if sel in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
                # Plain tag names (e.g. "h1", "h2") can be looked up directly
                found_tags = soup.find_all(sel)
            else:
                # Anything else is treated as a CSS selector
                found_tags = soup.select(sel)
            all_tags.extend(found_tags)
        if all_tags:
            # Try the configured index first
            if len(all_tags) > index:
                title = all_tags[index].get_text(strip=True)
                # Skip the site name
                if title and '零差云控' not in title and '零误差' not in title:
                    return title
            # The indexed tag was missing or skipped; try the others
            for tag in all_tags:
                title = tag.get_text(strip=True)
                # Skip the site name
                if title and '零差云控' not in title and '零误差' not in title:
                    return title
        # Try the page's <title> tag
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            # Strip the site-name suffix (e.g. " - 零差云控")
            if ' - ' in title:
                title = title.split(' - ')[0].strip()
            if title and title.lower() not in ['about-us', 'contact-us', 'join-us']:
                return title
        # Finally, fall back to the URL
        url_part = url.split('/')[-1].replace('.html', '')
        # Replace hyphens with spaces and capitalize each word
        if '-' in url_part:
            url_part = ' '.join(word.capitalize() for word in url_part.split('-'))
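            # e.g. a hypothetical slug "erob-70h" becomes "Erob 70h"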
        return url_part

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str, page_title: str = None):
        """
        Add product content to the Word document.

        Tables and other product-page structures get special treatment.

        Args:
            doc: Document object
            content: the content region
            output_dir: output directory (used to resolve image paths)
            page_title: page title; when given, h1/h2 tags (or paragraphs)
                that duplicate the title are skipped
        """
        # When a page title is given, work on a copy of the content and
        # remove elements that duplicate the title
        if page_title:
            content = BeautifulSoup(str(content), 'html.parser')
            # Remove the first h1 that exactly matches the title
            first_h1 = content.find('h1')
            if first_h1:
                h1_text = first_h1.get_text(strip=True)
                if h1_text == page_title:
                    first_h1.decompose()
            # Remove the first h2 that exactly matches the title
            first_h2 = content.find('h2')
            if first_h2:
                h2_text = first_h2.get_text(strip=True)
                if h2_text == page_title:
                    first_h2.decompose()
            # If the title carries a "型号:" (model number) prefix, in either
            # full-width or ASCII colon form, also remove an h2 holding just
            # the bare product name.
            # e.g. title "型号:eCoder11" with an <h2>eCoder11</h2> in the body
            if '型号:' in page_title or '型号:' in page_title:
                product_name = page_title.replace('型号:', '').replace('型号:', '').strip()
                if product_name:
                    # Find the first h2 that contains only the product name
                    for h2 in content.find_all('h2'):
                        h2_text = h2.get_text(strip=True)
                        if h2_text == product_name:
                            h2.decompose()
                            break  # only remove the first match
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
            # Skip elements nested inside tables or list items
            if element.find_parent(['table', 'li']):
                continue
            if element.name == 'img':
                src = element.get('src', '')
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src
                if os.path.exists(local_path):
                    try:
                        # Blank line before the image
                        doc.add_paragraph()
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                        # Blank line after the image
                        doc.add_paragraph()
                    except Exception as e:
                        print(f"  Failed to insert image into Word: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    # For headings inside the content: h1 becomes Heading 2,
                    # h2-h6 keep their level. The page title is already
                    # Heading 1, so an h1 in the body must be demoted.
                    original_level = int(element.name[1])
                    if original_level == 1:
                        word_level = 2
                        print(f"  Heading level conversion: h1 '{text}' → Heading 2")
                    else:
                        # h2-h6 keep their level: h2→Heading 2, h3→Heading 3, ...
                        word_level = original_level
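                    # Word's built-in heading styles only go up to "Heading 9",
                    # hence the cap below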
                    doc.add_heading(text, level=min(word_level, 9))
            elif element.name == 'table':
                # Handle tables
                self._add_table_to_docx(doc, element)
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(f"• {text}")
            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)
            elif element.name == 'div':
                # Handle special div content blocks (parameter blocks)
                if element.get('class') and any('param' in c for c in element.get('class', [])):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
        """
        Add an HTML table to the Word document.

        Args:
            doc: Document object
            table_element: the table element
        """
        rows = table_element.find_all('tr')
        if not rows:
            return
        # Find the maximum column count
        max_cols = 0
        for row in rows:
            cells = row.find_all(['td', 'th'])
            max_cols = max(max_cols, len(cells))
        if max_cols == 0:
            return
        # Build the Word table
        try:
            word_table = doc.add_table(rows=len(rows), cols=max_cols)
            word_table.style = 'Table Grid'
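            # Note: colspan/rowspan are not expanded, so merged cells shift
            # left within their row instead of keeping the original grid.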
            for i, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                for j, cell in enumerate(cells):
                    if j < max_cols:
                        text = cell.get_text(strip=True)
                        word_table.rows[i].cells[j].text = text
        except Exception as e:
            # If table creation fails, fall back to plain text
            print(f"  Table creation failed, falling back to text: {e}")
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                if row_text.strip():
                    doc.add_paragraph(row_text)
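

# Minimal usage sketch (hypothetical: assumes BaseCrawler accepts a config
# dict and that the page HTML has already been fetched):
#
#   from zeroerr_crawler.product_crawler import ProductCrawler
#
#   crawler = ProductCrawler(config={
#       "title_selector": "h1, h2",  # comma-separated selectors tried in order
#       "title_index": 0,
#   })
#   soup = BeautifulSoup(html, "html.parser")
#   content = crawler.extract_content(soup)
#   title = crawler.extract_title(soup, url)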