# crawl4zeroerr/zeroerr_crawler/product_crawler.py
"""
产品页面爬虫
专门处理 eRob 机器人关节和 eCoder 编码器的产品详情页
"""
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
import os
from .base_crawler import BaseCrawler
from .utils import safe_filename
class ProductCrawler(BaseCrawler):
    """
    Product page crawler.

    Special handling for the eRob robot-joint and eCoder encoder
    product detail pages, which use several different page layouts.
    """

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main content container of a product page.

        Product page structure varies between product lines, so a list of
        candidate (tag, class) selectors is tried in priority order before
        falling back to the selectors configured on the base class.

        Args:
            soup: parsed page.

        Returns:
            The first matching content element, or whatever the base-class
            fallback returns (possibly None).
        """
        # Candidate containers, most specific first.
        selectors = [
            ('div', 'eRob_page_right'),    # right-hand content of eRob pages
            ('div', 'eCoder_page_main'),   # main content of eCoder pages
            ('div', 'product_page_main'),  # generic product page
            ('div', 'news_text_p'),        # news-style layout
        ]
        for tag, class_name in selectors:
            content = soup.find(tag, class_=class_name)
            if content:
                return content
        # Nothing matched: fall back to the configured selectors.
        return super().extract_content(soup)

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the product page title.

        The title may appear in different places; take the first <h1>
        whose text is not just the site name. If no usable <h1> exists,
        derive a title from the URL's last path segment.

        Args:
            soup: parsed page.
            url: page URL, used as the fallback title source.

        Returns:
            The page title as plain text.
        """
        for h1 in soup.find_all('h1'):
            text = h1.get_text(strip=True)
            # Skip headings that only contain the site name.
            if '零差云控' in text or '零误差' in text:
                continue
            if text:
                return text
        # Fallback: last URL segment without the .html suffix.
        return url.split('/')[-1].replace('.html', '')

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
        """
        Append the extracted product content to a Word document.

        Images, headings, lists, paragraphs, tables and parameter blocks
        receive product-page specific handling.

        Args:
            doc: target python-docx Document.
            content: extracted content subtree.
            output_dir: directory where downloaded images are stored.
        """
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
            # Skip elements nested inside tables or list items; those are
            # rendered by their container's handler instead.
            if element.find_parent(['table', 'li']):
                continue
            if element.name == 'img':
                src = element.get('src', '')
                if not src:
                    # Fix: an empty src used to resolve to output_dir itself.
                    continue
                if src.startswith('http'):
                    local_path = src
                else:
                    # Fix: strip a leading '/' so site-absolute paths are
                    # resolved under output_dir instead of the filesystem root
                    # (os.path.join discards earlier parts when given an
                    # absolute component).
                    local_path = os.path.join(output_dir, src.lstrip('/'))
                if os.path.exists(local_path):
                    try:
                        doc.add_paragraph()  # blank line before the image
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                        doc.add_paragraph()  # blank line after the image
                    except Exception as e:
                        print(f" Word插入图片失败: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    # Map HTML h1-h6 directly onto Word heading levels,
                    # clamped to 9 (the highest level Word supports).
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level, 9))
            elif element.name == 'table':
                self._add_table_to_docx(doc, element)
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    # Fix: dropped the redundant f-string wrapper around text.
                    doc.add_paragraph(text)
            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)
            elif element.name == 'div':
                # Parameter blocks (any class containing 'param') are
                # flattened to plain text.
                # NOTE(review): child elements of such a div (e.g. <p>) are
                # also visited by this loop, so their text may be emitted
                # twice — confirm whether that duplication is intended.
                classes = element.get('class') or []
                if any('param' in c for c in classes):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
        """
        Convert an HTML table into a Word table.

        Falls back to pipe-separated text paragraphs if python-docx
        fails to build the table.

        Args:
            doc: target python-docx Document.
            table_element: the <table> element to convert.
        """
        rows = table_element.find_all('tr')
        if not rows:
            return
        # Word tables are rectangular, so size columns to the widest row.
        max_cols = max(len(row.find_all(['td', 'th'])) for row in rows)
        if max_cols == 0:
            return
        try:
            word_table = doc.add_table(rows=len(rows), cols=max_cols)
            word_table.style = 'Table Grid'
            for i, row in enumerate(rows):
                for j, cell in enumerate(row.find_all(['td', 'th'])):
                    if j < max_cols:
                        word_table.rows[i].cells[j].text = cell.get_text(strip=True)
        except Exception as e:
            # Degrade gracefully: emit each row as "a | b | c" text.
            print(f" 表格创建失败,降级为文本: {e}")
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                if row_text.strip():
                    doc.add_paragraph(row_text)