初始提交:零差云控官网爬虫项目
This commit is contained in:
152
zeroerr_crawler/product_crawler.py
Normal file
152
zeroerr_crawler/product_crawler.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""
|
||||
产品页面爬虫
|
||||
专门处理 eRob 机器人关节和 eCoder 编码器的产品详情页
|
||||
"""
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from docx import Document
|
||||
from docx.shared import Inches
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
import os
|
||||
|
||||
from .base_crawler import BaseCrawler
|
||||
from .utils import safe_filename
|
||||
|
||||
|
||||
class ProductCrawler(BaseCrawler):
    """Crawler for product detail pages.

    Specialises ``BaseCrawler`` for the eRob (robot joint) and eCoder
    (encoder) product pages: it locates the main content container, picks a
    page title, and converts the content into a Word document.
    """

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """Return the main content container of a product page.

        Several known ``(tag, class)`` pairs are tried in order; the first
        match wins. If none matches, the lookup is delegated to
        ``BaseCrawler.extract_content``.

        Args:
            soup: Parsed page.

        Returns:
            The matched element, or whatever the base implementation returns.
        """
        selectors = (
            ('div', 'eRob_page_right'),    # right-hand content of eRob pages
            ('div', 'eCoder_page_main'),   # main content of eCoder pages
            ('div', 'product_page_main'),  # generic product page
            ('div', 'news_text_p'),        # news-style layout
        )

        for tag, class_name in selectors:
            content = soup.find(tag, class_=class_name)
            if content is not None:
                return content

        # Nothing matched: fall back to the base-class lookup.
        return super().extract_content(soup)

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """Return a title for the product page.

        The first non-empty ``<h1>`` that does not contain the site name is
        used. If there is none, the title is derived from the last path
        segment of ``url`` with ``.html`` removed.

        Args:
            soup: Parsed page.
            url: Address the page was fetched from.

        Returns:
            The chosen title.
        """
        for h1 in soup.find_all('h1'):
            text = h1.get_text(strip=True)
            # Skip empty headings and headings that carry the site name.
            if not text or '零差云控' in text or '零误差' in text:
                continue
            return text

        # Derive the title from the URL. Query string, fragment and trailing
        # slashes are dropped first so that ".../erob/" or ".../a.html?x=1"
        # do not yield an empty or polluted title.
        path = url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
        return path.rsplit('/', 1)[-1].replace('.html', '')

    @staticmethod
    def _has_ancestor_within(element, root, names) -> bool:
        """Return True if ``element`` has an ancestor named in ``names`` below ``root``.

        The walk stops at ``root`` so that the page layout surrounding
        ``root`` cannot influence the result.
        """
        for parent in element.parents:
            if parent is root:
                return False
            if parent.name in names:
                return True
        return False

    def _add_image_to_docx(self, doc: Document, element, output_dir: str) -> None:
        """Insert the image referenced by an ``<img>`` element, centred, if it exists locally."""
        src = element.get('src', '')
        if not src:
            # Without this guard the path would resolve to output_dir itself.
            return

        if src.startswith('http'):
            local_path = src
        else:
            local_path = os.path.join(output_dir, src)

        # Only real files can be inserted; remote URLs and directories are skipped.
        if not os.path.isfile(local_path):
            return

        try:
            doc.add_picture(local_path, width=Inches(4.5))
            doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
        except Exception as e:
            # Best effort: a broken image must not abort the whole document.
            print(f" Word插入图片失败: {local_path} - {e}")

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
        """Append the product content to a Word document.

        Paragraphs, headings, images, list items, tables and ``div`` blocks
        whose class contains ``param`` are visited in document order.
        Elements nested inside a ``<table>`` or ``<li>`` within ``content``
        are skipped because their container already emits their text.

        Args:
            doc: Target Word document.
            content: Root element holding the product content.
            output_dir: Directory against which non-``http`` image ``src``
                values are resolved.
        """
        headings = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
        wanted = ['p', *headings, 'img', 'li', 'table', 'div']

        for element in content.find_all(wanted):
            # Skip nested elements, looking only at ancestors inside `content`.
            if self._has_ancestor_within(element, content, ('table', 'li')):
                continue

            name = element.name

            if name == 'img':
                self._add_image_to_docx(doc, element, output_dir)

            elif name in headings:
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    level = int(name[1])
                    # Shift one level down; 9 is the deepest level requested.
                    doc.add_heading(text, level=min(level + 1, 9))

            elif name == 'table':
                self._add_table_to_docx(doc, element)

            elif name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(f"• {text}")

            elif name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

            elif name == 'div':
                # Special content blocks: only divs whose class contains "param".
                # NOTE(review): <p>/<li>/heading children of such a div are
                # also visited by this loop, so their text can appear twice —
                # confirm whether that is intended.
                if element.get('class') and any('param' in c for c in element.get('class', [])):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
        """Append an HTML table to the Word document.

        Only rows and cells that belong to ``table_element`` itself are used;
        the text of a nested table ends up inside the outer cell that holds
        it. If building the Word table fails, each row is written as a
        ``' | '``-joined paragraph instead.

        Args:
            doc: Target Word document.
            table_element: The HTML table element.
        """
        # Parse once into a grid of cell texts, ignoring nested tables.
        grid = []
        for row in table_element.find_all('tr'):
            if self._has_ancestor_within(row, table_element, ('table',)):
                continue
            grid.append([
                cell.get_text(strip=True)
                for cell in row.find_all(['td', 'th'])
                if not self._has_ancestor_within(cell, row, ('table',))
            ])

        if not grid:
            return

        # Widest row decides the column count.
        max_cols = max(len(texts) for texts in grid)
        if max_cols == 0:
            return

        try:
            word_table = doc.add_table(rows=len(grid), cols=max_cols)
            word_table.style = 'Table Grid'

            for word_row, texts in zip(word_table.rows, grid):
                # Fetch the row's cells once instead of once per cell.
                row_cells = word_row.cells
                for word_cell, text in zip(row_cells, texts):
                    word_cell.text = text
        except Exception as e:
            # Table creation failed: degrade to plain text, one paragraph per row.
            print(f" 表格创建失败,降级为文本: {e}")
            for texts in grid:
                row_text = ' | '.join(texts)
                if row_text.strip():
                    doc.add_paragraph(row_text)
|
||||
|
||||
Reference in New Issue
Block a user