Initial commit: crawler project for the ZeroErr (零差云控) official website
zeroerr_crawler/__init__.py (Normal file, 3 lines)
@@ -0,0 +1,3 @@
# ZeroErr (零差云控) official website crawler
# Crawls site content to generate RAGFlow knowledge-base documents
zeroerr_crawler/base_crawler.py (Normal file, 460 lines)
@@ -0,0 +1,460 @@
"""
Base crawler class.
Provides the shared crawling logic.
"""

import os
import time
import copy
import requests
from bs4 import BeautifulSoup
import markdownify
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from urllib.parse import urljoin
from abc import ABC, abstractmethod

from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR
from .utils import ensure_dir, download_image, safe_filename, make_absolute_url


class BaseCrawler(ABC):
    """
    Base crawler class.
    Provides shared page fetching, content extraction and document generation.
    """

    def __init__(self, task_config: dict):
        """
        Initialise the crawler.

        Args:
            task_config: task configuration dictionary
        """
        self.config = task_config
        self.name = task_config.get("name", "未命名")
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

        # Output directories
        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(self.name))
        self.images_dir = os.path.join(self.output_dir, "images")
        ensure_dir(self.output_dir)
        ensure_dir(self.images_dir)

    def fetch_page(self, url: str) -> BeautifulSoup | None:
        """
        Fetch a page.

        Args:
            url: page URL

        Returns:
            BeautifulSoup object, or None on failure
        """
        try:
            response = self.session.get(url, timeout=30)
            response.encoding = 'utf-8'
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f" 获取页面失败: {url} - {e}")
            return None

    def get_links_from_index(self, index_url: str) -> list[str]:
        """
        Extract sub-page links from an index page.

        Args:
            index_url: index page URL

        Returns:
            list of links
        """
        full_url = make_absolute_url(BASE_URL, index_url)
        print(f"正在从索引页提取链接: {full_url}")

        soup = self.fetch_page(full_url)
        if not soup:
            return []

        link_pattern = self.config.get("link_pattern", "")
        link_suffix = self.config.get("link_suffix", ".html")
        exclude_patterns = self.config.get("exclude_patterns", [])

        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']

            # Check whether the link matches the configured pattern
            if link_pattern and link_pattern not in href:
                continue
            if link_suffix and not href.endswith(link_suffix):
                continue

            # Check whether the link should be excluded
            excluded = False
            for pattern in exclude_patterns:
                if pattern in href:
                    excluded = True
                    break
            if excluded:
                continue

            # Convert to an absolute URL
            full_link = make_absolute_url(full_url, href)
            if full_link not in links:
                links.append(full_link)

        print(f"共发现 {len(links)} 个页面链接")
        return links

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the page title.

        Args:
            soup: BeautifulSoup object
            url: page URL (used for a fallback title)

        Returns:
            title text
        """
        selector = self.config.get("title_selector", "h1")
        index = self.config.get("title_index", 0)

        # The selector may hold several comma-separated tag names (e.g. "h1,h2"),
        # so pass them to find_all() as a list of names.
        tags = soup.find_all([s.strip() for s in selector.split(',')])
        if tags and len(tags) > index:
            return tags[index].get_text(strip=True)
        elif tags:
            return tags[0].get_text(strip=True)
        else:
            # Fall back to the last URL segment
            return url.split('/')[-1].replace('.html', '')

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main page content.

        Args:
            soup: BeautifulSoup object

        Returns:
            BeautifulSoup object for the content area, or None if not found
        """
        selector = self.config.get("content_selector", "")

        # Multiple selectors are supported, separated by commas
        selectors = [s.strip() for s in selector.split(',')]

        # Collect every matching content block
        all_contents = []

        for sel in selectors:
            if '.' in sel:
                # class selector
                tag, class_name = sel.split('.', 1)
                tag = tag if tag else 'div'
                content = soup.find(tag, class_=class_name)
            else:
                content = soup.find(sel)

            if content:
                all_contents.append(content)

        # Nothing matched
        if not all_contents:
            return None

        # Only one match: return it directly
        if len(all_contents) == 1:
            return all_contents[0]

        # Merge multiple content blocks into a single container
        combined = soup.new_tag('div')
        for content in all_contents:
            # Deep-copy so the original DOM is left untouched
            combined.append(copy.deepcopy(content))

        return combined

    def clean_content(self, content: BeautifulSoup) -> BeautifulSoup:
        """
        Clean the content by removing useless elements.

        Args:
            content: content area

        Returns:
            cleaned content
        """
        # Remove script and style tags
        for tag in content(['script', 'style']):
            tag.decompose()

        return content

    def process_images(self, content: BeautifulSoup, page_url: str) -> list[tuple[str, str]]:
        """
        Process the images in the content and download them locally.

        Args:
            content: content area
            page_url: page URL (used to resolve relative paths)

        Returns:
            list of image info tuples [(original URL, local path), ...]
        """
        images_info = []

        for img in content.find_all('img'):
            src = img.get('src')
            if not src:
                continue

            # Convert to an absolute URL
            full_url = make_absolute_url(page_url, src)

            # Download the image
            local_path = download_image(full_url, self.images_dir)

            if local_path:
                images_info.append((full_url, local_path))
                # Point the img tag at the local relative path
                img['src'] = os.path.relpath(local_path, self.output_dir).replace('\\', '/')
            else:
                # Download failed: keep the original URL
                img['src'] = full_url

        return images_info

    def content_to_markdown(self, content: BeautifulSoup) -> str:
        """
        Convert the content to Markdown.

        Args:
            content: content area

        Returns:
            Markdown text
        """
        return markdownify.markdownify(str(content), heading_style="ATX")

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
        """
        Add the content to a Word document.

        Args:
            doc: Document object
            content: content area
            output_dir: output directory (used to resolve image paths)
        """
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table']):
            if element.name == 'img':
                src = element.get('src', '')
                # Try to resolve a local image path
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src

                if os.path.exists(local_path):
                    try:
                        doc.add_picture(local_path, width=Inches(5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                    except Exception as e:
                        print(f" Word插入图片失败: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text:
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level + 1, 9))
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text, style='List Bullet')
            elif element.name == 'table':
                # Simple table handling: extract the text only
                for row in element.find_all('tr'):
                    cells = row.find_all(['td', 'th'])
                    row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                    if row_text.strip():
                        doc.add_paragraph(row_text)
            else:
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

    def crawl_page(self, url: str) -> dict | None:
        """
        Crawl a single page.

        Args:
            url: page URL

        Returns:
            page data dictionary, or None on failure
        """
        soup = self.fetch_page(url)
        if not soup:
            return None

        # Extract the title
        title = self.extract_title(soup, url)

        # Extract the content
        content = self.extract_content(soup)
        if not content:
            print(f" 警告: 页面未找到主内容区域: {url}")
            return None

        # Clean the content
        content = self.clean_content(content)

        # Process images
        images = self.process_images(content, url)

        # Convert to Markdown
        markdown = self.content_to_markdown(content)

        return {
            "url": url,
            "title": title,
            "content": content,
            "markdown": markdown,
            "images": images,
        }

    def save_single_page(self, page_data: dict):
        """
        Save a single page as standalone md and docx files.

        Args:
            page_data: page data dictionary
        """
        title = page_data["title"]
        safe_title = safe_filename(title)

        # Save Markdown
        md_path = os.path.join(self.output_dir, f"{safe_title}.md")
        md_content = f"# {title}\n\n"
        md_content += f"**原文链接**: {page_data['url']}\n\n"
        md_content += page_data["markdown"]

        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md_content)

        # Save Word
        docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
        doc = Document()
        doc.add_heading(title, 0)
        p = doc.add_paragraph()
        p.add_run(f"原文链接: {page_data['url']}").italic = True

        self.add_content_to_docx(doc, page_data["content"], self.output_dir)
        doc.save(docx_path)

    def save_combined_documents(self, all_pages: list[dict]):
        """
        Save all pages combined into one md and one docx file.

        Args:
            all_pages: list of all page data dictionaries
        """
        if not all_pages:
            return

        safe_name = safe_filename(self.name)

        # === Generate the combined Markdown ===
        combined_md = f"# {self.name}全集\n\n"
        combined_md += f"**生成时间**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        combined_md += f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。\n\n"
        combined_md += "---\n\n"

        # Append every page
        for page in all_pages:
            combined_md += f"## {page['title']}\n\n"
            combined_md += f"**原文链接**: {page['url']}\n\n"
            combined_md += page["markdown"]
            combined_md += "\n\n---\n\n"

        md_path = os.path.join(self.output_dir, f"{safe_name}_汇总.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(combined_md)
        print(f" 汇总 Markdown: {md_path}")

        # === Generate the combined Word document ===
        doc = Document()
        doc.add_heading(f'{self.name}全集', 0)

        intro = doc.add_paragraph()
        intro.add_run(f"生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}").italic = True
        doc.add_paragraph(f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。")
        doc.add_page_break()

        # Append every page
        for page in all_pages:
            doc.add_heading(page["title"], level=1)
            p = doc.add_paragraph()
            p.add_run(f"原文链接: {page['url']}").italic = True

            self.add_content_to_docx(doc, page["content"], self.output_dir)
            doc.add_page_break()

        docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx")
        doc.save(docx_path)
        print(f" 汇总 Word: {docx_path}")

    def run(self):
        """
        Run the crawl task.
        """
        print(f"\n{'='*60}")
        print(f"开始爬取: {self.name}")
        print(f"{'='*60}")

        # Collect page links
        if "static_pages" in self.config:
            # Static page list
            links = [make_absolute_url(BASE_URL, p) for p in self.config["static_pages"]]
        elif "index_url" in self.config:
            # Extract from the index page
            links = self.get_links_from_index(self.config["index_url"])
        else:
            print("错误: 配置中未指定 static_pages 或 index_url")
            return

        if not links:
            print("未获取到链接,跳过此任务")
            return

        # Crawl each page
        all_pages = []  # data of every successfully crawled page

        for i, url in enumerate(links):
            print(f"[{i+1}/{len(links)}] 正在抓取: {url}")

            page_data = self.crawl_page(url)
            if page_data:
                all_pages.append(page_data)

            # Delay between requests
            time.sleep(REQUEST_DELAY)

        # Generate the combined documents
        if all_pages:
            print(f"\n正在生成汇总文档(共 {len(all_pages)} 篇)...")
            self.save_combined_documents(all_pages)

        print(f"\n{self.name} 爬取完成!成功: {len(all_pages)}/{len(links)}")
        print(f"输出目录: {self.output_dir}")


class StandardCrawler(BaseCrawler):
    """
    Standard crawler.
    Suitable for most page types.
    """
    pass
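Editor's note: BaseCrawler is the extension point; subclasses reuse the shared fetch/convert/save pipeline and override extract_content, extract_title or clean_content where a section needs it. A minimal sketch of such an override (not part of this commit; the class name and the extra nav/footer cleanup targets are illustrative assumptions):

from bs4 import BeautifulSoup
from zeroerr_crawler.base_crawler import BaseCrawler

class NoNavCrawler(BaseCrawler):
    """Hypothetical subclass that also strips navigation and footer blocks."""

    def clean_content(self, content: BeautifulSoup) -> BeautifulSoup:
        content = super().clean_content(content)   # removes script/style tags as above
        for tag in content(['nav', 'footer']):     # assumed extra cleanup targets
            tag.decompose()
        return content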
zeroerr_crawler/config.py (Normal file, 133 lines)
@@ -0,0 +1,133 @@
"""
Crawler configuration.
Defines the configuration for every crawl task.
"""

BASE_URL = "https://www.zeroerr.cn"

# Request headers
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Delay between requests (seconds), to go easy on the server
REQUEST_DELAY = 0.5

# Output directory
OUTPUT_DIR = "output"

# Crawl task configuration.
# Each task defines: name, index page, link filtering rules, content selector and title selector.
CRAWL_TASKS = {
    # Application cases
    "case": {
        "name": "应用案例",
        "index_url": "/case/index.html",
        "link_pattern": "/case/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,  # use the second h1 tag
    },
    # FAQ
    "issue": {
        "name": "常见问题",
        "index_url": "/issue/index.html",
        "link_pattern": "/issue/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Company news
    "news": {
        "name": "企业新闻",
        "index_url": "/news/index.html",
        "link_pattern": "/news/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Certifications and qualifications
    "certification": {
        "name": "认证与资质",
        "index_url": "/Certification/index.html",
        "link_pattern": "/Certification/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Robot joint products
    "erob": {
        "name": "机器人关节",
        "index_url": "/eRob/index.html",
        "link_pattern": "/eRob/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.product_text_l,div.product_text",  # left column / whole content area of product pages
        "title_selector": "h1",
        "title_index": 0,
    },
    # Encoder products
    "ecoder": {
        "name": "编码器",
        "index_url": "/eCoder/index.html",
        "link_pattern": "/eCoder/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.product_text_l,div.product_text",  # left column / whole content area of product pages
        "title_selector": "h1",
        "title_index": 0,
    },
    # Accessories
    "tools": {
        "name": "配件",
        "index_url": "/Tools/index.html",
        "link_pattern": "/Tools/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.product_text_l,div.product_text_l1,div.product_text,div.news_text_p,div.eLiner_banner,div.web_cable_container",  # several layouts
        "title_selector": "h1,h2",  # some pages use h2 for the title
        "title_index": 0,
    },
    # "About us" and other static pages
    "about": {
        "name": "关于我们",
        "static_pages": [
            "/about/about-us.html",
            "/about/contact-us.html",
            "/about/join-us.html",
            "/about/152.html",  # agent recruitment
        ],
        "content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact",  # multi-block layout
        "title_selector": "h1,h2",
        "title_index": 0,
    },
    # Service and support (single page, crawled directly)
    "support": {
        "name": "服务与支持",
        "static_pages": [
            "/support/",  # the main page holds all the content
        ],
        "content_selector": "div.sidebar_container,div.content,div.content-section,div.news_text_p",
        "title_selector": "h2",
        "title_index": 0,
    },
    # Downloads (static page)
    "download": {
        "name": "资料下载",
        "static_pages": [
            "/download/77.html",  # download instructions page
        ],
        "content_selector": "div.news_text_p,div.news_text",
        "title_selector": "h1",
        "title_index": 0,
    },
}
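Editor's note: each CRAWL_TASKS entry is consumed by BaseCrawler.run(): index_url/static_pages choose how links are gathered, link_pattern/link_suffix/exclude_patterns filter them, and content_selector/title_selector/title_index drive extraction. This commit does not include an entry-point script; a minimal driver consistent with these configs might look like the sketch below (PRODUCT_TASKS and main() are assumptions, not part of the commit):

# Hypothetical driver: run every configured task, using ProductCrawler
# for the product sections and StandardCrawler for everything else.
from zeroerr_crawler.config import CRAWL_TASKS
from zeroerr_crawler.base_crawler import StandardCrawler
from zeroerr_crawler.product_crawler import ProductCrawler

PRODUCT_TASKS = {"erob", "ecoder"}  # assumption: product pages use the specialised crawler

def main() -> None:
    for key, task_config in CRAWL_TASKS.items():
        crawler_cls = ProductCrawler if key in PRODUCT_TASKS else StandardCrawler
        crawler_cls(task_config).run()

if __name__ == "__main__":
    main()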
zeroerr_crawler/product_crawler.py (Normal file, 152 lines)
@@ -0,0 +1,152 @@
"""
Product page crawler.
Dedicated handling for eRob robot joint and eCoder encoder product detail pages.
"""

from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
import os

from .base_crawler import BaseCrawler
from .utils import safe_filename


class ProductCrawler(BaseCrawler):
    """
    Product page crawler.
    Special handling for eRob and eCoder product pages.
    """

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main content of a product page.
        Product pages have a more complex structure and need special handling.
        """
        # Try several selectors in turn
        selectors = [
            ('div', 'eRob_page_right'),    # right-hand content on eRob pages
            ('div', 'eCoder_page_main'),   # main content on eCoder pages
            ('div', 'product_page_main'),  # generic product pages
            ('div', 'news_text_p'),        # news-style layout
        ]

        for tag, class_name in selectors:
            content = soup.find(tag, class_=class_name)
            if content:
                return content

        # Nothing matched: fall back to the configured selectors
        return super().extract_content(soup)

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the title of a product page.
        The title can live in different places on product pages.
        """
        # Try the first h1 after the breadcrumb navigation
        h1_tags = soup.find_all('h1')
        for h1 in h1_tags:
            text = h1.get_text(strip=True)
            # Skip the site name
            if '零差云控' in text or '零误差' in text:
                continue
            if text:
                return text

        # Fall back to the URL
        return url.split('/')[-1].replace('.html', '')

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
        """
        Add product content to a Word document.
        Optimised handling for tables and other product-page elements.
        """
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
            # Skip nested elements
            if element.find_parent(['table', 'li']):
                continue

            if element.name == 'img':
                src = element.get('src', '')
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src

                if os.path.exists(local_path):
                    try:
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                    except Exception as e:
                        print(f" Word插入图片失败: {local_path} - {e}")

            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level + 1, 9))

            elif element.name == 'table':
                # Handle tables
                self._add_table_to_docx(doc, element)

            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(f"• {text}")

            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

            elif element.name == 'div':
                # Handle special parameter div blocks
                if element.get('class') and any('param' in c for c in element.get('class', [])):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
        """
        Add an HTML table to the Word document.

        Args:
            doc: Document object
            table_element: table element
        """
        rows = table_element.find_all('tr')
        if not rows:
            return

        # Determine the maximum column count
        max_cols = 0
        for row in rows:
            cells = row.find_all(['td', 'th'])
            max_cols = max(max_cols, len(cells))

        if max_cols == 0:
            return

        # Create the Word table
        try:
            word_table = doc.add_table(rows=len(rows), cols=max_cols)
            word_table.style = 'Table Grid'

            for i, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                for j, cell in enumerate(cells):
                    if j < max_cols:
                        text = cell.get_text(strip=True)
                        word_table.rows[i].cells[j].text = text
        except Exception as e:
            # If table creation fails, fall back to plain text
            print(f" 表格创建失败,降级为文本: {e}")
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                if row_text.strip():
                    doc.add_paragraph(row_text)
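Editor's note: for ad-hoc debugging of one product page, the crawl_page()/save_single_page() pair inherited from BaseCrawler can be called directly. A sketch (the page URL is illustrative, not taken from the site map):

from zeroerr_crawler.config import CRAWL_TASKS
from zeroerr_crawler.product_crawler import ProductCrawler

crawler = ProductCrawler(CRAWL_TASKS["erob"])
page = crawler.crawl_page("https://www.zeroerr.cn/eRob/example.html")  # illustrative URL
if page:
    crawler.save_single_page(page)  # writes <title>.md and <title>.docx under output/机器人关节/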
zeroerr_crawler/utils.py (Normal file, 100 lines)
@@ -0,0 +1,100 @@
"""
Utility functions.
Shared helper functionality.
"""

import os
import hashlib
import requests
from urllib.parse import urljoin
from .config import HEADERS


def ensure_dir(path: str) -> None:
    """Make sure the directory exists, creating it if necessary."""
    os.makedirs(path, exist_ok=True)


def get_file_hash(url: str) -> str:
    """Generate a unique filename hash from a URL."""
    return hashlib.md5(url.encode()).hexdigest()[:12]


def get_file_extension(url: str) -> str:
    """Get the file extension from a URL."""
    # Drop query parameters
    clean_url = url.split('?')[0]
    ext = os.path.splitext(clean_url)[1].lower()
    if ext not in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg']:
        ext = '.jpg'  # default extension
    return ext


def download_image(img_url: str, save_dir: str, timeout: int = 15) -> str | None:
    """
    Download an image to local disk.

    Args:
        img_url: image URL
        save_dir: directory to save into
        timeout: request timeout

    Returns:
        local file path, or None on failure
    """
    try:
        ensure_dir(save_dir)

        url_hash = get_file_hash(img_url)
        ext = get_file_extension(img_url)
        local_filename = f"{url_hash}{ext}"
        local_path = os.path.join(save_dir, local_filename)

        # If already downloaded, return the existing path
        if os.path.exists(local_path):
            return local_path

        # Download the image
        response = requests.get(img_url, headers=HEADERS, timeout=timeout)
        if response.status_code == 200:
            with open(local_path, 'wb') as f:
                f.write(response.content)
            return local_path
        else:
            print(f" 图片下载失败 ({response.status_code}): {img_url}")
            return None
    except Exception as e:
        print(f" 图片下载出错: {img_url} - {e}")
        return None


def safe_filename(name: str, max_length: int = 50) -> str:
    """
    Build a filesystem-safe filename.

    Args:
        name: original name
        max_length: maximum length

    Returns:
        safe filename
    """
    # Remove or replace unsafe characters
    unsafe_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '\n', '\r', '\t']
    for char in unsafe_chars:
        name = name.replace(char, '_')

    # Strip leading/trailing whitespace
    name = name.strip()

    # Truncate to the maximum length
    if len(name) > max_length:
        name = name[:max_length]

    return name


def make_absolute_url(base_url: str, relative_url: str) -> str:
    """Convert a relative URL into an absolute one."""
    return urljoin(base_url, relative_url)
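Editor's note: a few illustrative calls showing how these helpers behave (the input strings are made up; the results in the comments follow directly from the code above):

from zeroerr_crawler.utils import safe_filename, get_file_extension, make_absolute_url

safe_filename('eRob 70:V2?')              # -> 'eRob 70_V2_' (':' and '?' replaced by '_')
get_file_extension('/img/photo.PNG?v=3')  # -> '.png' (query string dropped, lower-cased)
make_absolute_url('https://www.zeroerr.cn/case/index.html', '../news/1.html')
                                          # -> 'https://www.zeroerr.cn/news/1.html'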