crawl4zeroerr/zeroerr_crawler/base_crawler.py
"""
基础爬虫类
提供通用的爬取逻辑
"""
import os
import time
import copy
import requests
from bs4 import BeautifulSoup
import markdownify
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from urllib.parse import urljoin
from abc import ABC, abstractmethod
from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR
from .utils import ensure_dir, download_image, safe_filename, make_absolute_url
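
# Illustrative task configuration (a sketch, not shipped config): the key names below
# are exactly the ones BaseCrawler reads via .get(); the values are assumptions and
# depend on the target site. Either "index_url" or "static_pages" must be present.
#
# EXAMPLE_TASK_CONFIG = {
#     "name": "product-manual",                   # used for the output directory name
#     "index_url": "/docs/index.html",            # or: "static_pages": ["/docs/a.html", ...]
#     "link_pattern": "/docs/",                   # substring a candidate href must contain
#     "link_suffix": ".html",                     # suffix a candidate href must end with
#     "exclude_patterns": ["index"],              # hrefs containing these are skipped
#     "title_selector": "h1",                     # tag used by extract_title
#     "title_index": 0,                           # which match of title_selector to use
#     "content_selector": "div.article, article"  # comma-separated selectors for extract_content
# }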


class BaseCrawler(ABC):
    """
    Base crawler class.
    Provides shared page fetching, content extraction, and document generation.
    """

    def __init__(self, task_config: dict):
        """
        Initialize the crawler.

        Args:
            task_config: task configuration dict
        """
        self.config = task_config
        self.name = task_config.get("name", "Unnamed")
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        # Output directories
        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(self.name))
        self.images_dir = os.path.join(self.output_dir, "images")
        ensure_dir(self.output_dir)
        ensure_dir(self.images_dir)

    def fetch_page(self, url: str) -> BeautifulSoup | None:
        """
        Fetch a page.

        Args:
            url: page URL

        Returns:
            BeautifulSoup object, or None on failure
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()  # treat HTTP errors (4xx/5xx) as failures
            response.encoding = 'utf-8'
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f" Failed to fetch page: {url} - {e}")
            return None

    def get_links_from_index(self, index_url: str) -> list[str]:
        """
        Extract sub-page links from an index page.

        Args:
            index_url: index page URL

        Returns:
            list of links
        """
        full_url = make_absolute_url(BASE_URL, index_url)
        print(f"Extracting links from index page: {full_url}")
        soup = self.fetch_page(full_url)
        if not soup:
            return []
        link_pattern = self.config.get("link_pattern", "")
        link_suffix = self.config.get("link_suffix", ".html")
        exclude_patterns = self.config.get("exclude_patterns", [])
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Check whether the href matches the configured pattern
            if link_pattern and link_pattern not in href:
                continue
            if link_suffix and not href.endswith(link_suffix):
                continue
            # Check whether the href should be excluded
            excluded = False
            for pattern in exclude_patterns:
                if pattern in href:
                    excluded = True
                    break
            if excluded:
                continue
            # Convert to an absolute URL
            full_link = make_absolute_url(full_url, href)
            if full_link not in links:
                links.append(full_link)
        print(f"Found {len(links)} page links")
        return links
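
    # Example of the filtering in get_links_from_index above (illustrative values):
    # with link_pattern="/docs/", link_suffix=".html" and exclude_patterns=["index"],
    # an href such as "/docs/guide/setup.html" is kept, while "/docs/index.html"
    # (excluded) and "/docs/api.pdf" (wrong suffix) are skipped.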

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the page title.

        Args:
            soup: BeautifulSoup object
            url: page URL, used to derive a default title

        Returns:
            title text
        """
        selector = self.config.get("title_selector", "h1")
        index = self.config.get("title_index", 0)
        tags = soup.find_all(selector)
        if tags and len(tags) > index:
            return tags[index].get_text(strip=True)
        elif tags:
            return tags[0].get_text(strip=True)
        else:
            # Fall back to the last URL segment as the title
            return url.split('/')[-1].replace('.html', '')

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main content of a page.

        Args:
            soup: BeautifulSoup object

        Returns:
            BeautifulSoup object for the content area, or None if not found
        """
        selector = self.config.get("content_selector", "")
        # Multiple selectors are supported, separated by commas
        selectors = [s.strip() for s in selector.split(',')]
        # Collect every matching content area
        all_contents = []
        for sel in selectors:
            if '.' in sel:
                # class selector, e.g. "div.article-content"
                tag, class_name = sel.split('.', 1)
                tag = tag if tag else 'div'
                content = soup.find(tag, class_=class_name)
            else:
                content = soup.find(sel)
            if content:
                all_contents.append(content)
        # Nothing matched
        if not all_contents:
            return None
        # A single match is returned as-is
        if len(all_contents) == 1:
            return all_contents[0]
        # Merge multiple content areas into one container
        combined = soup.new_tag('div')
        for content in all_contents:
            # Deep-copy so the originals are not detached from the source DOM
            combined.append(copy.deepcopy(content))
        return combined
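
    # Example of the selector handling in extract_content above (assumed selector
    # strings): "div.article-content" resolves to soup.find('div', class_='article-content');
    # ".doc-body" to soup.find('div', class_='doc-body'); a bare "article" to
    # soup.find('article'). A value such as "div.article-content, article" collects
    # both areas and merges them into one <div>.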

    def clean_content(self, content: BeautifulSoup) -> BeautifulSoup:
        """
        Clean the content by removing useless elements.

        Args:
            content: content area

        Returns:
            cleaned content
        """
        # Remove script and style tags
        for tag in content(['script', 'style']):
            tag.decompose()
        return content

    def process_images(self, content: BeautifulSoup, page_url: str) -> list[tuple[str, str]]:
        """
        Process images in the content and download them locally.

        Args:
            content: content area
            page_url: page URL, used to resolve relative paths

        Returns:
            list of image info tuples [(original URL, local path), ...]
        """
        images_info = []
        for img in content.find_all('img'):
            src = img.get('src')
            if not src:
                continue
            # Convert to an absolute URL
            full_url = make_absolute_url(page_url, src)
            # Download the image
            local_path = download_image(full_url, self.images_dir)
            if local_path:
                images_info.append((full_url, local_path))
                # Point the img tag's src at the local relative path
                img['src'] = os.path.relpath(local_path, self.output_dir).replace('\\', '/')
            else:
                # Keep the original URL if the download failed
                img['src'] = full_url
        return images_info

    def content_to_markdown(self, content: BeautifulSoup) -> str:
        """
        Convert the content to Markdown.

        Args:
            content: content area

        Returns:
            Markdown text
        """
        return markdownify.markdownify(str(content), heading_style="ATX")

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
        """
        Append the content to a Word document.

        Args:
            doc: Document object
            content: content area
            output_dir: output directory (used to resolve image paths)
        """
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table']):
            if element.name == 'img':
                src = element.get('src', '')
                # Try to resolve the local image path
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src
                if os.path.exists(local_path):
                    try:
                        doc.add_picture(local_path, width=Inches(5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                    except Exception as e:
                        print(f" Failed to insert image into Word document: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text:
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level + 1, 9))
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text, style='List Bullet')
            elif element.name == 'table':
                # Handle tables simply: extract the row text only
                for row in element.find_all('tr'):
                    cells = row.find_all(['td', 'th'])
                    row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                    if row_text.strip():
                        doc.add_paragraph(row_text)
            else:
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

    def crawl_page(self, url: str) -> dict | None:
        """
        Crawl a single page.

        Args:
            url: page URL

        Returns:
            page data dict, or None on failure
        """
        soup = self.fetch_page(url)
        if not soup:
            return None
        # Extract the title
        title = self.extract_title(soup, url)
        # Extract the main content
        content = self.extract_content(soup)
        if not content:
            print(f" Warning: no main content area found on page: {url}")
            return None
        # Clean the content
        content = self.clean_content(content)
        # Process images
        images = self.process_images(content, url)
        # Convert to Markdown
        markdown = self.content_to_markdown(content)
        return {
            "url": url,
            "title": title,
            "content": content,
            "markdown": markdown,
            "images": images,
        }

    def save_single_page(self, page_data: dict):
        """
        Save a single page as standalone .md and .docx files.

        Args:
            page_data: page data dict
        """
        title = page_data["title"]
        safe_title = safe_filename(title)
        # Save Markdown
        md_path = os.path.join(self.output_dir, f"{safe_title}.md")
        md_content = f"# {title}\n\n"
        md_content += f"**Source link**: {page_data['url']}\n\n"
        md_content += page_data["markdown"]
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md_content)
        # Save Word document
        docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
        doc = Document()
        doc.add_heading(title, 0)
        p = doc.add_paragraph()
        p.add_run(f"Source link: {page_data['url']}").italic = True
        self.add_content_to_docx(doc, page_data["content"], self.output_dir)
        doc.save(docx_path)

    def save_combined_documents(self, all_pages: list[dict]):
        """
        Save all pages combined into a single .md and a single .docx file.

        Args:
            all_pages: list of all page data dicts
        """
        if not all_pages:
            return
        safe_name = safe_filename(self.name)
        # === Build the combined Markdown ===
        combined_md = f"# {self.name} - Complete Collection\n\n"
        combined_md += f"**Generated**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        combined_md += f"This document combines all {self.name} content from the ZeroErr official website, {len(all_pages)} articles in total.\n\n"
        combined_md += "---\n\n"
        # Append each article
        for page in all_pages:
            combined_md += f"## {page['title']}\n\n"
            combined_md += f"**Source link**: {page['url']}\n\n"
            combined_md += page["markdown"]
            combined_md += "\n\n---\n\n"
        md_path = os.path.join(self.output_dir, f"{safe_name}_combined.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(combined_md)
        print(f" Combined Markdown: {md_path}")
        # === Build the combined Word document ===
        doc = Document()
        doc.add_heading(f'{self.name} - Complete Collection', 0)
        intro = doc.add_paragraph()
        intro.add_run(f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}").italic = True
        doc.add_paragraph(f"This document combines all {self.name} content from the ZeroErr official website, {len(all_pages)} articles in total.")
        doc.add_page_break()
        # Append each article
        for page in all_pages:
            doc.add_heading(page["title"], level=1)
            p = doc.add_paragraph()
            p.add_run(f"Source link: {page['url']}").italic = True
            self.add_content_to_docx(doc, page["content"], self.output_dir)
            doc.add_page_break()
        docx_path = os.path.join(self.output_dir, f"{safe_name}_combined.docx")
        doc.save(docx_path)
        print(f" Combined Word document: {docx_path}")

    def run(self):
        """
        Run the crawl task.
        """
        print(f"\n{'='*60}")
        print(f"Starting crawl: {self.name}")
        print(f"{'='*60}")
        # Collect page links
        if "static_pages" in self.config:
            # Static page list
            links = [make_absolute_url(BASE_URL, p) for p in self.config["static_pages"]]
        elif "index_url" in self.config:
            # Extract from the index page
            links = self.get_links_from_index(self.config["index_url"])
        else:
            print("Error: neither static_pages nor index_url is specified in the config")
            return
        if not links:
            print("No links found, skipping this task")
            return
        # Crawl every page
        all_pages = []  # data of all successfully crawled pages
        for i, url in enumerate(links):
            print(f"[{i+1}/{len(links)}] Fetching: {url}")
            page_data = self.crawl_page(url)
            if page_data:
                all_pages.append(page_data)
            # Delay between requests
            time.sleep(REQUEST_DELAY)
        # Generate the combined documents
        if all_pages:
            print(f"\nGenerating combined documents ({len(all_pages)} articles)...")
            self.save_combined_documents(all_pages)
        print(f"\n{self.name} crawl finished. Success: {len(all_pages)}/{len(links)}")
        print(f"Output directory: {self.output_dir}")


class StandardCrawler(BaseCrawler):
    """
    Standard crawler.
    Suitable for most page types.
    """
    pass
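

# Minimal usage sketch (illustrative only): the config values below are assumptions,
# not real site settings; adjust them to match the pages being crawled.
if __name__ == "__main__":
    example_config = {
        "name": "example-docs",
        "index_url": "/docs/index.html",
        "link_pattern": "/docs/",
        "link_suffix": ".html",
        "content_selector": "div.article-content",
    }
    crawler = StandardCrawler(example_config)
    crawler.run()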