Initial commit: 零差云控 official website crawler project

oy2020
2026-01-29 17:29:14 +08:00
commit 51b67b9e68
406 changed files with 14247 additions and 0 deletions


@@ -0,0 +1,460 @@
"""
基础爬虫类
提供通用的爬取逻辑
"""
import os
import time
import copy
import requests
from bs4 import BeautifulSoup
import markdownify
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from urllib.parse import urljoin
from abc import ABC, abstractmethod
from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR
from .utils import ensure_dir, download_image, safe_filename, make_absolute_url
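
# Illustrative note (assumption, not part of the original module): judging by
# the self.config.get(...) accesses below, a task_config dict is expected to
# look roughly like this:
#
#     {
#         "name": "news",                          # task name, also the output folder name
#         "index_url": "/news/index.html",         # or "static_pages": ["/about.html", ...]
#         "link_pattern": "/news/",                # substring a link must contain
#         "link_suffix": ".html",                  # required link suffix
#         "exclude_patterns": ["index"],           # substrings that exclude a link
#         "title_selector": "h1",                  # tag whose text becomes the title
#         "title_index": 0,                        # which matching tag to use
#         "content_selector": "div.news_content",  # main content area
#     }
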
class BaseCrawler(ABC):
    """
    Base crawler class.
    Provides shared page fetching, content extraction, and document generation.
    """

    def __init__(self, task_config: dict):
        """
        Initialize the crawler.

        Args:
            task_config: task configuration dictionary
        """
        self.config = task_config
        self.name = task_config.get("name", "unnamed")
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        # Output directories
        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(self.name))
        self.images_dir = os.path.join(self.output_dir, "images")
        ensure_dir(self.output_dir)
        ensure_dir(self.images_dir)

    def fetch_page(self, url: str) -> BeautifulSoup | None:
        """
        Fetch a page.

        Args:
            url: page URL

        Returns:
            BeautifulSoup object, or None on failure
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()  # treat HTTP error status codes as failures
            response.encoding = 'utf-8'
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f"  Failed to fetch page: {url} - {e}")
            return None

    def get_links_from_index(self, index_url: str) -> list[str]:
        """
        Extract sub-page links from an index page.

        Args:
            index_url: index page URL

        Returns:
            list of links
        """
        full_url = make_absolute_url(BASE_URL, index_url)
        print(f"Extracting links from index page: {full_url}")
        soup = self.fetch_page(full_url)
        if not soup:
            return []
        link_pattern = self.config.get("link_pattern", "")
        link_suffix = self.config.get("link_suffix", ".html")
        exclude_patterns = self.config.get("exclude_patterns", [])
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Check whether the link matches the configured pattern
            if link_pattern and link_pattern not in href:
                continue
            if link_suffix and not href.endswith(link_suffix):
                continue
            # Check whether the link should be excluded
            excluded = False
            for pattern in exclude_patterns:
                if pattern in href:
                    excluded = True
                    break
            if excluded:
                continue
            # Convert to an absolute URL
            full_link = make_absolute_url(full_url, href)
            if full_link not in links:
                links.append(full_link)
        print(f"Found {len(links)} page links in total")
        return links

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the page title.

        Args:
            soup: BeautifulSoup object
            url: page URL, used to build a fallback title

        Returns:
            title text
        """
        selector = self.config.get("title_selector", "h1")
        index = self.config.get("title_index", 0)
        tags = soup.find_all(selector)
        if tags and len(tags) > index:
            return tags[index].get_text(strip=True)
        elif tags:
            return tags[0].get_text(strip=True)
        else:
            # Fall back to the last URL segment as the title
            return url.split('/')[-1].replace('.html', '')

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main content of a page.

        Args:
            soup: BeautifulSoup object

        Returns:
            BeautifulSoup object for the content area, or None if not found
        """
        selector = self.config.get("content_selector", "")
        # Multiple selectors are supported, separated by commas
        selectors = [s.strip() for s in selector.split(',')]
        # Collect every matched content area
        all_contents = []
        for sel in selectors:
            if '.' in sel:
                # class selector
                tag, class_name = sel.split('.', 1)
                tag = tag if tag else 'div'
                content = soup.find(tag, class_=class_name)
            else:
                # plain tag selector
                content = soup.find(sel)
            if content:
                all_contents.append(content)
        # Nothing matched
        if not all_contents:
            return None
        # Exactly one match: return it directly
        if len(all_contents) == 1:
            return all_contents[0]
        # Merge multiple content areas into a single container
        combined = soup.new_tag('div')
        for content in all_contents:
            # Deep-copy the content to avoid detaching it from the original DOM
            combined.append(copy.deepcopy(content))
        return combined
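
    # Illustrative note (assumed example values, grounded in the parsing above):
    # "content_selector" uses a small tag.class syntax. For instance,
    # "div.news_content" matches <div class="news_content">, a bare "article"
    # matches the <article> tag, and "div.page_title, div.news_content" merges
    # both regions into one container.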

    def clean_content(self, content: BeautifulSoup) -> BeautifulSoup:
        """
        Clean the content by removing useless elements.

        Args:
            content: content area

        Returns:
            cleaned content
        """
        # Remove script and style tags
        for tag in content(['script', 'style']):
            tag.decompose()
        return content

    def process_images(self, content: BeautifulSoup, page_url: str) -> list[tuple[str, str]]:
        """
        Process the images in the content and download them locally.

        Args:
            content: content area
            page_url: page URL, used to resolve relative paths

        Returns:
            list of image info tuples [(original URL, local path), ...]
        """
        images_info = []
        for img in content.find_all('img'):
            src = img.get('src')
            if not src:
                continue
            # Convert to an absolute URL
            full_url = make_absolute_url(page_url, src)
            # Download the image
            local_path = download_image(full_url, self.images_dir)
            if local_path:
                images_info.append((full_url, local_path))
                # Point the img tag's src at the local relative path
                img['src'] = os.path.relpath(local_path, self.output_dir).replace('\\', '/')
            else:
                # Keep the original URL if the download failed
                img['src'] = full_url
        return images_info

    def content_to_markdown(self, content: BeautifulSoup) -> str:
        """
        Convert the content to Markdown.

        Args:
            content: content area

        Returns:
            Markdown text
        """
        return markdownify.markdownify(str(content), heading_style="ATX")

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
        """
        Append the content to a Word document.

        Args:
            doc: Document object
            content: content area
            output_dir: output directory (used to resolve image paths)
        """
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table']):
            if element.name == 'img':
                src = element.get('src', '')
                # Resolve the local path when the src is not an absolute URL
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src
                if os.path.exists(local_path):
                    try:
                        doc.add_picture(local_path, width=Inches(5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                    except Exception as e:
                        print(f"  Failed to insert image into Word document: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text:
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level + 1, 9))
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text, style='List Bullet')
            elif element.name == 'table':
                # Simplified table handling: extract the text of each row
                for row in element.find_all('tr'):
                    cells = row.find_all(['td', 'th'])
                    row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                    if row_text.strip():
                        doc.add_paragraph(row_text)
            else:
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

    def crawl_page(self, url: str) -> dict | None:
        """
        Crawl a single page.

        Args:
            url: page URL

        Returns:
            page data dictionary, or None on failure
        """
        soup = self.fetch_page(url)
        if not soup:
            return None
        # Extract the title
        title = self.extract_title(soup, url)
        # Extract the main content
        content = self.extract_content(soup)
        if not content:
            print(f"  Warning: main content area not found on page: {url}")
            return None
        # Clean the content
        content = self.clean_content(content)
        # Process the images
        images = self.process_images(content, url)
        # Convert to Markdown
        markdown = self.content_to_markdown(content)
        return {
            "url": url,
            "title": title,
            "content": content,
            "markdown": markdown,
            "images": images,
        }

    def save_single_page(self, page_data: dict):
        """
        Save a single page as standalone .md and .docx files.

        Args:
            page_data: page data dictionary
        """
        title = page_data["title"]
        safe_title = safe_filename(title)
        # Save the Markdown file
        md_path = os.path.join(self.output_dir, f"{safe_title}.md")
        md_content = f"# {title}\n\n"
        md_content += f"**Source link**: {page_data['url']}\n\n"
        md_content += page_data["markdown"]
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md_content)
        # Save the Word document
        docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
        doc = Document()
        doc.add_heading(title, 0)
        p = doc.add_paragraph()
        p.add_run(f"Source link: {page_data['url']}").italic = True
        self.add_content_to_docx(doc, page_data["content"], self.output_dir)
        doc.save(docx_path)

    def save_combined_documents(self, all_pages: list[dict]):
        """
        Save all pages combined into a single .md and a single .docx file.

        Args:
            all_pages: list of all page data
        """
        if not all_pages:
            return
        safe_name = safe_filename(self.name)
        # === Build the combined Markdown ===
        combined_md = f"# {self.name} - Complete Collection\n\n"
        combined_md += f"**Generated at**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        combined_md += f"This document compiles all {self.name} content from the 零差云控 official website, {len(all_pages)} articles in total.\n\n"
        combined_md += "---\n\n"
        # Append each article
        for page in all_pages:
            combined_md += f"## {page['title']}\n\n"
            combined_md += f"**Source link**: {page['url']}\n\n"
            combined_md += page["markdown"]
            combined_md += "\n\n---\n\n"
        md_path = os.path.join(self.output_dir, f"{safe_name}_combined.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(combined_md)
        print(f"  Combined Markdown: {md_path}")
        # === Build the combined Word document ===
        doc = Document()
        doc.add_heading(f'{self.name} - Complete Collection', 0)
        intro = doc.add_paragraph()
        intro.add_run(f"Generated at: {time.strftime('%Y-%m-%d %H:%M:%S')}").italic = True
        doc.add_paragraph(f"This document compiles all {self.name} content from the 零差云控 official website, {len(all_pages)} articles in total.")
        doc.add_page_break()
        # Append each article
        for page in all_pages:
            doc.add_heading(page["title"], level=1)
            p = doc.add_paragraph()
            p.add_run(f"Source link: {page['url']}").italic = True
            self.add_content_to_docx(doc, page["content"], self.output_dir)
            doc.add_page_break()
        docx_path = os.path.join(self.output_dir, f"{safe_name}_combined.docx")
        doc.save(docx_path)
        print(f"  Combined Word document: {docx_path}")

    def run(self):
        """
        Execute the crawling task.
        """
        print(f"\n{'='*60}")
        print(f"Starting crawl: {self.name}")
        print(f"{'='*60}")
        # Collect the page links
        if "static_pages" in self.config:
            # Static page list
            links = [make_absolute_url(BASE_URL, p) for p in self.config["static_pages"]]
        elif "index_url" in self.config:
            # Extract links from an index page
            links = self.get_links_from_index(self.config["index_url"])
        else:
            print("Error: neither static_pages nor index_url is specified in the config")
            return
        if not links:
            print("No links found, skipping this task")
            return
        # Crawl each page
        all_pages = []  # data for every successfully crawled page
        for i, url in enumerate(links):
            print(f"[{i+1}/{len(links)}] Fetching: {url}")
            page_data = self.crawl_page(url)
            if page_data:
                all_pages.append(page_data)
            # Delay between requests
            time.sleep(REQUEST_DELAY)
        # Generate the combined documents
        if all_pages:
            print(f"\nGenerating combined documents ({len(all_pages)} articles)...")
            self.save_combined_documents(all_pages)
        print(f"\n{self.name} crawl finished! Succeeded: {len(all_pages)}/{len(links)}")
        print(f"Output directory: {self.output_dir}")


class StandardCrawler(BaseCrawler):
    """
    Standard crawler class.
    Suitable for most page types.
    """
    pass
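

# Hypothetical usage sketch (assumption, not part of the original file): shows
# how a task dict might be fed to StandardCrawler. The key names mirror the
# self.config.get(...) accesses in BaseCrawler; the concrete values are
# invented for illustration only.
def _example_run():
    example_task = {
        "name": "news",
        "index_url": "/news/index.html",
        "link_pattern": "/news/",
        "content_selector": "div.news_content",
    }
    # Crawls every matching page, downloads images, and writes the combined
    # Markdown and Word documents into output/news/.
    StandardCrawler(example_task).run()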