""" 基础爬虫类 提供通用的爬取逻辑 """ import os import time import copy import requests from bs4 import BeautifulSoup import markdownify from docx import Document from docx.shared import Inches from docx.enum.text import WD_ALIGN_PARAGRAPH from urllib.parse import urljoin from abc import ABC, abstractmethod from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR from .utils import ensure_dir, download_image, safe_filename, make_absolute_url class BaseCrawler(ABC): """ 基础爬虫类 提供通用的页面获取、内容提取、文档生成功能 """ def __init__(self, task_config: dict): """ 初始化爬虫 Args: task_config: 任务配置字典 """ self.config = task_config self.name = task_config.get("name", "未命名") self.session = requests.Session() self.session.headers.update(HEADERS) # 输出目录 self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(self.name)) self.images_dir = os.path.join(self.output_dir, "images") ensure_dir(self.output_dir) ensure_dir(self.images_dir) def fetch_page(self, url: str) -> BeautifulSoup | None: """ 获取页面内容 Args: url: 页面URL Returns: BeautifulSoup 对象,失败返回 None """ try: response = self.session.get(url, timeout=30) response.encoding = 'utf-8' return BeautifulSoup(response.text, 'html.parser') except Exception as e: print(f" 获取页面失败: {url} - {e}") return None def get_links_from_index(self, index_url: str) -> list[str]: """ 从索引页提取子页面链接 Args: index_url: 索引页URL Returns: 链接列表 """ full_url = make_absolute_url(BASE_URL, index_url) print(f"正在从索引页提取链接: {full_url}") soup = self.fetch_page(full_url) if not soup: return [] link_pattern = self.config.get("link_pattern", "") link_suffix = self.config.get("link_suffix", ".html") exclude_patterns = self.config.get("exclude_patterns", []) links = [] for a in soup.find_all('a', href=True): href = a['href'] # 检查是否匹配模式 if link_pattern and link_pattern not in href: continue if link_suffix and not href.endswith(link_suffix): continue # 检查是否需要排除 excluded = False for pattern in exclude_patterns: if pattern in href: excluded = True break if excluded: continue # 转为绝对URL full_link = make_absolute_url(full_url, href) if full_link not in links: links.append(full_link) print(f"共发现 {len(links)} 个页面链接") return links def extract_title(self, soup: BeautifulSoup, url: str) -> str: """ 提取页面标题 Args: soup: BeautifulSoup 对象 url: 页面URL(用于生成默认标题) Returns: 标题文本 """ selector = self.config.get("title_selector", "h1") index = self.config.get("title_index", 0) tags = soup.find_all(selector) if tags and len(tags) > index: return tags[index].get_text(strip=True) elif tags: return tags[0].get_text(strip=True) else: # 使用URL最后一部分作为标题 return url.split('/')[-1].replace('.html', '') def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None: """ 提取页面主内容 Args: soup: BeautifulSoup 对象 Returns: 内容区域的 BeautifulSoup 对象,未找到返回 None """ selector = self.config.get("content_selector", "") # 支持多个选择器,用逗号分隔 selectors = [s.strip() for s in selector.split(',')] # 收集所有匹配的内容 all_contents = [] for sel in selectors: if '.' 
in sel: # class 选择器 tag, class_name = sel.split('.', 1) tag = tag if tag else 'div' content = soup.find(tag, class_=class_name) else: content = soup.find(sel) if content: all_contents.append(content) # 如果没有找到任何内容 if not all_contents: return None # 如果只找到一个,直接返回 if len(all_contents) == 1: return all_contents[0] # 合并多个内容区域到一个容器 combined = soup.new_tag('div') for content in all_contents: # 深拷贝内容以避免从原DOM中移除 combined.append(copy.deepcopy(content)) return combined def clean_content(self, content: BeautifulSoup) -> BeautifulSoup: """ 清洗内容,移除无用元素 Args: content: 内容区域 Returns: 清洗后的内容 """ # 移除 script 和 style 标签 for tag in content(['script', 'style']): tag.decompose() return content def process_images(self, content: BeautifulSoup, page_url: str) -> list[tuple[str, str]]: """ 处理内容中的图片,下载到本地 Args: content: 内容区域 page_url: 页面URL(用于解析相对路径) Returns: 图片信息列表 [(原URL, 本地路径), ...] """ images_info = [] for img in content.find_all('img'): src = img.get('src') if not src: continue # 转为绝对URL full_url = make_absolute_url(page_url, src) # 下载图片 local_path = download_image(full_url, self.images_dir) if local_path: images_info.append((full_url, local_path)) # 更新 img 标签的 src 为本地相对路径 img['src'] = os.path.relpath(local_path, self.output_dir).replace('\\', '/') else: # 下载失败,保留原URL img['src'] = full_url return images_info def content_to_markdown(self, content: BeautifulSoup) -> str: """ 将内容转换为 Markdown Args: content: 内容区域 Returns: Markdown 文本 """ return markdownify.markdownify(str(content), heading_style="ATX") def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str): """ 将内容添加到 Word 文档 Args: doc: Document 对象 content: 内容区域 output_dir: 输出目录(用于解析图片路径) """ for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table']): if element.name == 'img': src = element.get('src', '') # 尝试获取本地图片路径 if not src.startswith('http'): local_path = os.path.join(output_dir, src) else: local_path = src if os.path.exists(local_path): try: doc.add_picture(local_path, width=Inches(5)) doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER except Exception as e: print(f" Word插入图片失败: {local_path} - {e}") elif element.name.startswith('h'): text = element.get_text(strip=True) if text: level = int(element.name[1]) doc.add_heading(text, level=min(level + 1, 9)) elif element.name == 'li': text = element.get_text(strip=True) if text: doc.add_paragraph(text, style='List Bullet') elif element.name == 'table': # 简单处理表格,提取文本 for row in element.find_all('tr'): cells = row.find_all(['td', 'th']) row_text = ' | '.join([cell.get_text(strip=True) for cell in cells]) if row_text.strip(): doc.add_paragraph(row_text) else: text = element.get_text(strip=True) if text: doc.add_paragraph(text) def crawl_page(self, url: str) -> dict | None: """ 爬取单个页面 Args: url: 页面URL Returns: 页面数据字典,失败返回 None """ soup = self.fetch_page(url) if not soup: return None # 提取标题 title = self.extract_title(soup, url) # 提取内容 content = self.extract_content(soup) if not content: print(f" 警告: 页面未找到主内容区域: {url}") return None # 清洗内容 content = self.clean_content(content) # 处理图片 images = self.process_images(content, url) # 转换为 Markdown markdown = self.content_to_markdown(content) return { "url": url, "title": title, "content": content, "markdown": markdown, "images": images, } def save_single_page(self, page_data: dict): """ 保存单个页面为独立的 md 和 docx 文件 Args: page_data: 页面数据字典 """ title = page_data["title"] safe_title = safe_filename(title) # 保存 Markdown md_path = os.path.join(self.output_dir, f"{safe_title}.md") md_content = f"# {title}\n\n" 
md_content += f"**原文链接**: {page_data['url']}\n\n" md_content += page_data["markdown"] with open(md_path, "w", encoding="utf-8") as f: f.write(md_content) # 保存 Word docx_path = os.path.join(self.output_dir, f"{safe_title}.docx") doc = Document() doc.add_heading(title, 0) p = doc.add_paragraph() p.add_run(f"原文链接: {page_data['url']}").italic = True self.add_content_to_docx(doc, page_data["content"], self.output_dir) doc.save(docx_path) def save_combined_documents(self, all_pages: list[dict]): """ 将所有页面汇总保存为一个 md 和 docx 文件 Args: all_pages: 所有页面数据列表 """ if not all_pages: return safe_name = safe_filename(self.name) # === 生成汇总 Markdown === combined_md = f"# {self.name}全集\n\n" combined_md += f"**生成时间**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n" combined_md += f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。\n\n" combined_md += "---\n\n" # 添加每篇内容 for page in all_pages: combined_md += f"## {page['title']}\n\n" combined_md += f"**原文链接**: {page['url']}\n\n" combined_md += page["markdown"] combined_md += "\n\n---\n\n" md_path = os.path.join(self.output_dir, f"{safe_name}_汇总.md") with open(md_path, "w", encoding="utf-8") as f: f.write(combined_md) print(f" 汇总 Markdown: {md_path}") # === 生成汇总 Word 文档 === doc = Document() doc.add_heading(f'{self.name}全集', 0) intro = doc.add_paragraph() intro.add_run(f"生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}").italic = True doc.add_paragraph(f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。") doc.add_page_break() # 添加每篇内容 for page in all_pages: doc.add_heading(page["title"], level=1) p = doc.add_paragraph() p.add_run(f"原文链接: {page['url']}").italic = True self.add_content_to_docx(doc, page["content"], self.output_dir) doc.add_page_break() docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx") doc.save(docx_path) print(f" 汇总 Word: {docx_path}") def run(self): """ 执行爬取任务 """ print(f"\n{'='*60}") print(f"开始爬取: {self.name}") print(f"{'='*60}") # 获取页面链接 if "static_pages" in self.config: # 静态页面列表 links = [make_absolute_url(BASE_URL, p) for p in self.config["static_pages"]] elif "index_url" in self.config: # 从索引页提取 links = self.get_links_from_index(self.config["index_url"]) else: print("错误: 配置中未指定 static_pages 或 index_url") return if not links: print("未获取到链接,跳过此任务") return # 爬取每个页面 all_pages = [] # 存储所有成功爬取的页面数据 for i, url in enumerate(links): print(f"[{i+1}/{len(links)}] 正在抓取: {url}") page_data = self.crawl_page(url) if page_data: all_pages.append(page_data) # 请求延迟 time.sleep(REQUEST_DELAY) # 生成汇总文档 if all_pages: print(f"\n正在生成汇总文档(共 {len(all_pages)} 篇)...") self.save_combined_documents(all_pages) print(f"\n{self.name} 爬取完成!成功: {len(all_pages)}/{len(links)}") print(f"输出目录: {self.output_dir}") class StandardCrawler(BaseCrawler): """ 标准爬虫类 适用于大多数页面类型 """ pass
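# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The task_config keys below mirror the ones
# this class reads via self.config.get(...); the task name, URLs, and selector
# values are hypothetical placeholders, not values taken from the real config.
# Run with `python -m <package>.<module>` so the relative imports resolve.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo_task = {
        "name": "新闻动态",                  # also used for the output folder name
        "index_url": "/news/index.html",     # or provide "static_pages": [...] instead
        "link_pattern": "/news/",            # candidate hrefs must contain this substring
        "link_suffix": ".html",              # candidate hrefs must end with this suffix
        "exclude_patterns": ["index"],       # skip hrefs containing any of these
        "title_selector": "h1",              # tag searched by extract_title()
        "title_index": 0,                    # which matching tag to use as the title
        "content_selector": "div.article-content, div.news-detail",  # comma-separated
    }
    StandardCrawler(demo_task).run()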