Initial commit: crawler project for the ZeroErr (零差云控) official website
zeroerr_crawler/__init__.py (Normal file, 3 lines)
@@ -0,0 +1,3 @@
# ZeroErr (零差云控) official website crawler
# Crawls site content to generate RAGFlow knowledge-base documents
zeroerr_crawler/base_crawler.py (Normal file, 460 lines)
@@ -0,0 +1,460 @@
"""
Base crawler class.
Provides the shared crawling logic.
"""

import os
import time
import copy
import requests
from bs4 import BeautifulSoup
import markdownify
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from urllib.parse import urljoin
from abc import ABC, abstractmethod

from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR
from .utils import ensure_dir, download_image, safe_filename, make_absolute_url


class BaseCrawler(ABC):
    """
    Base crawler class.
    Provides shared page fetching, content extraction and document generation.
    """

    def __init__(self, task_config: dict):
        """
        Initialise the crawler.

        Args:
            task_config: task configuration dictionary
        """
        self.config = task_config
        self.name = task_config.get("name", "未命名")
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

        # Output directories
        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(self.name))
        self.images_dir = os.path.join(self.output_dir, "images")
        ensure_dir(self.output_dir)
        ensure_dir(self.images_dir)

    def fetch_page(self, url: str) -> BeautifulSoup | None:
        """
        Fetch a page.

        Args:
            url: page URL

        Returns:
            BeautifulSoup object, or None on failure
        """
        try:
            response = self.session.get(url, timeout=30)
            response.encoding = 'utf-8'
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f" 获取页面失败: {url} - {e}")
            return None

    def get_links_from_index(self, index_url: str) -> list[str]:
        """
        Extract sub-page links from an index page.

        Args:
            index_url: index page URL

        Returns:
            list of links
        """
        full_url = make_absolute_url(BASE_URL, index_url)
        print(f"正在从索引页提取链接: {full_url}")

        soup = self.fetch_page(full_url)
        if not soup:
            return []

        link_pattern = self.config.get("link_pattern", "")
        link_suffix = self.config.get("link_suffix", ".html")
        exclude_patterns = self.config.get("exclude_patterns", [])

        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']

            # Check whether the link matches the configured pattern
            if link_pattern and link_pattern not in href:
                continue
            if link_suffix and not href.endswith(link_suffix):
                continue

            # Check whether the link should be excluded
            excluded = False
            for pattern in exclude_patterns:
                if pattern in href:
                    excluded = True
                    break
            if excluded:
                continue

            # Convert to an absolute URL
            full_link = make_absolute_url(full_url, href)
            if full_link not in links:
                links.append(full_link)

        print(f"共发现 {len(links)} 个页面链接")
        return links

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the page title.

        Args:
            soup: BeautifulSoup object
            url: page URL (used for a fallback title)

        Returns:
            title text
        """
        selector = self.config.get("title_selector", "h1")
        index = self.config.get("title_index", 0)

        # The selector may hold several comma-separated tag names (e.g. "h1,h2"),
        # so pass them to find_all() as a list of names.
        tags = soup.find_all([s.strip() for s in selector.split(',')])
        if tags and len(tags) > index:
            return tags[index].get_text(strip=True)
        elif tags:
            return tags[0].get_text(strip=True)
        else:
            # Fall back to the last URL segment
            return url.split('/')[-1].replace('.html', '')

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main page content.

        Args:
            soup: BeautifulSoup object

        Returns:
            BeautifulSoup object for the content area, or None if not found
        """
        selector = self.config.get("content_selector", "")

        # Multiple selectors are supported, separated by commas
        selectors = [s.strip() for s in selector.split(',')]

        # Collect every matching content block
        all_contents = []

        for sel in selectors:
            if '.' in sel:
                # class selector
                tag, class_name = sel.split('.', 1)
                tag = tag if tag else 'div'
                content = soup.find(tag, class_=class_name)
            else:
                content = soup.find(sel)

            if content:
                all_contents.append(content)

        # Nothing matched
        if not all_contents:
            return None

        # Only one match: return it directly
        if len(all_contents) == 1:
            return all_contents[0]

        # Merge multiple content blocks into a single container
        combined = soup.new_tag('div')
        for content in all_contents:
            # Deep-copy so the original DOM is left untouched
            combined.append(copy.deepcopy(content))

        return combined

    def clean_content(self, content: BeautifulSoup) -> BeautifulSoup:
        """
        Clean the content by removing useless elements.

        Args:
            content: content area

        Returns:
            cleaned content
        """
        # Remove script and style tags
        for tag in content(['script', 'style']):
            tag.decompose()

        return content

    def process_images(self, content: BeautifulSoup, page_url: str) -> list[tuple[str, str]]:
        """
        Process the images in the content and download them locally.

        Args:
            content: content area
            page_url: page URL (used to resolve relative paths)

        Returns:
            list of image info tuples [(original URL, local path), ...]
        """
        images_info = []

        for img in content.find_all('img'):
            src = img.get('src')
            if not src:
                continue

            # Convert to an absolute URL
            full_url = make_absolute_url(page_url, src)

            # Download the image
            local_path = download_image(full_url, self.images_dir)

            if local_path:
                images_info.append((full_url, local_path))
                # Point the img tag at the local relative path
                img['src'] = os.path.relpath(local_path, self.output_dir).replace('\\', '/')
            else:
                # Download failed: keep the original URL
                img['src'] = full_url

        return images_info

    def content_to_markdown(self, content: BeautifulSoup) -> str:
        """
        Convert the content to Markdown.

        Args:
            content: content area

        Returns:
            Markdown text
        """
        return markdownify.markdownify(str(content), heading_style="ATX")

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
        """
        Add the content to a Word document.

        Args:
            doc: Document object
            content: content area
            output_dir: output directory (used to resolve image paths)
        """
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table']):
            if element.name == 'img':
                src = element.get('src', '')
                # Try to resolve a local image path
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src

                if os.path.exists(local_path):
                    try:
                        doc.add_picture(local_path, width=Inches(5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                    except Exception as e:
                        print(f" Word插入图片失败: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text:
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level + 1, 9))
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text, style='List Bullet')
            elif element.name == 'table':
                # Simple table handling: extract the text only
                for row in element.find_all('tr'):
                    cells = row.find_all(['td', 'th'])
                    row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                    if row_text.strip():
                        doc.add_paragraph(row_text)
            else:
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

    def crawl_page(self, url: str) -> dict | None:
        """
        Crawl a single page.

        Args:
            url: page URL

        Returns:
            page data dictionary, or None on failure
        """
        soup = self.fetch_page(url)
        if not soup:
            return None

        # Extract the title
        title = self.extract_title(soup, url)

        # Extract the content
        content = self.extract_content(soup)
        if not content:
            print(f" 警告: 页面未找到主内容区域: {url}")
            return None

        # Clean the content
        content = self.clean_content(content)

        # Process images
        images = self.process_images(content, url)

        # Convert to Markdown
        markdown = self.content_to_markdown(content)

        return {
            "url": url,
            "title": title,
            "content": content,
            "markdown": markdown,
            "images": images,
        }

    def save_single_page(self, page_data: dict):
        """
        Save a single page as standalone md and docx files.

        Args:
            page_data: page data dictionary
        """
        title = page_data["title"]
        safe_title = safe_filename(title)

        # Save Markdown
        md_path = os.path.join(self.output_dir, f"{safe_title}.md")
        md_content = f"# {title}\n\n"
        md_content += f"**原文链接**: {page_data['url']}\n\n"
        md_content += page_data["markdown"]

        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md_content)

        # Save Word
        docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
        doc = Document()
        doc.add_heading(title, 0)
        p = doc.add_paragraph()
        p.add_run(f"原文链接: {page_data['url']}").italic = True

        self.add_content_to_docx(doc, page_data["content"], self.output_dir)
        doc.save(docx_path)

    def save_combined_documents(self, all_pages: list[dict]):
        """
        Save all pages combined into one md and one docx file.

        Args:
            all_pages: list of all page data dictionaries
        """
        if not all_pages:
            return

        safe_name = safe_filename(self.name)

        # === Generate the combined Markdown ===
        combined_md = f"# {self.name}全集\n\n"
        combined_md += f"**生成时间**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        combined_md += f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。\n\n"
        combined_md += "---\n\n"

        # Append every page
        for page in all_pages:
            combined_md += f"## {page['title']}\n\n"
            combined_md += f"**原文链接**: {page['url']}\n\n"
            combined_md += page["markdown"]
            combined_md += "\n\n---\n\n"

        md_path = os.path.join(self.output_dir, f"{safe_name}_汇总.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(combined_md)
        print(f" 汇总 Markdown: {md_path}")

        # === Generate the combined Word document ===
        doc = Document()
        doc.add_heading(f'{self.name}全集', 0)

        intro = doc.add_paragraph()
        intro.add_run(f"生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}").italic = True
        doc.add_paragraph(f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。")
        doc.add_page_break()

        # Append every page
        for page in all_pages:
            doc.add_heading(page["title"], level=1)
            p = doc.add_paragraph()
            p.add_run(f"原文链接: {page['url']}").italic = True

            self.add_content_to_docx(doc, page["content"], self.output_dir)
            doc.add_page_break()

        docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx")
        doc.save(docx_path)
        print(f" 汇总 Word: {docx_path}")

    def run(self):
        """
        Run the crawl task.
        """
        print(f"\n{'='*60}")
        print(f"开始爬取: {self.name}")
        print(f"{'='*60}")

        # Collect page links
        if "static_pages" in self.config:
            # Static page list
            links = [make_absolute_url(BASE_URL, p) for p in self.config["static_pages"]]
        elif "index_url" in self.config:
            # Extract from the index page
            links = self.get_links_from_index(self.config["index_url"])
        else:
            print("错误: 配置中未指定 static_pages 或 index_url")
            return

        if not links:
            print("未获取到链接,跳过此任务")
            return

        # Crawl each page
        all_pages = []  # data of every successfully crawled page

        for i, url in enumerate(links):
            print(f"[{i+1}/{len(links)}] 正在抓取: {url}")

            page_data = self.crawl_page(url)
            if page_data:
                all_pages.append(page_data)

            # Delay between requests
            time.sleep(REQUEST_DELAY)

        # Generate the combined documents
        if all_pages:
            print(f"\n正在生成汇总文档(共 {len(all_pages)} 篇)...")
            self.save_combined_documents(all_pages)

        print(f"\n{self.name} 爬取完成!成功: {len(all_pages)}/{len(links)}")
        print(f"输出目录: {self.output_dir}")


class StandardCrawler(BaseCrawler):
    """
    Standard crawler.
    Suitable for most page types.
    """
    pass
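Editor's note: BaseCrawler is the extension point; subclasses reuse the shared fetch/convert/save pipeline and override extract_content, extract_title or clean_content where a section needs it. A minimal sketch of such an override (not part of this commit; the class name and the extra nav/footer cleanup targets are illustrative assumptions):

from bs4 import BeautifulSoup
from zeroerr_crawler.base_crawler import BaseCrawler

class NoNavCrawler(BaseCrawler):
    """Hypothetical subclass that also strips navigation and footer blocks."""

    def clean_content(self, content: BeautifulSoup) -> BeautifulSoup:
        content = super().clean_content(content)   # removes script/style tags as above
        for tag in content(['nav', 'footer']):     # assumed extra cleanup targets
            tag.decompose()
        return content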
zeroerr_crawler/config.py (Normal file, 133 lines)
@@ -0,0 +1,133 @@
"""
Crawler configuration.
Defines the configuration for every crawl task.
"""

BASE_URL = "https://www.zeroerr.cn"

# Request headers
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Delay between requests (seconds), to go easy on the server
REQUEST_DELAY = 0.5

# Output directory
OUTPUT_DIR = "output"

# Crawl task configuration.
# Each task defines: name, index page, link filtering rules, content selector and title selector.
CRAWL_TASKS = {
    # Application cases
    "case": {
        "name": "应用案例",
        "index_url": "/case/index.html",
        "link_pattern": "/case/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,  # use the second h1 tag
    },
    # FAQ
    "issue": {
        "name": "常见问题",
        "index_url": "/issue/index.html",
        "link_pattern": "/issue/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Company news
    "news": {
        "name": "企业新闻",
        "index_url": "/news/index.html",
        "link_pattern": "/news/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Certifications and qualifications
    "certification": {
        "name": "认证与资质",
        "index_url": "/Certification/index.html",
        "link_pattern": "/Certification/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Robot joint products
    "erob": {
        "name": "机器人关节",
        "index_url": "/eRob/index.html",
        "link_pattern": "/eRob/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.product_text_l,div.product_text",  # left column / whole content area of product pages
        "title_selector": "h1",
        "title_index": 0,
    },
    # Encoder products
    "ecoder": {
        "name": "编码器",
        "index_url": "/eCoder/index.html",
        "link_pattern": "/eCoder/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.product_text_l,div.product_text",  # left column / whole content area of product pages
        "title_selector": "h1",
        "title_index": 0,
    },
    # Accessories
    "tools": {
        "name": "配件",
        "index_url": "/Tools/index.html",
        "link_pattern": "/Tools/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.product_text_l,div.product_text_l1,div.product_text,div.news_text_p,div.eLiner_banner,div.web_cable_container",  # several layouts
        "title_selector": "h1,h2",  # some pages use h2 for the title
        "title_index": 0,
    },
    # "About us" and other static pages
    "about": {
        "name": "关于我们",
        "static_pages": [
            "/about/about-us.html",
            "/about/contact-us.html",
            "/about/join-us.html",
            "/about/152.html",  # agent recruitment
        ],
        "content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact",  # multi-block layout
        "title_selector": "h1,h2",
        "title_index": 0,
    },
    # Service and support (single page, crawled directly)
    "support": {
        "name": "服务与支持",
        "static_pages": [
            "/support/",  # the main page holds all the content
        ],
        "content_selector": "div.sidebar_container,div.content,div.content-section,div.news_text_p",
        "title_selector": "h2",
        "title_index": 0,
    },
    # Downloads (static page)
    "download": {
        "name": "资料下载",
        "static_pages": [
            "/download/77.html",  # download instructions page
        ],
        "content_selector": "div.news_text_p,div.news_text",
        "title_selector": "h1",
        "title_index": 0,
    },
}
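Editor's note: each CRAWL_TASKS entry is consumed by BaseCrawler.run(): index_url/static_pages choose how links are gathered, link_pattern/link_suffix/exclude_patterns filter them, and content_selector/title_selector/title_index drive extraction. This commit does not include an entry-point script; a minimal driver consistent with these configs might look like the sketch below (PRODUCT_TASKS and main() are assumptions, not part of the commit):

# Hypothetical driver: run every configured task, using ProductCrawler
# for the product sections and StandardCrawler for everything else.
from zeroerr_crawler.config import CRAWL_TASKS
from zeroerr_crawler.base_crawler import StandardCrawler
from zeroerr_crawler.product_crawler import ProductCrawler

PRODUCT_TASKS = {"erob", "ecoder"}  # assumption: product pages use the specialised crawler

def main() -> None:
    for key, task_config in CRAWL_TASKS.items():
        crawler_cls = ProductCrawler if key in PRODUCT_TASKS else StandardCrawler
        crawler_cls(task_config).run()

if __name__ == "__main__":
    main()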
zeroerr_crawler/product_crawler.py (Normal file, 152 lines)
@@ -0,0 +1,152 @@
"""
Product page crawler.
Dedicated handling for eRob robot joint and eCoder encoder product detail pages.
"""

from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
import os

from .base_crawler import BaseCrawler
from .utils import safe_filename


class ProductCrawler(BaseCrawler):
    """
    Product page crawler.
    Special handling for eRob and eCoder product pages.
    """

    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
        Extract the main content of a product page.
        Product pages have a more complex structure and need special handling.
        """
        # Try several selectors in turn
        selectors = [
            ('div', 'eRob_page_right'),    # right-hand content on eRob pages
            ('div', 'eCoder_page_main'),   # main content on eCoder pages
            ('div', 'product_page_main'),  # generic product pages
            ('div', 'news_text_p'),        # news-style layout
        ]

        for tag, class_name in selectors:
            content = soup.find(tag, class_=class_name)
            if content:
                return content

        # Nothing matched: fall back to the configured selectors
        return super().extract_content(soup)

    def extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the title of a product page.
        The title can live in different places on product pages.
        """
        # Try the first h1 after the breadcrumb navigation
        h1_tags = soup.find_all('h1')
        for h1 in h1_tags:
            text = h1.get_text(strip=True)
            # Skip the site name
            if '零差云控' in text or '零误差' in text:
                continue
            if text:
                return text

        # Fall back to the URL
        return url.split('/')[-1].replace('.html', '')

    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
        """
        Add product content to a Word document.
        Optimised handling for tables and other product-page elements.
        """
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
            # Skip nested elements
            if element.find_parent(['table', 'li']):
                continue

            if element.name == 'img':
                src = element.get('src', '')
                if not src.startswith('http'):
                    local_path = os.path.join(output_dir, src)
                else:
                    local_path = src

                if os.path.exists(local_path):
                    try:
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                    except Exception as e:
                        print(f" Word插入图片失败: {local_path} - {e}")

            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    level = int(element.name[1])
                    doc.add_heading(text, level=min(level + 1, 9))

            elif element.name == 'table':
                # Handle tables
                self._add_table_to_docx(doc, element)

            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(f"• {text}")

            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    doc.add_paragraph(text)

            elif element.name == 'div':
                # Handle special parameter div blocks
                if element.get('class') and any('param' in c for c in element.get('class', [])):
                    text = element.get_text(strip=True)
                    if text:
                        doc.add_paragraph(text)

    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
        """
        Add an HTML table to the Word document.

        Args:
            doc: Document object
            table_element: table element
        """
        rows = table_element.find_all('tr')
        if not rows:
            return

        # Determine the maximum column count
        max_cols = 0
        for row in rows:
            cells = row.find_all(['td', 'th'])
            max_cols = max(max_cols, len(cells))

        if max_cols == 0:
            return

        # Create the Word table
        try:
            word_table = doc.add_table(rows=len(rows), cols=max_cols)
            word_table.style = 'Table Grid'

            for i, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                for j, cell in enumerate(cells):
                    if j < max_cols:
                        text = cell.get_text(strip=True)
                        word_table.rows[i].cells[j].text = text
        except Exception as e:
            # If table creation fails, fall back to plain text
            print(f" 表格创建失败,降级为文本: {e}")
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
                if row_text.strip():
                    doc.add_paragraph(row_text)
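Editor's note: for ad-hoc debugging of one product page, the crawl_page()/save_single_page() pair inherited from BaseCrawler can be called directly. A sketch (the page URL is illustrative, not taken from the site map):

from zeroerr_crawler.config import CRAWL_TASKS
from zeroerr_crawler.product_crawler import ProductCrawler

crawler = ProductCrawler(CRAWL_TASKS["erob"])
page = crawler.crawl_page("https://www.zeroerr.cn/eRob/example.html")  # illustrative URL
if page:
    crawler.save_single_page(page)  # writes <title>.md and <title>.docx under output/机器人关节/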
zeroerr_crawler/utils.py (Normal file, 100 lines)
@@ -0,0 +1,100 @@
"""
Utility functions.
Shared helper functionality.
"""

import os
import hashlib
import requests
from urllib.parse import urljoin
from .config import HEADERS


def ensure_dir(path: str) -> None:
    """Make sure the directory exists, creating it if necessary."""
    os.makedirs(path, exist_ok=True)


def get_file_hash(url: str) -> str:
    """Generate a unique filename hash from a URL."""
    return hashlib.md5(url.encode()).hexdigest()[:12]


def get_file_extension(url: str) -> str:
    """Get the file extension from a URL."""
    # Drop query parameters
    clean_url = url.split('?')[0]
    ext = os.path.splitext(clean_url)[1].lower()
    if ext not in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg']:
        ext = '.jpg'  # default extension
    return ext


def download_image(img_url: str, save_dir: str, timeout: int = 15) -> str | None:
    """
    Download an image to local disk.

    Args:
        img_url: image URL
        save_dir: directory to save into
        timeout: request timeout

    Returns:
        local file path, or None on failure
    """
    try:
        ensure_dir(save_dir)

        url_hash = get_file_hash(img_url)
        ext = get_file_extension(img_url)
        local_filename = f"{url_hash}{ext}"
        local_path = os.path.join(save_dir, local_filename)

        # If already downloaded, return the existing path
        if os.path.exists(local_path):
            return local_path

        # Download the image
        response = requests.get(img_url, headers=HEADERS, timeout=timeout)
        if response.status_code == 200:
            with open(local_path, 'wb') as f:
                f.write(response.content)
            return local_path
        else:
            print(f" 图片下载失败 ({response.status_code}): {img_url}")
            return None
    except Exception as e:
        print(f" 图片下载出错: {img_url} - {e}")
        return None


def safe_filename(name: str, max_length: int = 50) -> str:
    """
    Build a filesystem-safe filename.

    Args:
        name: original name
        max_length: maximum length

    Returns:
        safe filename
    """
    # Remove or replace unsafe characters
    unsafe_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '\n', '\r', '\t']
    for char in unsafe_chars:
        name = name.replace(char, '_')

    # Strip leading/trailing whitespace
    name = name.strip()

    # Truncate to the maximum length
    if len(name) > max_length:
        name = name[:max_length]

    return name


def make_absolute_url(base_url: str, relative_url: str) -> str:
    """Convert a relative URL into an absolute one."""
    return urljoin(base_url, relative_url)
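Editor's note: a few illustrative calls showing how these helpers behave (the input strings are made up; the results in the comments follow directly from the code above):

from zeroerr_crawler.utils import safe_filename, get_file_extension, make_absolute_url

safe_filename('eRob 70:V2?')              # -> 'eRob 70_V2_' (':' and '?' replaced by '_')
get_file_extension('/img/photo.PNG?v=3')  # -> '.png' (query string dropped, lower-cased)
make_absolute_url('https://www.zeroerr.cn/case/index.html', '../news/1.html')
                                          # -> 'https://www.zeroerr.cn/news/1.html'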