更新爬虫方案文档，增加服务与支持-详细页面的输出信息；优化基础爬虫类，增强标题提取和内容去重逻辑；根据doc2md.py调整图片处理逻辑以改善Word文档生成效果。

2026-01-31 09:30:33 +08:00
parent 3670129972
commit 3c625d1c3a
5 changed files with 332 additions and 58 deletions
--- a/1_零差云控官网爬虫方案.md
+++ b/1_零差云控官网爬虫方案.md
@@ -27,14 +27,17 @@ crawl/
 | 常见问题 | `python main.py issue` | 32篇 | ✅ 完成 |
 | 企业新闻 | `python main.py news` | 11篇 | ✅ 完成 |
 | 认证与资质 | `python main.py certification` | 10篇 | ✅ 完成 |
-| 机器人关节 | `python main.py erob` | 11篇 | ✅ 完成 |
+| 机器人关节 | `python main.py erob` | 12篇 | ✅ 完成 |
 | 编码器 | `python main.py ecoder` | 7篇 | ✅ 完成 |
 | 配件 | `python main.py tools` | 13篇 | ✅ 完成 |
 | 关于我们 | `python main.py about` | 2篇 | ✅ 完成 |
 | 服务与支持 | `python main.py support` | 1篇 | ✅ 完成 |
 | 服务与支持-详细页面 | `python main.py service_detail` | 7篇 | ✅ 完成 |
 | 资料下载 | `python main.py download` | 1篇 | ✅ 完成 |
-**总计: 102 篇文章**
+**总计: 110 篇文章**
 > 注：服务与支持和服务与支持-详细页面合并输出到同一个目录 `output/服务与支持/`
 ## 使用方法
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,19 @@ markdownify>=0.11.0
 python-docx>=0.8.11
 lxml>=4.9.0
 # doc2md.py 依赖
 Pillow>=9.0.0
 matplotlib>=3.5.0  # 可选：用于渲染 LaTeX 公式
 # wand>=0.6.0  # 可选：用于 WMF/EMF 转换（需要系统安装 ImageMagick）
 # html2image>=2.0.0  # 可选：用于表格渲染为图片
 # test_llm.py 依赖 - RAG 方案
 openai>=1.0.0
 langchain>=0.1.0
 langchain-openai>=0.1.0
 langchain-community>=0.0.20
 faiss-cpu>=1.7.4
 tiktoken>=0.5.0
 sentence-transformers>=2.2.0
 torch>=2.0.0
--- a/zeroerr_crawler/base_crawler.py
+++ b/zeroerr_crawler/base_crawler.py
@@ -6,6 +6,7 @@
 import os
 import time
 import copy
 import re
 import requests
 from bs4 import BeautifulSoup
 import markdownify
@@ -37,8 +38,12 @@ class BaseCrawler(ABC):
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
-        # 输出目录
+        # 输出目录（支持自定义）
-        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(self.name))
+        if "output_dir" in task_config:
            output_dir_name = task_config["output_dir"]
        else:
            output_dir_name = self.name
        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(output_dir_name))
        self.images_dir = os.path.join(self.output_dir, "images")
        ensure_dir(self.output_dir)
        ensure_dir(self.images_dir)
@@ -123,14 +128,41 @@ class BaseCrawler(ABC):
        selector = self.config.get("title_selector", "h1")
        index = self.config.get("title_index", 0)
        # 优先从配置的选择器提取
        tags = soup.find_all(selector)
        if tags and len(tags) > index:
-            return tags[index].get_text(strip=True)
+            title = tags[index].get_text(strip=True)
            if title:
                return title
        elif tags:
-            return tags[0].get_text(strip=True)
+            title = tags[0].get_text(strip=True)
-        else:
+            if title:
-            # 使用URL最后一部分作为标题
+                return title
-            return url.split('/')[-1].replace('.html', '')
+        
        # 尝试从页面 title 标签提取
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            # 移除网站名称后缀（如 " - 零差云控"）
            if ' - ' in title:
                title = title.split(' - ')[0].strip()
            if title and title.lower() not in ['about-us', 'contact-us', 'join-us']:
                return title
        # 尝试从 h1 标签提取（即使不在配置的选择器中）
        h1_tags = soup.find_all('h1')
        for h1 in h1_tags:
            title = h1.get_text(strip=True)
            # 跳过网站名称
            if title and '零差云控' not in title and '零误差' not in title:
                return title
        # 最后使用URL最后一部分作为标题，但进行美化
        url_part = url.split('/')[-1].replace('.html', '')
        # 将连字符替换为空格，并首字母大写
        if '-' in url_part:
            url_part = ' '.join(word.capitalize() for word in url_part.split('-'))
        return url_part
    def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
        """
@@ -155,6 +187,7 @@ class BaseCrawler(ABC):
                # class 选择器
                tag, class_name = sel.split('.', 1)
                tag = tag if tag else 'div'
                # 使用 find 只匹配第一个元素，避免重复
                content = soup.find(tag, class_=class_name)
            else:
                content = soup.find(sel)
@@ -170,9 +203,47 @@ class BaseCrawler(ABC):
        if len(all_contents) == 1:
            return all_contents[0]
        # 去重：移除嵌套或重复的内容块
        unique_contents = []
        seen_texts = set()  # 用于记录已见过的文本内容
        for content in all_contents:
            is_duplicate = False
            content_text = content.get_text(strip=True)
            # 跳过空内容
            if not content_text:
                continue
            # 检查是否被其他内容块包含（是其他块的子元素）
            for other in all_contents:
                if content is other:
                    continue
                # 检查当前内容是否是另一个内容块的子元素
                parent = content.find_parent()
                while parent:
                    if parent is other:
                        is_duplicate = True
                        break
                    parent = parent.find_parent()
                if is_duplicate:
                    break
            # 如果内容文本完全相同，只保留第一个
            if not is_duplicate and content_text in seen_texts:
                is_duplicate = True
            if not is_duplicate:
                unique_contents.append(content)
                seen_texts.add(content_text)
        # 如果去重后只剩一个，直接返回
        if len(unique_contents) == 1:
            return unique_contents[0]
        # 合并多个内容区域到一个容器
        combined = soup.new_tag('div')
-        for content in all_contents:
+        for content in unique_contents:
            # 深拷贝内容以避免从原DOM中移除
            combined.append(copy.deepcopy(content))
@@ -192,6 +263,35 @@ class BaseCrawler(ABC):
        for tag in content(['script', 'style']):
            tag.decompose()
        # 移除导航链接、空链接、锚点链接
        for a in content.find_all('a', href=True):
            href = a.get('href', '')
            # 移除空链接、锚点链接、JavaScript 链接
            if not href or href.startswith('#') or href.startswith('javascript:'):
                # 保留链接文本，移除链接标签
                a.unwrap()
            # 移除导航菜单中的链接（通常包含特定 class 或 id）
            elif a.find_parent(['nav', 'menu', 'navigation']):
                a.decompose()
        # 移除空的 div、span 等标签（只包含空白字符）
        for tag in content.find_all(['div', 'span', 'p']):
            text = tag.get_text(strip=True)
            if not text and not tag.find_all(['img', 'table']):
                # 如果没有文本内容且没有图片/表格，移除
                tag.decompose()
        # 移除注释
        from bs4 import Comment
        for comment in content.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        # 清理多余的空白字符
        for tag in content.find_all(['p', 'div', 'span']):
            if tag.string:
                # 清理段落内的多余空白
                tag.string = ' '.join(tag.string.split())
        return content
    def process_images(self, content: BeautifulSoup, page_url: str) -> list[tuple[str, str]]:
@@ -249,7 +349,8 @@ class BaseCrawler(ABC):
            content: 内容区域
            output_dir: 输出目录（用于解析图片路径）
        """
-        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table']):
+        # 按文档顺序处理元素，保持列表的连续性
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'ul', 'ol', 'table']):
            if element.name == 'img':
                src = element.get('src', '')
                # 尝试获取本地图片路径
@@ -260,30 +361,59 @@ class BaseCrawler(ABC):
                if os.path.exists(local_path):
                    try:
                        # 图片前添加空行
                        doc.add_paragraph()
                        doc.add_picture(local_path, width=Inches(5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                        # 图片后添加空行
                        doc.add_paragraph()
                    except Exception as e:
                        print(f"  Word插入图片失败: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text:
                    # HTML h1-h6 直接映射到 Word Heading 1-6
                    # 限制在 1-9 范围内（Word 支持的最大标题级别）
                    level = int(element.name[1])
-                    doc.add_heading(text, level=min(level + 1, 9))
+                    doc.add_heading(text, level=min(level, 9))
            elif element.name in ['ul', 'ol']:
                # 列表容器，跳过（列表项会单独处理）
                continue
            elif element.name == 'li':
                text = element.get_text(strip=True)
                if text:
-                    doc.add_paragraph(text, style='List Bullet')
+                    # 检查父元素是 ul 还是 ol
                    parent = element.find_parent(['ul', 'ol'])
                    is_ordered = parent and parent.name == 'ol'
                    # 使用列表样式
                    if is_ordered:
                        doc.add_paragraph(text, style='List Number')
                    else:
                        doc.add_paragraph(text, style='List Bullet')
            elif element.name == 'table':
-                # 简单处理表格，提取文本
+                # 处理表格，创建 Word 表格结构（便于 doc2md.py 解析）
-                for row in element.find_all('tr'):
+                self._add_table_to_docx(doc, element)
-                    cells = row.find_all(['td', 'th'])
+                
-                    row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
+            elif element.name == 'p':
                    if row_text.strip():
                        doc.add_paragraph(row_text)
            else:
                text = element.get_text(strip=True)
                if text:
-                    doc.add_paragraph(text)
+                    # 跳过空段落和只包含空白字符的段落
                    if text.strip():
                        # 检查是否是列表项（某些网站用 p 标签包裹列表项）
                        parent = element.find_parent(['ul', 'ol'])
                        if parent:
                            is_ordered = parent.name == 'ol'
                            if is_ordered:
                                doc.add_paragraph(text, style='List Number')
                            else:
                                doc.add_paragraph(text, style='List Bullet')
                        else:
                            doc.add_paragraph(text)
    def crawl_page(self, url: str) -> dict | None:
        """
@@ -357,6 +487,7 @@ class BaseCrawler(ABC):
    def save_combined_documents(self, all_pages: list[dict]):
        """
        将所有页面汇总保存为一个 md 和 docx 文件
        如果文件已存在，会追加内容并去重（基于URL）
        Args:
            all_pages: 所有页面数据列表
@@ -364,47 +495,94 @@ class BaseCrawler(ABC):
        if not all_pages:
            return
-        safe_name = safe_filename(self.name)
+        # 确定汇总文件名（使用输出目录名，而不是任务名）
-        
+        output_dir_name = os.path.basename(self.output_dir)
-        # === 生成汇总 Markdown ===
+        safe_name = safe_filename(output_dir_name)
        combined_md = f"# {self.name}全集\n\n"
        combined_md += f"**生成时间**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        combined_md += f"本文档汇总了零差云控官网的所有{self.name}内容，共 {len(all_pages)} 篇。\n\n"
        combined_md += "---\n\n"
        # 添加每篇内容
        for page in all_pages:
            combined_md += f"## {page['title']}\n\n"
            combined_md += f"**原文链接**: {page['url']}\n\n"
            combined_md += page["markdown"]
            combined_md += "\n\n---\n\n"
        md_path = os.path.join(self.output_dir, f"{safe_name}_汇总.md")
        docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx")
        # === 处理 Markdown ===
        existing_urls = set()
        existing_content = ""
        # 如果文件已存在，读取现有内容并提取已存在的URL
        if os.path.exists(md_path):
            with open(md_path, "r", encoding="utf-8") as f:
                existing_content = f.read()
                # 提取已存在的URL（用于去重）
                url_pattern = r'\*\*原文链接\*\*: (https?://[^\s\n]+)'
                existing_urls = set(re.findall(url_pattern, existing_content))
        # 过滤掉已存在的页面（基于URL去重）
        new_pages = [page for page in all_pages if page['url'] not in existing_urls]
        if not new_pages and existing_content:
            print(f"  所有页面已存在，无需更新: {md_path}")
            return
        # 生成新内容
        new_md_content = ""
        for page in new_pages:
            new_md_content += f"## {page['title']}\n\n"
            new_md_content += f"**原文链接**: {page['url']}\n\n"
            new_md_content += page["markdown"]
            new_md_content += "\n\n---\n\n"
        # 追加或创建文件
        if existing_content:
            # 追加模式：在现有内容后追加新内容
            combined_md = existing_content.rstrip() + "\n\n" + new_md_content
            print(f"  追加 {len(new_pages)} 篇新内容到现有文档")
        else:
            # 新建模式：创建新文档
            combined_md = f"# {output_dir_name}全集\n\n" + new_md_content
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(combined_md)
        print(f"  汇总 Markdown: {md_path}")
-        # === 生成汇总 Word 文档 ===
+        # === 处理 Word 文档 ===
-        doc = Document()
+        if os.path.exists(docx_path):
-        doc.add_heading(f'{self.name}全集', 0)
+            # 如果Word文档已存在，打开现有文档并在末尾追加新页面（基于URL去重）
-        
+            doc = Document(docx_path)
-        intro = doc.add_paragraph()
+            # 提取已存在的URL
-        intro.add_run(f"生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}").italic = True
+            existing_doc_urls = set()
-        doc.add_paragraph(f"本文档汇总了零差云控官网的所有{self.name}内容，共 {len(all_pages)} 篇。")
+            for para in doc.paragraphs:
-        doc.add_page_break()
+                if para.runs and "原文链接:" in para.text:
-        
+                    url_match = re.search(r'原文链接: (https?://[^\s\n]+)', para.text)
-        # 添加每篇内容
+                    if url_match:
-        for page in all_pages:
+                        existing_doc_urls.add(url_match.group(1))
            doc.add_heading(page["title"], level=1)
            p = doc.add_paragraph()
            p.add_run(f"原文链接: {page['url']}").italic = True
-            self.add_content_to_docx(doc, page["content"], self.output_dir)
+            # 过滤新页面
-            doc.add_page_break()
+            new_pages_for_doc = [page for page in all_pages if page['url'] not in existing_doc_urls]
-        
+            
-        docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx")
+            if new_pages_for_doc:
-        doc.save(docx_path)
+                # 添加新内容
-        print(f"  汇总 Word: {docx_path}")
+                for page in new_pages_for_doc:
                    doc.add_heading(page["title"], level=1)
                    p = doc.add_paragraph()
                    p.add_run(f"原文链接: {page['url']}").italic = True
                    self.add_content_to_docx(doc, page["content"], self.output_dir)
                    doc.add_page_break()
                doc.save(docx_path)
                print(f"  追加 {len(new_pages_for_doc)} 篇新内容到 Word 文档")
            else:
                print(f"  Word 文档无需更新: {docx_path}")
        else:
            # 新建Word文档
            doc = Document()
            doc.add_heading(f'{output_dir_name}全集', level=1)
            for page in all_pages:
                doc.add_heading(page["title"], level=1)
                p = doc.add_paragraph()
                p.add_run(f"原文链接: {page['url']}").italic = True
                self.add_content_to_docx(doc, page["content"], self.output_dir)
                doc.add_page_break()
            doc.save(docx_path)
            print(f"  汇总 Word: {docx_path}")
    def run(self):
        """
@@ -451,6 +629,67 @@ class BaseCrawler(ABC):
        print(f"输出目录: {self.output_dir}")
    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
        """
        Render an HTML table as a native Word table.

        Cells are placed on a grid that honours both ``colspan`` and
        ``rowspan``, and every spanned region is merged in the Word table.
        Rows and cells that belong to a table nested inside one of this
        table's cells are ignored here. If building the Word table raises,
        the partially built table is detached from the document and each row
        is written as a ``' | '``-joined paragraph instead.

        Args:
            doc: Document object the table is appended to.
            table_element: The ``<table>`` element to convert.
        """
        def read_span(cell, attr):
            # Scraped HTML may carry empty, non-numeric or zero span values;
            # anything unusable is treated as a span of 1 instead of raising.
            try:
                return max(int(cell.get(attr, 1)), 1)
            except (TypeError, ValueError):
                return 1

        def own_cells(row):
            # Drop cells whose nearest <tr> is a different row, i.e. cells of
            # a table nested inside one of this row's cells.
            return [
                cell for cell in row.find_all(['td', 'th'])
                if cell.find_parent('tr') is row
            ]

        # Only rows whose nearest <table> ancestor is this table.
        rows = [
            row for row in table_element.find_all('tr')
            if row.find_parent('table') is table_element
        ]
        if not rows:
            return

        # Lay every cell out on a (row, col) grid. `occupied` holds slots that
        # are already claimed, including slots in later rows claimed by a
        # rowspan, so following cells are shifted to the first free column.
        row_count = len(rows)
        col_count = 0
        occupied = set()
        placements = []  # (row, col, rowspan, colspan, text)
        for r, row in enumerate(rows):
            c = 0
            for cell in own_cells(row):
                while (r, c) in occupied:
                    c += 1
                # A rowspan must not reach past the last row of the table.
                rowspan = min(read_span(cell, 'rowspan'), row_count - r)
                colspan = read_span(cell, 'colspan')
                # Cut a colspan short before it would run into a slot claimed
                # by a rowspan from a previous row, so regions never overlap.
                for offset in range(1, colspan):
                    if (r, c + offset) in occupied:
                        colspan = offset
                        break
                occupied.update(
                    (r + dr, c + dc)
                    for dr in range(rowspan)
                    for dc in range(colspan)
                )
                placements.append(
                    (r, c, rowspan, colspan, cell.get_text(strip=True))
                )
                c += colspan
                col_count = max(col_count, c)

        if col_count == 0:
            return

        word_table = None
        try:
            word_table = doc.add_table(rows=row_count, cols=col_count)
            word_table.style = 'Table Grid'

            for r, c, rowspan, colspan, text in placements:
                target = word_table.cell(r, c)
                if rowspan > 1 or colspan > 1:
                    # merge() spans the rectangle between the two corner
                    # cells and returns the resulting merged cell.
                    target = target.merge(
                        word_table.cell(r + rowspan - 1, c + colspan - 1)
                    )
                target.text = text
        except Exception as e:
            # Best-effort fallback: keep the table content as plain text.
            print(f"  表格创建失败，降级为文本: {e}")
            if word_table is not None:
                # python-docx has no public call to delete a table; detach its
                # XML node so the text fallback does not duplicate the content.
                tbl = word_table._element
                tbl.getparent().remove(tbl)
            for row in rows:
                row_text = ' | '.join(
                    cell.get_text(strip=True) for cell in own_cells(row)
                )
                if row_text.strip():
                    doc.add_paragraph(row_text)
 class StandardCrawler(BaseCrawler):
    """
    标准爬虫类
--- a/zeroerr_crawler/config.py
+++ b/zeroerr_crawler/config.py
@@ -102,8 +102,6 @@ CRAWL_TASKS = {
        "static_pages": [
            "/about/about-us.html",
            "/about/contact-us.html",
            "/about/join-us.html",
            "/about/152.html",  # 诚招代理
        ],
        "content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact",  # 多区域布局
        "title_selector": "h1,h2",
@@ -129,5 +127,17 @@ CRAWL_TASKS = {
        "title_selector": "h1",
        "title_index": 0,
    },
    # 服务与支持详细页面（从索引页提取）
    "service_detail": {
        "name": "服务与支持-详细页面",
        "output_dir": "服务与支持",  # 输出到同一个目录
        "index_url": "/Service/index.html",
        "link_pattern": "/Service/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p,div.news_text,div.content,div.content-section",  # 多种布局支持
        "title_selector": "h1",
        "title_index": 1,
    },
 }
--- a/zeroerr_crawler/product_crawler.py
+++ b/zeroerr_crawler/product_crawler.py
@@ -77,16 +77,22 @@ class ProductCrawler(BaseCrawler):
                if os.path.exists(local_path):
                    try:
                        # 图片前添加空行
                        doc.add_paragraph()
                        doc.add_picture(local_path, width=Inches(4.5))
                        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                        # 图片后添加空行
                        doc.add_paragraph()
                    except Exception as e:
                        print(f"  Word插入图片失败: {local_path} - {e}")
            elif element.name.startswith('h'):
                text = element.get_text(strip=True)
                if text and '零差云控' not in text:
                    # HTML h1-h6 直接映射到 Word Heading 1-6
                    # 限制在 1-9 范围内（Word 支持的最大标题级别）
                    level = int(element.name[1])
-                    doc.add_heading(text, level=min(level + 1, 9))
+                    doc.add_heading(text, level=min(level, 9))
            elif element.name == 'table':
                # 处理表格