diff --git a/1_零差云控官网爬虫方案.md b/1_零差云控官网爬虫方案.md
index 2e17d72..ecf0e9a 100644
--- a/1_零差云控官网爬虫方案.md
+++ b/1_零差云控官网爬虫方案.md
@@ -27,14 +27,17 @@ crawl/
 | 常见问题 | `python main.py issue` | 32篇 | ✅ 完成 |
 | 企业新闻 | `python main.py news` | 11篇 | ✅ 完成 |
 | 认证与资质 | `python main.py certification` | 10篇 | ✅ 完成 |
-| 机器人关节 | `python main.py erob` | 11篇 | ✅ 完成 |
+| 机器人关节 | `python main.py erob` | 12篇 | ✅ 完成 |
 | 编码器 | `python main.py ecoder` | 7篇 | ✅ 完成 |
 | 配件 | `python main.py tools` | 13篇 | ✅ 完成 |
 | 关于我们 | `python main.py about` | 2篇 | ✅ 完成 |
 | 服务与支持 | `python main.py support` | 1篇 | ✅ 完成 |
+| 服务与支持-详细页面 | `python main.py service_detail` | 7篇 | ✅ 完成 |
 | 资料下载 | `python main.py download` | 1篇 | ✅ 完成 |
 
-**总计: 102 篇文章**
+**总计: 110 篇文章**
+
+> 注:服务与支持和服务与支持-详细页面合并输出到同一个目录 `output/服务与支持/`
 
 ## 使用方法
 
diff --git a/requirements.txt b/requirements.txt
index 535f732..728d4cf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,19 @@ markdownify>=0.11.0
 python-docx>=0.8.11
 lxml>=4.9.0
 
+# doc2md.py 依赖
+Pillow>=9.0.0
+matplotlib>=3.5.0  # 可选:用于渲染 LaTeX 公式
+# wand>=0.6.0  # 可选:用于 WMF/EMF 转换(需要系统安装 ImageMagick)
+# html2image>=2.0.0  # 可选:用于表格渲染为图片
+
+# test_llm.py 依赖 - RAG 方案
+openai>=1.0.0
+langchain>=0.1.0
+langchain-openai>=0.1.0
+langchain-community>=0.0.20
+faiss-cpu>=1.7.4
+tiktoken>=0.5.0
+sentence-transformers>=2.2.0
+torch>=2.0.0
+
diff --git a/zeroerr_crawler/base_crawler.py b/zeroerr_crawler/base_crawler.py
index 983fc6b..b844c12 100644
--- a/zeroerr_crawler/base_crawler.py
+++ b/zeroerr_crawler/base_crawler.py
@@ -6,6 +6,7 @@
 import os
 import time
 import copy
+import re
 import requests
 from bs4 import BeautifulSoup
 import markdownify
@@ -37,8 +38,12 @@ class BaseCrawler(ABC):
         self.session = requests.Session()
         self.session.headers.update(HEADERS)
 
-        # 输出目录
-        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(self.name))
+        # 输出目录(支持自定义)
+        if "output_dir" in task_config:
+            output_dir_name = task_config["output_dir"]
+        else:
+            output_dir_name = self.name
+        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(output_dir_name))
         self.images_dir = os.path.join(self.output_dir, "images")
         ensure_dir(self.output_dir)
         ensure_dir(self.images_dir)
@@ -123,14 +128,41 @@ class BaseCrawler(ABC):
         selector = self.config.get("title_selector", "h1")
         index = self.config.get("title_index", 0)
 
+        # 优先从配置的选择器提取
         tags = soup.find_all(selector)
         if tags and len(tags) > index:
-            return tags[index].get_text(strip=True)
+            title = tags[index].get_text(strip=True)
+            if title:
+                return title
         elif tags:
-            return tags[0].get_text(strip=True)
-        else:
-            # 使用URL最后一部分作为标题
-            return url.split('/')[-1].replace('.html', '')
+            title = tags[0].get_text(strip=True)
+            if title:
+                return title
+
+        # 尝试从页面 title 标签提取
+        title_tag = soup.find('title')
+        if title_tag:
+            title = title_tag.get_text(strip=True)
+            # 移除网站名称后缀(如 " - 零差云控")
+            if ' - ' in title:
+                title = title.split(' - ')[0].strip()
+            if title and title.lower() not in ['about-us', 'contact-us', 'join-us']:
+                return title
+
+        # 尝试从 h1 标签提取(即使不在配置的选择器中)
+        h1_tags = soup.find_all('h1')
+        for h1 in h1_tags:
+            title = h1.get_text(strip=True)
+            # 跳过网站名称
+            if title and '零差云控' not in title and '零误差' not in title:
+                return title
+
+        # 最后使用URL最后一部分作为标题,但进行美化
+        url_part = url.split('/')[-1].replace('.html', '')
+        # 将连字符替换为空格,并首字母大写
+        if '-' in url_part:
+            url_part = ' '.join(word.capitalize() for word in url_part.split('-'))
+        return url_part
 
     def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
         """
@@ -155,6 +187,7 @@ class BaseCrawler(ABC):
                 # class 选择器
                 tag, class_name = sel.split('.', 1)
                 tag = tag if tag else 'div'
+                # 使用 find 只匹配第一个元素,避免重复
                 content = soup.find(tag, class_=class_name)
             else:
                 content = soup.find(sel)
@@ -170,9 +203,47 @@ class BaseCrawler(ABC):
         if len(all_contents) == 1:
             return all_contents[0]
 
+        # 去重:移除嵌套或重复的内容块
+        unique_contents = []
+        seen_texts = set()  # 用于记录已见过的文本内容
+
+        for content in all_contents:
+            is_duplicate = False
+            content_text = content.get_text(strip=True)
+
+            # 跳过空内容
+            if not content_text:
+                continue
+
+            # 检查是否被其他内容块包含(是其他块的子元素)
+            for other in all_contents:
+                if content is other:
+                    continue
+                # 检查当前内容是否是另一个内容块的子元素
+                parent = content.find_parent()
+                while parent:
+                    if parent is other:
+                        is_duplicate = True
+                        break
+                    parent = parent.find_parent()
+                if is_duplicate:
+                    break
+
+            # 如果内容文本完全相同,只保留第一个
+            if not is_duplicate and content_text in seen_texts:
+                is_duplicate = True
+
+            if not is_duplicate:
+                unique_contents.append(content)
+                seen_texts.add(content_text)
+
+        # 如果去重后只剩一个,直接返回
+        if len(unique_contents) == 1:
+            return unique_contents[0]
+
         # 合并多个内容区域到一个容器
         combined = soup.new_tag('div')
-        for content in all_contents:
+        for content in unique_contents:
             # 深拷贝内容以避免从原DOM中移除
             combined.append(copy.deepcopy(content))
 
@@ -192,6 +263,35 @@ class BaseCrawler(ABC):
         for tag in content(['script', 'style']):
             tag.decompose()
 
+        # 移除导航链接、空链接、锚点链接
+        for a in content.find_all('a', href=True):
+            href = a.get('href', '')
+            # 移除空链接、锚点链接、JavaScript 链接
+            if not href or href.startswith('#') or href.startswith('javascript:'):
+                # 保留链接文本,移除链接标签
+                a.unwrap()
+            # 移除导航菜单中的链接(通常包含特定 class 或 id)
+            elif a.find_parent(['nav', 'menu', 'navigation']):
+                a.decompose()
+
+        # 移除空的 div、span 等标签(只包含空白字符)
+        for tag in content.find_all(['div', 'span', 'p']):
+            text = tag.get_text(strip=True)
+            if not text and not tag.find_all(['img', 'table']):
+                # 如果没有文本内容且没有图片/表格,移除
+                tag.decompose()
+
+        # 移除注释
+        from bs4 import Comment
+        for comment in content.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # 清理多余的空白字符
+        for tag in content.find_all(['p', 'div', 'span']):
+            if tag.string:
+                # 清理段落内的多余空白
+                tag.string = ' '.join(tag.string.split())
+
         return content
 
     def process_images(self, content: BeautifulSoup, page_url: str) -> list[tuple[str, str]]:
@@ -249,7 +349,8 @@ class BaseCrawler(ABC):
             content: 内容区域
             output_dir: 输出目录(用于解析图片路径)
         """
-        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table']):
+        # 按文档顺序处理元素,保持列表的连续性
+        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'ul', 'ol', 'table']):
             if element.name == 'img':
                 src = element.get('src', '')
                 # 尝试获取本地图片路径
@@ -260,30 +361,59 @@ class BaseCrawler(ABC):
 
                 if os.path.exists(local_path):
                     try:
+                        # 图片前添加空行
+                        doc.add_paragraph()
                         doc.add_picture(local_path, width=Inches(5))
                         doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
+                        # 图片后添加空行
+                        doc.add_paragraph()
                     except Exception as e:
                         print(f"  Word插入图片失败: {local_path} - {e}")
+
             elif element.name.startswith('h'):
                 text = element.get_text(strip=True)
                 if text:
+                    # HTML h1-h6 直接映射到 Word Heading 1-6
+                    # 限制在 1-9 范围内(Word 支持的最大标题级别)
                     level = int(element.name[1])
-                    doc.add_heading(text, level=min(level + 1, 9))
+                    doc.add_heading(text, level=min(level, 9))
+
+            elif element.name in ['ul', 'ol']:
+                # 列表容器,跳过(列表项会单独处理)
+                continue
+
             elif element.name == 'li':
                 text = element.get_text(strip=True)
                 if text:
-                    doc.add_paragraph(text, style='List Bullet')
+                    # 检查父元素是 ul 还是 ol
+                    parent = element.find_parent(['ul', 'ol'])
+                    is_ordered = parent and parent.name == 'ol'
+
+                    # 使用列表样式
+                    if is_ordered:
+                        doc.add_paragraph(text, style='List Number')
+                    else:
+                        doc.add_paragraph(text, style='List Bullet')
+
             elif element.name == 'table':
-                # 简单处理表格,提取文本
-                for row in element.find_all('tr'):
-                    cells = row.find_all(['td', 'th'])
-                    row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
-                    if row_text.strip():
-                        doc.add_paragraph(row_text)
-            else:
+                # 处理表格,创建 Word 表格结构(便于 doc2md.py 解析)
+                self._add_table_to_docx(doc, element)
+
+            elif element.name == 'p':
                 text = element.get_text(strip=True)
                 if text:
-                    doc.add_paragraph(text)
+                    # 跳过空段落和只包含空白字符的段落
+                    if text.strip():
+                        # 检查是否是列表项(某些网站用 p 标签包裹列表项)
+                        parent = element.find_parent(['ul', 'ol'])
+                        if parent:
+                            is_ordered = parent.name == 'ol'
+                            if is_ordered:
+                                doc.add_paragraph(text, style='List Number')
+                            else:
+                                doc.add_paragraph(text, style='List Bullet')
+                        else:
+                            doc.add_paragraph(text)
 
     def crawl_page(self, url: str) -> dict | None:
         """
@@ -357,6 +487,7 @@ class BaseCrawler(ABC):
     def save_combined_documents(self, all_pages: list[dict]):
         """
         将所有页面汇总保存为一个 md 和 docx 文件
+        如果文件已存在,会追加内容并去重(基于URL)
 
         Args:
             all_pages: 所有页面数据列表
@@ -364,47 +495,94 @@ class BaseCrawler(ABC):
         if not all_pages:
             return
 
-        safe_name = safe_filename(self.name)
-
-        # === 生成汇总 Markdown ===
-        combined_md = f"# {self.name}全集\n\n"
-        combined_md += f"**生成时间**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-        combined_md += f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。\n\n"
-        combined_md += "---\n\n"
-
-        # 添加每篇内容
-        for page in all_pages:
-            combined_md += f"## {page['title']}\n\n"
-            combined_md += f"**原文链接**: {page['url']}\n\n"
-            combined_md += page["markdown"]
-            combined_md += "\n\n---\n\n"
+        # 确定汇总文件名(使用输出目录名,而不是任务名)
+        output_dir_name = os.path.basename(self.output_dir)
+        safe_name = safe_filename(output_dir_name)
 
         md_path = os.path.join(self.output_dir, f"{safe_name}_汇总.md")
+        docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx")
+
+        # === 处理 Markdown ===
+        existing_urls = set()
+        existing_content = ""
+
+        # 如果文件已存在,读取现有内容并提取已存在的URL
+        if os.path.exists(md_path):
+            with open(md_path, "r", encoding="utf-8") as f:
+                existing_content = f.read()
+            # 提取已存在的URL(用于去重)
+            url_pattern = r'\*\*原文链接\*\*: (https?://[^\s\n]+)'
+            existing_urls = set(re.findall(url_pattern, existing_content))
+
+        # 过滤掉已存在的页面(基于URL去重)
+        new_pages = [page for page in all_pages if page['url'] not in existing_urls]
+
+        if not new_pages and existing_content:
+            print(f"  所有页面已存在,无需更新: {md_path}")
+            return
+
+        # 生成新内容
+        new_md_content = ""
+        for page in new_pages:
+            new_md_content += f"## {page['title']}\n\n"
+            new_md_content += f"**原文链接**: {page['url']}\n\n"
+            new_md_content += page["markdown"]
+            new_md_content += "\n\n---\n\n"
+
+        # 追加或创建文件
+        if existing_content:
+            # 追加模式:在现有内容后追加新内容
+            combined_md = existing_content.rstrip() + "\n\n" + new_md_content
+            print(f"  追加 {len(new_pages)} 篇新内容到现有文档")
+        else:
+            # 新建模式:创建新文档
+            combined_md = f"# {output_dir_name}全集\n\n" + new_md_content
+
         with open(md_path, "w", encoding="utf-8") as f:
             f.write(combined_md)
         print(f"  汇总 Markdown: {md_path}")
 
-        # === 生成汇总 Word 文档 ===
-        doc = Document()
-        doc.add_heading(f'{self.name}全集', 0)
-
-        intro = doc.add_paragraph()
-        intro.add_run(f"生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}").italic = True
-        doc.add_paragraph(f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。")
-        doc.add_page_break()
-
-        # 添加每篇内容
-        for page in all_pages:
-            doc.add_heading(page["title"], level=1)
-            p = doc.add_paragraph()
-            p.add_run(f"原文链接: {page['url']}").italic = True
+        # === 处理 Word 文档 ===
+        if os.path.exists(docx_path):
+            # 如果Word文档已存在,加载现有文档并追加新内容(基于URL去重)
+            doc = Document(docx_path)
+            # 提取已存在的URL
+            existing_doc_urls = set()
+            for para in doc.paragraphs:
+                if para.runs and "原文链接:" in para.text:
+                    url_match = re.search(r'原文链接: (https?://[^\s\n]+)', para.text)
+                    if url_match:
+                        existing_doc_urls.add(url_match.group(1))
 
-            self.add_content_to_docx(doc, page["content"], self.output_dir)
-            doc.add_page_break()
-
-        docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx")
-        doc.save(docx_path)
-        print(f"  汇总 Word: {docx_path}")
+            # 过滤新页面
+            new_pages_for_doc = [page for page in all_pages if page['url'] not in existing_doc_urls]
+
+            if new_pages_for_doc:
+                # 添加新内容
+                for page in new_pages_for_doc:
+                    doc.add_heading(page["title"], level=1)
+                    p = doc.add_paragraph()
+                    p.add_run(f"原文链接: {page['url']}").italic = True
+                    self.add_content_to_docx(doc, page["content"], self.output_dir)
+                    doc.add_page_break()
+                doc.save(docx_path)
+                print(f"  追加 {len(new_pages_for_doc)} 篇新内容到 Word 文档")
+            else:
+                print(f"  Word 文档无需更新: {docx_path}")
+        else:
+            # 新建Word文档
+            doc = Document()
+            doc.add_heading(f'{output_dir_name}全集', level=1)
+
+            for page in all_pages:
+                doc.add_heading(page["title"], level=1)
+                p = doc.add_paragraph()
+                p.add_run(f"原文链接: {page['url']}").italic = True
+                self.add_content_to_docx(doc, page["content"], self.output_dir)
+                doc.add_page_break()
+
+            doc.save(docx_path)
+            print(f"  汇总 Word: {docx_path}")
 
     def run(self):
         """
@@ -451,6 +629,67 @@ class BaseCrawler(ABC):
 
         print(f"输出目录: {self.output_dir}")
 
+    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
+        """
+        将 HTML 表格添加到 Word 文档
+
+        Args:
+            doc: Document 对象
+            table_element: 表格元素
+        """
+        rows = table_element.find_all('tr')
+        if not rows:
+            return
+
+        # 获取最大列数(考虑 colspan)
+        max_cols = 0
+        for row in rows:
+            cells = row.find_all(['td', 'th'])
+            col_count = 0
+            for cell in cells:
+                colspan = int(cell.get('colspan', 1))
+                col_count += colspan
+            max_cols = max(max_cols, col_count)
+
+        if max_cols == 0:
+            return
+
+        # 创建 Word 表格
+        try:
+            word_table = doc.add_table(rows=len(rows), cols=max_cols)
+            word_table.style = 'Table Grid'
+
+            for i, row in enumerate(rows):
+                cells = row.find_all(['td', 'th'])
+                col_idx = 0
+                for cell in cells:
+                    if col_idx >= max_cols:
+                        break
+                    text = cell.get_text(strip=True)
+                    colspan = int(cell.get('colspan', 1))
+                    rowspan = int(cell.get('rowspan', 1))
+
+                    # 设置单元格文本
+                    word_table.rows[i].cells[col_idx].text = text
+
+                    # 处理合并单元格(python-docx 的合并需要特殊处理)
+                    # 注意:python-docx 的合并单元格功能有限,这里先简单处理
+                    if colspan > 1 or rowspan > 1:
+                        # 对于合并单元格,python-docx 需要手动合并
+                        # 这里先标记,后续可以改进
+                        pass
+
+                    col_idx += colspan
+        except Exception as e:
+            # 如果表格创建失败,降级为文本
+            print(f"  表格创建失败,降级为文本: {e}")
+            for row in rows:
+                cells = row.find_all(['td', 'th'])
+                row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
+                if row_text.strip():
+                    doc.add_paragraph(row_text)
+
+
 class StandardCrawler(BaseCrawler):
     """
     标准爬虫类
diff --git a/zeroerr_crawler/config.py b/zeroerr_crawler/config.py
index ab3c47d..66c48dd 100644
--- a/zeroerr_crawler/config.py
+++ b/zeroerr_crawler/config.py
@@ -102,8 +102,6 @@ CRAWL_TASKS = {
         "static_pages": [
             "/about/about-us.html",
             "/about/contact-us.html",
-            "/about/join-us.html",
-            "/about/152.html",  # 诚招代理
         ],
         "content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact",  # 多区域布局
         "title_selector": "h1,h2",
@@ -129,5 +127,17 @@ CRAWL_TASKS = {
         "title_selector": "h1",
         "title_index": 0,
     },
+    # 服务与支持详细页面(从索引页提取)
+    "service_detail": {
+        "name": "服务与支持-详细页面",
+        "output_dir": "服务与支持",  # 输出到同一个目录
+        "index_url": "/Service/index.html",
+        "link_pattern": "/Service/",
+        "link_suffix": ".html",
+        "exclude_patterns": ["index.html"],
+        "content_selector": "div.news_text_p,div.news_text,div.content,div.content-section",  # 多种布局支持
+        "title_selector": "h1",
+        "title_index": 1,
+    },
 }
 
diff --git a/zeroerr_crawler/product_crawler.py b/zeroerr_crawler/product_crawler.py
index ee70e57..3518b65 100644
--- a/zeroerr_crawler/product_crawler.py
+++ b/zeroerr_crawler/product_crawler.py
@@ -77,16 +77,22 @@ class ProductCrawler(BaseCrawler):
 
                 if os.path.exists(local_path):
                     try:
+                        # 图片前添加空行
+                        doc.add_paragraph()
                         doc.add_picture(local_path, width=Inches(4.5))
                         doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
+                        # 图片后添加空行
+                        doc.add_paragraph()
                     except Exception as e:
                         print(f"  Word插入图片失败: {local_path} - {e}")
 
             elif element.name.startswith('h'):
                 text = element.get_text(strip=True)
                 if text and '零差云控' not in text:
+                    # HTML h1-h6 直接映射到 Word Heading 1-6
+                    # 限制在 1-9 范围内(Word 支持的最大标题级别)
                     level = int(element.name[1])
-                    doc.add_heading(text, level=min(level + 1, 9))
+                    doc.add_heading(text, level=min(level, 9))
 
             elif element.name == 'table':
                 # 处理表格
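
For reference, a minimal standalone sketch of the URL-based de-duplication that `save_combined_documents` now performs when a 汇总 Markdown file already exists: pages that were written earlier are recognized by their `**原文链接**:` lines, and only unseen URLs are appended. The helper name `append_pages`, the file path, and the sample data below are illustrative only and not part of the patch.

```python
import os
import re

# Same marker format the crawler writes before each page's body
URL_PATTERN = r'\*\*原文链接\*\*: (https?://[^\s\n]+)'


def append_pages(md_path: str, pages: list[dict]) -> int:
    """Append pages to a combined Markdown file, skipping URLs already present."""
    existing = ""
    if os.path.exists(md_path):
        with open(md_path, "r", encoding="utf-8") as f:
            existing = f.read()

    # URLs already recorded in the file act as the de-duplication key
    seen = set(re.findall(URL_PATTERN, existing))
    new_pages = [p for p in pages if p["url"] not in seen]
    if not new_pages:
        return 0

    chunks = []
    for p in new_pages:
        chunks.append(
            f"## {p['title']}\n\n**原文链接**: {p['url']}\n\n{p['markdown']}\n\n---\n\n"
        )

    # Append after existing content, or start a fresh file
    combined = (existing.rstrip() + "\n\n" if existing else "") + "".join(chunks)
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(combined)
    return len(new_pages)


# Illustrative usage (path and data are made up):
# n = append_pages(
#     "output/服务与支持/服务与支持_汇总.md",
#     [{"title": "售后服务", "url": "https://example.com/Service/1.html", "markdown": "..."}],
# )
```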