diff --git a/requirements.txt b/requirements.txt index 728d4cf..96fb1ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # 零差云控官网爬虫依赖 requests>=2.28.0 beautifulsoup4>=4.11.0 +markdown>=3.6 markdownify>=0.11.0 python-docx>=0.8.11 lxml>=4.9.0 diff --git a/zeroerr_crawler/base_crawler.py b/zeroerr_crawler/base_crawler.py index 76e5bd9..8152364 100644 --- a/zeroerr_crawler/base_crawler.py +++ b/zeroerr_crawler/base_crawler.py @@ -8,19 +8,20 @@ import time import copy import re import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString, Tag import markdownify from docx import Document from docx.shared import Inches, Pt from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.oxml import OxmlElement from docx.oxml.ns import qn +from docx.opc.constants import RELATIONSHIP_TYPE as RT from urllib.parse import urljoin from abc import ABC, abstractmethod from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR from .utils import ensure_dir, download_image, safe_filename, make_absolute_url from .extract_abstract import generate_abstract -from .post_process import post_process_docx_headings def _new_doc() -> Document: @@ -354,6 +355,88 @@ class BaseCrawler(ABC): img['src'] = full_url return images_info + + def process_links(self, content: BeautifulSoup, page_url: str): + """ + 处理内容中的链接,将相对链接转换为绝对链接 + + Args: + content: 内容区域 + page_url: 页面URL(用于解析相对路径) + """ + for a in content.find_all('a', href=True): + href = a.get('href', '').strip() + if not href: + continue + # 保留电话/邮箱/锚点等特殊链接 + if href.startswith(('mailto:', 'tel:', '#', 'javascript:')): + continue + a['href'] = make_absolute_url(page_url, href) + + def _add_hyperlink(self, paragraph, text: str, url: str): + """向段落添加可点击的超链接。""" + if not text or not url: + return + part = paragraph.part + r_id = part.relate_to(url, RT.HYPERLINK, is_external=True) + hyperlink = OxmlElement('w:hyperlink') + hyperlink.set(qn('r:id'), r_id) + + run = OxmlElement('w:r') + r_pr = OxmlElement('w:rPr') + r_style = OxmlElement('w:rStyle') + r_style.set(qn('w:val'), 'Hyperlink') + r_pr.append(r_style) + run.append(r_pr) + + text_element = OxmlElement('w:t') + text_element.text = text + run.append(text_element) + hyperlink.append(run) + paragraph._p.append(hyperlink) + + def _append_inline_nodes_to_paragraph(self, paragraph, node): + """递归写入段落内联内容,保留 a 标签为超链接。""" + if isinstance(node, NavigableString): + text = re.sub(r'\s+', ' ', str(node)) + if text: + paragraph.add_run(text) + return + if not isinstance(node, Tag): + return + if node.name == 'br': + paragraph.add_run('\n') + return + if node.name == 'a': + link_text = node.get_text(' ', strip=True) + href = (node.get('href') or '').strip() + if link_text: + display_text = f"{link_text} ({href})" if href else link_text + if paragraph.text.strip(): + paragraph.add_run(" ") + if href: + self._add_hyperlink(paragraph, display_text, href) + else: + paragraph.add_run(display_text) + return + for child in node.children: + self._append_inline_nodes_to_paragraph(paragraph, child) + + def _add_paragraph_with_links(self, doc: Document, element: Tag, style: str | None = None, prefix: str = ""): + """添加段落并保留其中超链接。""" + if not element.get_text(strip=True): + return + if style: + paragraph = doc.add_paragraph(style=style) + else: + paragraph = doc.add_paragraph() + if prefix: + paragraph.add_run(prefix) + if element.find('a'): + for child in element.children: + self._append_inline_nodes_to_paragraph(paragraph, child) + else: + paragraph.add_run(element.get_text(strip=True)) def content_to_markdown(self, content: BeautifulSoup, page_title: str = None) -> str: """ @@ -397,10 +480,18 @@ class BaseCrawler(ABC): h2.decompose() break # 只移除第一个匹配的 - # 页面内容中的 h1 降级为 h2(与 Word 文档处理一致) - # 因为页面标题已经是二级标题(##),所以内容中的 h1 应该降级为二级标题 - for h1 in content_copy.find_all('h1'): - h1.name = 'h2' + # 正文标题统一映射:每页正文从 h3 起步,并压缩为连续层级(不跳级) + # 例如: + # - 若正文有 h2/h4:映射为 h3/h4(而不是 h3/h5) + # - 若正文有 h1/h3/h6:映射为 h3/h4/h5 + body_headings = content_copy.find_all(re.compile(r'^h[1-6]$')) + if body_headings: + unique_levels = sorted({int(h.name[1]) for h in body_headings}) + level_map = {level: min(i + 3, 6) for i, level in enumerate(unique_levels)} + for heading in body_headings: + original_level = int(heading.name[1]) + new_level = level_map[original_level] + heading.name = f'h{new_level}' return markdownify.markdownify(str(content_copy), heading_style="ATX") @@ -423,6 +514,11 @@ class BaseCrawler(ABC): if h1_text == page_title: first_h1.decompose() # 移除该标签 + # 计算正文标题映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级) + heading_elements = content.find_all(re.compile(r'^h[1-6]$')) + unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else [] + level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)} + # 按文档顺序处理元素,保持列表的连续性 for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'ul', 'ol', 'table']): if element.name == 'img': @@ -447,16 +543,11 @@ class BaseCrawler(ABC): elif element.name.startswith('h'): text = element.get_text(strip=True) if text: - # 对于页面内容中的标题,h1 转换为 Heading 2,h2-h6 保持原层级 - # 因为页面标题已经是 Heading 1,所以内容中的 h1 应该降级为 Heading 2 + # 正文标题统一映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级) original_level = int(element.name[1]) - if original_level == 1: - # 页面内容中的 h1 转换为 Heading 2 - word_level = 2 - print(f" 标题层级转换: h1 '{text}' → Heading 2") - else: - # h2-h6 保持原层级(h2→Heading 2, h3→Heading 3, ...) - word_level = original_level + word_level = level_map.get(original_level, 3) + if word_level != original_level: + print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}") doc.add_heading(text, level=min(word_level, 9)) elif element.name in ['ul', 'ol']: @@ -464,37 +555,29 @@ class BaseCrawler(ABC): continue elif element.name == 'li': - text = element.get_text(strip=True) - if text: - # 检查父元素是 ul 还是 ol - parent = element.find_parent(['ul', 'ol']) - is_ordered = parent and parent.name == 'ol' - - # 使用列表样式 - if is_ordered: - doc.add_paragraph(text, style='List Number') - else: - doc.add_paragraph(text, style='List Bullet') + # 检查父元素是 ul 还是 ol + parent = element.find_parent(['ul', 'ol']) + is_ordered = parent and parent.name == 'ol' + if is_ordered: + self._add_paragraph_with_links(doc, element, style='List Number') + else: + self._add_paragraph_with_links(doc, element, style='List Bullet') elif element.name == 'table': # 处理表格,创建 Word 表格结构(便于 doc2md.py 解析) self._add_table_to_docx(doc, element) elif element.name == 'p': - text = element.get_text(strip=True) - if text: - # 跳过空段落和只包含空白字符的段落 - if text.strip(): - # 检查是否是列表项(某些网站用 p 标签包裹列表项) - parent = element.find_parent(['ul', 'ol']) - if parent: - is_ordered = parent.name == 'ol' - if is_ordered: - doc.add_paragraph(text, style='List Number') - else: - doc.add_paragraph(text, style='List Bullet') - else: - doc.add_paragraph(text) + # 检查是否是列表项(某些网站用 p 标签包裹列表项) + parent = element.find_parent(['ul', 'ol']) + if parent: + is_ordered = parent.name == 'ol' + if is_ordered: + self._add_paragraph_with_links(doc, element, style='List Number') + else: + self._add_paragraph_with_links(doc, element, style='List Bullet') + else: + self._add_paragraph_with_links(doc, element) def crawl_page(self, url: str) -> dict | None: """ @@ -524,6 +607,8 @@ class BaseCrawler(ABC): # 处理图片 images = self.process_images(content, url) + # 处理链接(相对链接转绝对链接) + self.process_links(content, url) # 转换为 Markdown(传入标题,用于去除重复的h1标签) markdown = self.content_to_markdown(content, title) @@ -536,35 +621,6 @@ class BaseCrawler(ABC): "images": images, } - def save_single_page(self, page_data: dict): - """ - 保存单个页面为独立的 md 和 docx 文件 - - Args: - page_data: 页面数据字典 - """ - title = page_data["title"] - safe_title = safe_filename(title) - - # 保存 Markdown - md_path = os.path.join(self.output_dir, f"{safe_title}.md") - md_content = f"# {title}\n\n" - md_content += f"**原文链接**: {page_data['url']}\n\n" - md_content += page_data["markdown"] - - with open(md_path, "w", encoding="utf-8") as f: - f.write(md_content) - - # 保存 Word - docx_path = os.path.join(self.output_dir, f"{safe_title}.docx") - doc = _new_doc() - doc.add_heading(title, 0) - p = doc.add_paragraph() - p.add_run(f"原文链接: {page_data['url']}").italic = True - - self.add_content_to_docx(doc, page_data["content"], self.output_dir, title) - doc.save(docx_path) - def save_combined_documents(self, all_pages: list[dict]): """ 将所有页面汇总保存为一个 md 和 docx 文件 @@ -718,15 +774,14 @@ class BaseCrawler(ABC): if new_pages_for_doc: # 添加新内容 for page in new_pages_for_doc: - doc.add_heading(page["title"], level=1) + # 与汇总 Markdown 保持一致:每页标题使用二级标题(##) + doc.add_heading(page["title"], level=2) p = doc.add_paragraph() p.add_run(f"原文链接: {page['url']}").italic = True self.add_content_to_docx(doc, page["content"], self.output_dir, page["title"]) doc.add_page_break() doc.save(docx_path) print(f" 追加 {len(new_pages_for_doc)} 篇新内容到 Word 文档") - # 后处理:优化连续标题 - post_process_docx_headings(docx_path) else: print(f" Word 文档无需更新: {docx_path}") else: @@ -752,7 +807,8 @@ class BaseCrawler(ABC): doc.add_paragraph() # 空行 for page in all_pages: - doc.add_heading(page["title"], level=1) + # 与汇总 Markdown 保持一致:每页标题使用二级标题(##) + doc.add_heading(page["title"], level=2) p = doc.add_paragraph() p.add_run(f"原文链接: {page['url']}").italic = True self.add_content_to_docx(doc, page["content"], self.output_dir, page["title"]) @@ -760,8 +816,6 @@ class BaseCrawler(ABC): doc.save(docx_path) print(f" 汇总 Word: {docx_path}") - # 后处理:优化连续标题 - post_process_docx_headings(docx_path) def run(self): """ diff --git a/zeroerr_crawler/product_crawler.py b/zeroerr_crawler/product_crawler.py index 6a4b862..16bc0b8 100644 --- a/zeroerr_crawler/product_crawler.py +++ b/zeroerr_crawler/product_crawler.py @@ -136,6 +136,11 @@ class ProductCrawler(BaseCrawler): h2.decompose() break # 只移除第一个匹配的 + # 计算正文标题映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级) + heading_elements = content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) + unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else [] + level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)} + for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']): # 跳过嵌套元素 if element.find_parent(['table', 'li']): @@ -162,16 +167,11 @@ class ProductCrawler(BaseCrawler): elif element.name.startswith('h'): text = element.get_text(strip=True) if text and '零差云控' not in text: - # 对于页面内容中的标题,h1 转换为 Heading 2,h2-h6 保持原层级 - # 因为页面标题已经是 Heading 1,所以内容中的 h1 应该降级为 Heading 2 + # 正文标题统一映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级) original_level = int(element.name[1]) - if original_level == 1: - # 页面内容中的 h1 转换为 Heading 2 - word_level = 2 - print(f" 标题层级转换: h1 '{text}' → Heading 2") - else: - # h2-h6 保持原层级(h2→Heading 2, h3→Heading 3, ...) - word_level = original_level + word_level = level_map.get(original_level, 3) + if word_level != original_level: + print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}") doc.add_heading(text, level=min(word_level, 9)) elif element.name == 'table': @@ -179,21 +179,15 @@ class ProductCrawler(BaseCrawler): self._add_table_to_docx(doc, element) elif element.name == 'li': - text = element.get_text(strip=True) - if text: - doc.add_paragraph(f"• {text}") + self._add_paragraph_with_links(doc, element, prefix="• ") elif element.name == 'p': - text = element.get_text(strip=True) - if text: - doc.add_paragraph(text) + self._add_paragraph_with_links(doc, element) elif element.name == 'div': # 处理特殊的 div 内容块 if element.get('class') and any('param' in c for c in element.get('class', [])): - text = element.get_text(strip=True) - if text: - doc.add_paragraph(text) + self._add_paragraph_with_links(doc, element) def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup): """