优化文档导出层级与链接保真,统一正文标题映射并增强 Word 段落超链接处理。
同时移除不再使用的文档后处理依赖,减少汇总导出流程中的冗余步骤。 Made-with: Cursor
This commit is contained in:
@@ -8,19 +8,20 @@ import time
|
||||
import copy
|
||||
import re
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
import markdownify
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
from docx.opc.constants import RELATIONSHIP_TYPE as RT
|
||||
from urllib.parse import urljoin
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .config import BASE_URL, HEADERS, REQUEST_DELAY, OUTPUT_DIR
|
||||
from .utils import ensure_dir, download_image, safe_filename, make_absolute_url
|
||||
from .extract_abstract import generate_abstract
|
||||
from .post_process import post_process_docx_headings
|
||||
|
||||
|
||||
def _new_doc() -> Document:
|
||||
@@ -354,6 +355,88 @@ class BaseCrawler(ABC):
|
||||
img['src'] = full_url
|
||||
|
||||
return images_info
|
||||
|
||||
def process_links(self, content: BeautifulSoup, page_url: str):
    """Rewrite relative hrefs in *content* to absolute URLs, in place.

    Args:
        content: parsed content region whose <a> tags are mutated.
        page_url: URL of the page, used as the base for resolution.
    """
    # Link kinds that must be left untouched (non-HTTP or in-page).
    skip_prefixes = ('mailto:', 'tel:', '#', 'javascript:')
    for anchor in content.find_all('a', href=True):
        target = anchor.get('href', '').strip()
        if not target or target.startswith(skip_prefixes):
            continue
        anchor['href'] = make_absolute_url(page_url, target)
|
||||
|
||||
def _add_hyperlink(self, paragraph, text: str, url: str):
    """Append a clickable external hyperlink run to *paragraph*.

    Builds the ``w:hyperlink`` element manually because python-docx has
    no public paragraph-level hyperlink API.

    Args:
        paragraph: python-docx Paragraph to append to.
        text: visible link text; the call is a no-op when empty.
        url: external link target; the call is a no-op when empty.
    """
    if not text or not url:
        return
    # Register the external target on the document part; the XML only
    # stores the relationship id, not the URL itself.
    part = paragraph.part
    r_id = part.relate_to(url, RT.HYPERLINK, is_external=True)
    hyperlink = OxmlElement('w:hyperlink')
    hyperlink.set(qn('r:id'), r_id)

    run = OxmlElement('w:r')
    r_pr = OxmlElement('w:rPr')
    r_style = OxmlElement('w:rStyle')
    r_style.set(qn('w:val'), 'Hyperlink')  # built-in character style
    r_pr.append(r_style)
    run.append(r_pr)

    text_element = OxmlElement('w:t')
    # Fix: without xml:space="preserve", Word collapses significant
    # leading/trailing whitespace in the link text on load.
    text_element.set(qn('xml:space'), 'preserve')
    text_element.text = text
    run.append(text_element)
    hyperlink.append(run)
    # NOTE: relies on the private _p element (no public alternative).
    paragraph._p.append(hyperlink)
|
||||
|
||||
def _append_inline_nodes_to_paragraph(self, paragraph, node):
    """Recursively write inline HTML content into a Word paragraph,
    rendering <a> tags as clickable hyperlinks.

    Args:
        paragraph: python-docx Paragraph receiving the runs.
        node: a BeautifulSoup NavigableString or Tag to serialize.
    """
    # Text node: collapse runs of whitespace and emit as a plain run.
    if isinstance(node, NavigableString):
        collapsed = re.sub(r'\s+', ' ', str(node))
        if collapsed:
            paragraph.add_run(collapsed)
        return
    if not isinstance(node, Tag):
        return  # comments, doctypes, etc.
    if node.name == 'br':
        paragraph.add_run('\n')
        return
    if node.name == 'a':
        link_text = node.get_text(' ', strip=True)
        href = (node.get('href') or '').strip()
        if link_text:
            # Show the URL after the text so it survives plain printing.
            display = f"{link_text} ({href})" if href else link_text
            if paragraph.text.strip():
                paragraph.add_run(" ")  # separator from preceding text
            if href:
                self._add_hyperlink(paragraph, display, href)
            else:
                paragraph.add_run(display)
        return
    # Any other tag: descend into its children.
    for child in node.children:
        self._append_inline_nodes_to_paragraph(paragraph, child)
|
||||
|
||||
def _add_paragraph_with_links(self, doc: Document, element: Tag, style: str | None = None, prefix: str = ""):
    """Add *element*'s text as one paragraph, keeping hyperlinks clickable.

    Args:
        doc: target python-docx Document.
        element: source HTML element whose text is emitted.
        style: optional paragraph style name (e.g. 'List Bullet').
        prefix: literal text prepended before the content.
    """
    if not element.get_text(strip=True):
        return  # nothing visible to emit
    paragraph = doc.add_paragraph(style=style) if style else doc.add_paragraph()
    if prefix:
        paragraph.add_run(prefix)
    if element.find('a') is not None:
        # Walk the children so <a> tags become real hyperlinks.
        for child in element.children:
            self._append_inline_nodes_to_paragraph(paragraph, child)
    else:
        paragraph.add_run(element.get_text(strip=True))
|
||||
|
||||
def content_to_markdown(self, content: BeautifulSoup, page_title: str = None) -> str:
|
||||
"""
|
||||
@@ -397,10 +480,18 @@ class BaseCrawler(ABC):
|
||||
h2.decompose()
|
||||
break # 只移除第一个匹配的
|
||||
|
||||
# 页面内容中的 h1 降级为 h2(与 Word 文档处理一致)
|
||||
# 因为页面标题已经是二级标题(##),所以内容中的 h1 应该降级为二级标题
|
||||
for h1 in content_copy.find_all('h1'):
|
||||
h1.name = 'h2'
|
||||
# 正文标题统一映射:每页正文从 h3 起步,并压缩为连续层级(不跳级)
|
||||
# 例如:
|
||||
# - 若正文有 h2/h4:映射为 h3/h4(而不是 h3/h5)
|
||||
# - 若正文有 h1/h3/h6:映射为 h3/h4/h5
|
||||
body_headings = content_copy.find_all(re.compile(r'^h[1-6]$'))
|
||||
if body_headings:
|
||||
unique_levels = sorted({int(h.name[1]) for h in body_headings})
|
||||
level_map = {level: min(i + 3, 6) for i, level in enumerate(unique_levels)}
|
||||
for heading in body_headings:
|
||||
original_level = int(heading.name[1])
|
||||
new_level = level_map[original_level]
|
||||
heading.name = f'h{new_level}'
|
||||
|
||||
return markdownify.markdownify(str(content_copy), heading_style="ATX")
|
||||
|
||||
@@ -423,6 +514,11 @@ class BaseCrawler(ABC):
|
||||
if h1_text == page_title:
|
||||
first_h1.decompose() # 移除该标签
|
||||
|
||||
# 计算正文标题映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
heading_elements = content.find_all(re.compile(r'^h[1-6]$'))
|
||||
unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else []
|
||||
level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)}
|
||||
|
||||
# 按文档顺序处理元素,保持列表的连续性
|
||||
for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'ul', 'ol', 'table']):
|
||||
if element.name == 'img':
|
||||
@@ -447,16 +543,11 @@ class BaseCrawler(ABC):
|
||||
elif element.name.startswith('h'):
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
# 对于页面内容中的标题,h1 转换为 Heading 2,h2-h6 保持原层级
|
||||
# 因为页面标题已经是 Heading 1,所以内容中的 h1 应该降级为 Heading 2
|
||||
# 正文标题统一映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
original_level = int(element.name[1])
|
||||
if original_level == 1:
|
||||
# 页面内容中的 h1 转换为 Heading 2
|
||||
word_level = 2
|
||||
print(f" 标题层级转换: h1 '{text}' → Heading 2")
|
||||
else:
|
||||
# h2-h6 保持原层级(h2→Heading 2, h3→Heading 3, ...)
|
||||
word_level = original_level
|
||||
word_level = level_map.get(original_level, 3)
|
||||
if word_level != original_level:
|
||||
print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}")
|
||||
doc.add_heading(text, level=min(word_level, 9))
|
||||
|
||||
elif element.name in ['ul', 'ol']:
|
||||
@@ -464,37 +555,29 @@ class BaseCrawler(ABC):
|
||||
continue
|
||||
|
||||
elif element.name == 'li':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
# 检查父元素是 ul 还是 ol
|
||||
parent = element.find_parent(['ul', 'ol'])
|
||||
is_ordered = parent and parent.name == 'ol'
|
||||
|
||||
# 使用列表样式
|
||||
if is_ordered:
|
||||
doc.add_paragraph(text, style='List Number')
|
||||
else:
|
||||
doc.add_paragraph(text, style='List Bullet')
|
||||
# 检查父元素是 ul 还是 ol
|
||||
parent = element.find_parent(['ul', 'ol'])
|
||||
is_ordered = parent and parent.name == 'ol'
|
||||
if is_ordered:
|
||||
self._add_paragraph_with_links(doc, element, style='List Number')
|
||||
else:
|
||||
self._add_paragraph_with_links(doc, element, style='List Bullet')
|
||||
|
||||
elif element.name == 'table':
|
||||
# 处理表格,创建 Word 表格结构(便于 doc2md.py 解析)
|
||||
self._add_table_to_docx(doc, element)
|
||||
|
||||
elif element.name == 'p':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
# 跳过空段落和只包含空白字符的段落
|
||||
if text.strip():
|
||||
# 检查是否是列表项(某些网站用 p 标签包裹列表项)
|
||||
parent = element.find_parent(['ul', 'ol'])
|
||||
if parent:
|
||||
is_ordered = parent.name == 'ol'
|
||||
if is_ordered:
|
||||
doc.add_paragraph(text, style='List Number')
|
||||
else:
|
||||
doc.add_paragraph(text, style='List Bullet')
|
||||
else:
|
||||
doc.add_paragraph(text)
|
||||
# 检查是否是列表项(某些网站用 p 标签包裹列表项)
|
||||
parent = element.find_parent(['ul', 'ol'])
|
||||
if parent:
|
||||
is_ordered = parent.name == 'ol'
|
||||
if is_ordered:
|
||||
self._add_paragraph_with_links(doc, element, style='List Number')
|
||||
else:
|
||||
self._add_paragraph_with_links(doc, element, style='List Bullet')
|
||||
else:
|
||||
self._add_paragraph_with_links(doc, element)
|
||||
|
||||
def crawl_page(self, url: str) -> dict | None:
|
||||
"""
|
||||
@@ -524,6 +607,8 @@ class BaseCrawler(ABC):
|
||||
|
||||
# 处理图片
|
||||
images = self.process_images(content, url)
|
||||
# 处理链接(相对链接转绝对链接)
|
||||
self.process_links(content, url)
|
||||
|
||||
# 转换为 Markdown(传入标题,用于去除重复的h1标签)
|
||||
markdown = self.content_to_markdown(content, title)
|
||||
@@ -536,35 +621,6 @@ class BaseCrawler(ABC):
|
||||
"images": images,
|
||||
}
|
||||
|
||||
def save_single_page(self, page_data: dict):
    """Persist one crawled page as standalone .md and .docx files.

    Args:
        page_data: page dict with "title", "url", "markdown" and
            "content" keys, as produced by crawl_page().
    """
    title = page_data["title"]
    safe_title = safe_filename(title)

    # Markdown export: title heading, source link, then the body.
    md_path = os.path.join(self.output_dir, f"{safe_title}.md")
    md_parts = [
        f"# {title}\n\n",
        f"**原文链接**: {page_data['url']}\n\n",
        page_data["markdown"],
    ]
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("".join(md_parts))

    # Word export: title, italic source link, then converted content.
    docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
    doc = _new_doc()
    doc.add_heading(title, 0)
    link_para = doc.add_paragraph()
    link_para.add_run(f"原文链接: {page_data['url']}").italic = True

    self.add_content_to_docx(doc, page_data["content"], self.output_dir, title)
    doc.save(docx_path)
|
||||
|
||||
def save_combined_documents(self, all_pages: list[dict]):
|
||||
"""
|
||||
将所有页面汇总保存为一个 md 和 docx 文件
|
||||
@@ -718,15 +774,14 @@ class BaseCrawler(ABC):
|
||||
if new_pages_for_doc:
|
||||
# 添加新内容
|
||||
for page in new_pages_for_doc:
|
||||
doc.add_heading(page["title"], level=1)
|
||||
# 与汇总 Markdown 保持一致:每页标题使用二级标题(##)
|
||||
doc.add_heading(page["title"], level=2)
|
||||
p = doc.add_paragraph()
|
||||
p.add_run(f"原文链接: {page['url']}").italic = True
|
||||
self.add_content_to_docx(doc, page["content"], self.output_dir, page["title"])
|
||||
doc.add_page_break()
|
||||
doc.save(docx_path)
|
||||
print(f" 追加 {len(new_pages_for_doc)} 篇新内容到 Word 文档")
|
||||
# 后处理:优化连续标题
|
||||
post_process_docx_headings(docx_path)
|
||||
else:
|
||||
print(f" Word 文档无需更新: {docx_path}")
|
||||
else:
|
||||
@@ -752,7 +807,8 @@ class BaseCrawler(ABC):
|
||||
doc.add_paragraph() # 空行
|
||||
|
||||
for page in all_pages:
|
||||
doc.add_heading(page["title"], level=1)
|
||||
# 与汇总 Markdown 保持一致:每页标题使用二级标题(##)
|
||||
doc.add_heading(page["title"], level=2)
|
||||
p = doc.add_paragraph()
|
||||
p.add_run(f"原文链接: {page['url']}").italic = True
|
||||
self.add_content_to_docx(doc, page["content"], self.output_dir, page["title"])
|
||||
@@ -760,8 +816,6 @@ class BaseCrawler(ABC):
|
||||
|
||||
doc.save(docx_path)
|
||||
print(f" 汇总 Word: {docx_path}")
|
||||
# 后处理:优化连续标题
|
||||
post_process_docx_headings(docx_path)
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
|
||||
@@ -136,6 +136,11 @@ class ProductCrawler(BaseCrawler):
|
||||
h2.decompose()
|
||||
break # 只移除第一个匹配的
|
||||
|
||||
# 计算正文标题映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
heading_elements = content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
|
||||
unique_levels = sorted({int(h.name[1]) for h in heading_elements}) if heading_elements else []
|
||||
level_map = {level: min(i + 3, 9) for i, level in enumerate(unique_levels)}
|
||||
|
||||
for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
|
||||
# 跳过嵌套元素
|
||||
if element.find_parent(['table', 'li']):
|
||||
@@ -162,16 +167,11 @@ class ProductCrawler(BaseCrawler):
|
||||
elif element.name.startswith('h'):
|
||||
text = element.get_text(strip=True)
|
||||
if text and '零差云控' not in text:
|
||||
# 对于页面内容中的标题,h1 转换为 Heading 2,h2-h6 保持原层级
|
||||
# 因为页面标题已经是 Heading 1,所以内容中的 h1 应该降级为 Heading 2
|
||||
# 正文标题统一映射:每页从 Heading 3 起步,并压缩为连续层级(不跳级)
|
||||
original_level = int(element.name[1])
|
||||
if original_level == 1:
|
||||
# 页面内容中的 h1 转换为 Heading 2
|
||||
word_level = 2
|
||||
print(f" 标题层级转换: h1 '{text}' → Heading 2")
|
||||
else:
|
||||
# h2-h6 保持原层级(h2→Heading 2, h3→Heading 3, ...)
|
||||
word_level = original_level
|
||||
word_level = level_map.get(original_level, 3)
|
||||
if word_level != original_level:
|
||||
print(f" 标题层级转换: h{original_level} '{text}' → Heading {word_level}")
|
||||
doc.add_heading(text, level=min(word_level, 9))
|
||||
|
||||
elif element.name == 'table':
|
||||
@@ -179,21 +179,15 @@ class ProductCrawler(BaseCrawler):
|
||||
self._add_table_to_docx(doc, element)
|
||||
|
||||
elif element.name == 'li':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(f"• {text}")
|
||||
self._add_paragraph_with_links(doc, element, prefix="• ")
|
||||
|
||||
elif element.name == 'p':
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(text)
|
||||
self._add_paragraph_with_links(doc, element)
|
||||
|
||||
elif element.name == 'div':
|
||||
# 处理特殊的 div 内容块
|
||||
if element.get('class') and any('param' in c for c in element.get('class', [])):
|
||||
text = element.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_paragraph(text)
|
||||
self._add_paragraph_with_links(doc, element)
|
||||
|
||||
def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user