优化摘要配置加载与文档输出兼容性,并补充本地配置忽略规则。

通过引入 config.yaml.example 和环境变量覆盖提升可配置性,同时统一 Word 默认中文字体并忽略本地 config.yaml,避免敏感信息误提交。

Made-with: Cursor
This commit is contained in:
Oo
2026-03-26 09:39:07 +08:00
parent dbe9ba3629
commit d257cbaed3
4 changed files with 86 additions and 11 deletions

View File

@@ -11,8 +11,9 @@ import requests
from bs4 import BeautifulSoup
import markdownify
from docx import Document
from docx.shared import Inches
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from urllib.parse import urljoin
from abc import ABC, abstractmethod
@@ -22,6 +23,16 @@ from .extract_abstract import generate_abstract
from .post_process import post_process_docx_headings
def _new_doc() -> Document:
    """Create an empty Document with CJK-friendly default fonts.

    The 'Normal' style is configured with Calibri at 10.5 pt, and its East
    Asian font is set to '宋体' (SimSun). The intent, per the original
    author's note, is that Chinese text displays correctly in both WPS and
    Word.

    Returns:
        The newly created document with the adjusted 'Normal' style.
    """
    document = Document()
    base_style = document.styles['Normal']

    base_font = base_style.font
    base_font.name = 'Calibri'
    base_font.size = Pt(10.5)

    # The East Asian font is written directly on the style's w:rFonts
    # element. This must come after the font.name assignment above, which
    # is expected to have created the rPr/rFonts elements being accessed.
    r_fonts = base_style.element.rPr.rFonts
    r_fonts.set(qn('w:eastAsia'), '宋体')
    return document
class BaseCrawler(ABC):
"""
基础爬虫类
@@ -546,7 +557,7 @@ class BaseCrawler(ABC):
# 保存 Word
docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
doc = Document()
doc = _new_doc()
doc.add_heading(title, 0)
p = doc.add_paragraph()
p.add_run(f"原文链接: {page_data['url']}").italic = True
@@ -720,7 +731,7 @@ class BaseCrawler(ABC):
print(f" Word 文档无需更新: {docx_path}")
else:
# 新建Word文档
doc = Document()
doc = _new_doc()
doc.add_heading(f'{output_dir_name}全集', level=1)
# 添加摘要(只在新建时生成,复用 Markdown 部分生成的摘要)