优化摘要配置加载与文档输出兼容性,并补充本地配置忽略规则。

通过引入 config.yaml.example 和环境变量覆盖提升可配置性,同时统一 Word 默认中文字体并忽略本地 config.yaml,避免敏感信息误提交。

Made-with: Cursor
This commit is contained in:
@@ -11,8 +11,9 @@ import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import markdownify
|
||||
from docx import Document
|
||||
from docx.shared import Inches
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.oxml.ns import qn
|
||||
from urllib.parse import urljoin
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
@@ -22,6 +23,16 @@ from .extract_abstract import generate_abstract
|
||||
from .post_process import post_process_docx_headings
|
||||
|
||||
|
||||
def _new_doc() -> Document:
    """Create a Document and configure a Chinese font on its ``Normal`` style.

    The ``Normal`` style is given Calibri at 10.5 pt, and its East Asian
    font attribute (``w:eastAsia``) is set to 宋体. The stated goal is that
    Chinese text displays correctly in both WPS and Word.

    Returns:
        The newly created document with the adjusted ``Normal`` style.
    """
    document = Document()
    base_style = document.styles['Normal']

    base_font = base_style.font
    base_font.name = 'Calibri'
    base_font.size = Pt(10.5)

    # The East Asian font is written on the style's rFonts element directly.
    # This must stay after the font.name assignment above.
    # NOTE(review): presumably rPr/rFonts only exist once font.name has been
    # set -- confirm against python-docx before reordering.
    east_asia_attr = qn('w:eastAsia')
    base_style.element.rPr.rFonts.set(east_asia_attr, '宋体')

    return document
|
||||
|
||||
|
||||
class BaseCrawler(ABC):
|
||||
"""
|
||||
基础爬虫类
|
||||
@@ -546,7 +557,7 @@ class BaseCrawler(ABC):
|
||||
|
||||
# 保存 Word
|
||||
docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
|
||||
doc = Document()
|
||||
doc = _new_doc()
|
||||
doc.add_heading(title, 0)
|
||||
p = doc.add_paragraph()
|
||||
p.add_run(f"原文链接: {page_data['url']}").italic = True
|
||||
@@ -720,7 +731,7 @@ class BaseCrawler(ABC):
|
||||
print(f" Word 文档无需更新: {docx_path}")
|
||||
else:
|
||||
# 新建Word文档
|
||||
doc = Document()
|
||||
doc = _new_doc()
|
||||
doc.add_heading(f'{output_dir_name}全集', level=1)
|
||||
|
||||
# 添加摘要(只在新建时生成,复用Markdown部分生成的摘要)
|
||||
|
||||
Reference in New Issue
Block a user