优化摘要配置加载与文档输出兼容性,并补充本地配置忽略规则。

通过引入 config.yaml.example 和环境变量覆盖提升可配置性,同时统一 Word 默认中文字体并忽略本地 config.yaml,避免敏感信息误提交。

Made-with: Cursor
This commit is contained in:
Oo
2026-03-26 09:39:07 +08:00
parent dbe9ba3629
commit d257cbaed3
4 changed files with 86 additions and 11 deletions

14
.gitignore vendored
View File

@@ -43,3 +43,17 @@ output_post/
.DS_Store .DS_Store
Thumbs.db Thumbs.db
# 测试与工具缓存
.pytest_cache/
.mypy_cache/
.ruff_cache/
.coverage
htmlcov/
# 本地环境变量
.env
.env.*
# 项目本地配置(示例文件应保留跟踪)
config.yaml

8
config.yaml.example Executable file
View File

@@ -0,0 +1,8 @@
# 零差云控爬虫配置模板
# 复制此文件为 config.yaml 并填入实际值
llm:
base_url: "https://yiming.zeroerr.team/v1"
api_key: "your-api-key-here"
model: "minimax-2.5"
max_tokens: 40960

View File

@@ -11,8 +11,9 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import markdownify import markdownify
from docx import Document from docx import Document
from docx.shared import Inches from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from urllib.parse import urljoin from urllib.parse import urljoin
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
@@ -22,6 +23,16 @@ from .extract_abstract import generate_abstract
from .post_process import post_process_docx_headings from .post_process import post_process_docx_headings
def _new_doc() -> Document:
    """Create a Document whose Normal style renders Chinese correctly.

    Configures the Normal style with Calibri 10.5pt for Latin text and
    SimSun (宋体) as the East Asian font so both WPS and Word display
    Chinese characters properly.

    Returns:
        Document: a new python-docx document with the Normal style set up.
    """
    doc = Document()
    normal = doc.styles['Normal']
    normal.font.name = 'Calibri'
    normal.font.size = Pt(10.5)
    # Use get_or_add_* instead of touching .rPr/.rFonts directly: the bare
    # attributes are only non-None because the font assignments above happen
    # to create them, and would break if that order ever changed.
    rfonts = normal.element.get_or_add_rPr().get_or_add_rFonts()
    rfonts.set(qn('w:eastAsia'), '宋体')
    return doc
class BaseCrawler(ABC): class BaseCrawler(ABC):
""" """
基础爬虫类 基础爬虫类
@@ -546,7 +557,7 @@ class BaseCrawler(ABC):
# 保存 Word # 保存 Word
docx_path = os.path.join(self.output_dir, f"{safe_title}.docx") docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
doc = Document() doc = _new_doc()
doc.add_heading(title, 0) doc.add_heading(title, 0)
p = doc.add_paragraph() p = doc.add_paragraph()
p.add_run(f"原文链接: {page_data['url']}").italic = True p.add_run(f"原文链接: {page_data['url']}").italic = True
@@ -720,7 +731,7 @@ class BaseCrawler(ABC):
print(f" Word 文档无需更新: {docx_path}") print(f" Word 文档无需更新: {docx_path}")
else: else:
# 新建Word文档 # 新建Word文档
doc = Document() doc = _new_doc()
doc.add_heading(f'{output_dir_name}全集', level=1) doc.add_heading(f'{output_dir_name}全集', level=1)
# 添加摘要只在新建时生成复用Markdown部分生成的摘要 # 添加摘要只在新建时生成复用Markdown部分生成的摘要

View File

@@ -3,12 +3,49 @@
使用大模型生成文档摘要 使用大模型生成文档摘要
""" """
import os
import re
from openai import OpenAI from openai import OpenAI
# API 配置
API_BASE_URL = "https://yiming.zeroerr.team/v1" def _load_llm_config() -> dict:
API_KEY = "sk-LX1g8KkG61S6eUaVD567C0C187D4452c90F9E6985cDf3586" """从 config.yaml 加载 LLM 配置,环境变量可覆盖。"""
MODEL = "Yiming" config = {
"base_url": "https://yiming.zeroerr.team/v1",
"api_key": "",
"model": "minimax-2.5",
"max_tokens": 40960,
}
# 尝试从项目根目录 config.yaml 读取(与 0209 一致)
config_path = os.path.join(os.path.dirname(__file__), "..", "config.yaml")
if os.path.exists(config_path):
try:
import yaml
with open(config_path, "r", encoding="utf-8") as f:
data = yaml.safe_load(f) or {}
llm = data.get("llm", {})
config.update({k: v for k, v in llm.items() if v})
except Exception:
# 保持静默降级,继续使用默认值/环境变量
pass
# 环境变量优先级更高
config["base_url"] = os.environ.get("ZEROERR_LLM_BASE_URL", config["base_url"])
config["api_key"] = os.environ.get("ZEROERR_LLM_API_KEY", config["api_key"])
config["model"] = os.environ.get("ZEROERR_LLM_MODEL", config["model"])
config["max_tokens"] = int(
os.environ.get("ZEROERR_LLM_MAX_TOKENS", config["max_tokens"])
)
return config
_LLM_CONFIG = _load_llm_config()
API_BASE_URL = _LLM_CONFIG["base_url"]
API_KEY = _LLM_CONFIG["api_key"]
MODEL = _LLM_CONFIG["model"]
MAX_TOKENS = _LLM_CONFIG["max_tokens"]
def generate_abstract(all_pages: list[dict], category_name: str, index_url: str = None) -> str: def generate_abstract(all_pages: list[dict], category_name: str, index_url: str = None) -> str:
@@ -26,6 +63,10 @@ def generate_abstract(all_pages: list[dict], category_name: str, index_url: str
if not all_pages: if not all_pages:
return "" return ""
if not API_KEY:
print(" 警告: 未设置 ZEROERR_LLM_API_KEY跳过摘要生成")
return ""
try: try:
# 构建文档内容(用于生成摘要) # 构建文档内容(用于生成摘要)
# 只使用标题和部分内容,避免内容过长 # 只使用标题和部分内容,避免内容过长
@@ -62,12 +103,13 @@ def generate_abstract(all_pages: list[dict], category_name: str, index_url: str
response = client.chat.completions.create( response = client.chat.completions.create(
model=MODEL, model=MODEL,
temperature=0.3, # 使用较低的温度值,保证摘要的准确性 temperature=0.3, # 使用较低的温度值,保证摘要的准确性
messages=[ max_tokens=MAX_TOKENS,
{"role": "user", "content": prompt} messages=[{"role": "user", "content": prompt}],
]
) )
abstract_text = response.choices[0].message.content.strip() abstract_text = response.choices[0].message.content.strip()
# 过滤掉 <think>...</think> 推理过程
abstract_text = re.sub(r"<think>.*?</think>\s*", "", abstract_text, flags=re.DOTALL).strip()
# 构建链接列表 # 构建链接列表
links_section = "\n\n**相关链接:**\n\n" links_section = "\n\n**相关链接:**\n\n"