From d257cbaed32a97ededa2979235316fc65981e73a Mon Sep 17 00:00:00 2001 From: Oo Date: Thu, 26 Mar 2026 09:39:07 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=91=98=E8=A6=81=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E5=8A=A0=E8=BD=BD=E4=B8=8E=E6=96=87=E6=A1=A3=E8=BE=93?= =?UTF-8?q?=E5=87=BA=E5=85=BC=E5=AE=B9=E6=80=A7=EF=BC=8C=E5=B9=B6=E8=A1=A5?= =?UTF-8?q?=E5=85=85=E6=9C=AC=E5=9C=B0=E9=85=8D=E7=BD=AE=E5=BF=BD=E7=95=A5?= =?UTF-8?q?=E8=A7=84=E5=88=99=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 通过引入 config.yaml.example 和环境变量覆盖提升可配置性,同时统一 Word 默认中文字体并忽略本地 config.yaml,避免敏感信息误提交。 Made-with: Cursor --- .gitignore | 14 +++++++ config.yaml.example | 8 ++++ zeroerr_crawler/base_crawler.py | 17 +++++++-- zeroerr_crawler/extract_abstract.py | 58 +++++++++++++++++++++++++---- 4 files changed, 86 insertions(+), 11 deletions(-) create mode 100755 config.yaml.example diff --git a/.gitignore b/.gitignore index f15ec3d..3cddd34 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,17 @@ output_post/ .DS_Store Thumbs.db +# 测试与工具缓存 +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ + +# 本地环境变量 +.env +.env.* + +# 项目本地配置(示例文件应保留跟踪) +config.yaml + diff --git a/config.yaml.example b/config.yaml.example new file mode 100755 index 0000000..413767b --- /dev/null +++ b/config.yaml.example @@ -0,0 +1,8 @@ +# 零差云控爬虫配置模板 +# 复制此文件为 config.yaml 并填入实际值 + +llm: + base_url: "https://yiming.zeroerr.team/v1" + api_key: "your-api-key-here" + model: "minimax-2.5" + max_tokens: 40960 diff --git a/zeroerr_crawler/base_crawler.py b/zeroerr_crawler/base_crawler.py index 56ddd1a..76e5bd9 100644 --- a/zeroerr_crawler/base_crawler.py +++ b/zeroerr_crawler/base_crawler.py @@ -11,8 +11,9 @@ import requests from bs4 import BeautifulSoup import markdownify from docx import Document -from docx.shared import Inches +from docx.shared import Inches, Pt from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.oxml.ns import qn from urllib.parse import 
urljoin from abc import ABC, abstractmethod @@ -22,6 +23,16 @@ from .extract_abstract import generate_abstract from .post_process import post_process_docx_headings +def _new_doc() -> Document: + """创建 Document 并设置中文字体,确保 WPS/Word 均能正确显示中文。""" + doc = Document() + normal = doc.styles['Normal'] + normal.font.name = 'Calibri' + normal.font.size = Pt(10.5) + normal.element.rPr.rFonts.set(qn('w:eastAsia'), '宋体') + return doc + + class BaseCrawler(ABC): """ 基础爬虫类 @@ -546,7 +557,7 @@ class BaseCrawler(ABC): # 保存 Word docx_path = os.path.join(self.output_dir, f"{safe_title}.docx") - doc = Document() + doc = _new_doc() doc.add_heading(title, 0) p = doc.add_paragraph() p.add_run(f"原文链接: {page_data['url']}").italic = True @@ -720,7 +731,7 @@ class BaseCrawler(ABC): print(f" Word 文档无需更新: {docx_path}") else: # 新建Word文档 - doc = Document() + doc = _new_doc() doc.add_heading(f'{output_dir_name}全集', level=1) # 添加摘要(只在新建时生成,复用Markdown部分生成的摘要) diff --git a/zeroerr_crawler/extract_abstract.py b/zeroerr_crawler/extract_abstract.py index f73ffae..3dd12df 100644 --- a/zeroerr_crawler/extract_abstract.py +++ b/zeroerr_crawler/extract_abstract.py @@ -3,12 +3,49 @@ 使用大模型生成文档摘要 """ +import os +import re from openai import OpenAI -# API 配置 -API_BASE_URL = "https://yiming.zeroerr.team/v1" -API_KEY = "sk-LX1g8KkG61S6eUaVD567C0C187D4452c90F9E6985cDf3586" -MODEL = "Yiming" + +def _load_llm_config() -> dict: + """从 config.yaml 加载 LLM 配置,环境变量可覆盖。""" + config = { + "base_url": "https://yiming.zeroerr.team/v1", + "api_key": "", + "model": "minimax-2.5", + "max_tokens": 40960, + } + + # 尝试从项目根目录 config.yaml 读取(与 0209 一致) + config_path = os.path.join(os.path.dirname(__file__), "..", "config.yaml") + if os.path.exists(config_path): + try: + import yaml + + with open(config_path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + llm = data.get("llm", {}) + config.update({k: v for k, v in llm.items() if v}) + except Exception: + # 保持静默降级,继续使用默认值/环境变量 + pass + + # 环境变量优先级更高 + 
config["base_url"] = os.environ.get("ZEROERR_LLM_BASE_URL", config["base_url"]) + config["api_key"] = os.environ.get("ZEROERR_LLM_API_KEY", config["api_key"]) + config["model"] = os.environ.get("ZEROERR_LLM_MODEL", config["model"]) + config["max_tokens"] = int( + os.environ.get("ZEROERR_LLM_MAX_TOKENS", config["max_tokens"]) + ) + return config + + +_LLM_CONFIG = _load_llm_config() +API_BASE_URL = _LLM_CONFIG["base_url"] +API_KEY = _LLM_CONFIG["api_key"] +MODEL = _LLM_CONFIG["model"] +MAX_TOKENS = _LLM_CONFIG["max_tokens"] def generate_abstract(all_pages: list[dict], category_name: str, index_url: str = None) -> str: @@ -26,6 +63,10 @@ def generate_abstract(all_pages: list[dict], category_name: str, index_url: str if not all_pages: return "" + if not API_KEY: + print(" 警告: 未设置 ZEROERR_LLM_API_KEY,跳过摘要生成") + return "" + try: # 构建文档内容(用于生成摘要) # 只使用标题和部分内容,避免内容过长 @@ -62,12 +103,13 @@ def generate_abstract(all_pages: list[dict], category_name: str, index_url: str response = client.chat.completions.create( model=MODEL, temperature=0.3, # 使用较低的温度值,保证摘要的准确性 - messages=[ - {"role": "user", "content": prompt} - ] + max_tokens=MAX_TOKENS, + messages=[{"role": "user", "content": prompt}], ) - + abstract_text = response.choices[0].message.content.strip() + # 过滤掉 <think>...</think> 推理过程 + abstract_text = re.sub(r"<think>.*?</think>\s*", "", abstract_text, flags=re.DOTALL).strip() # 构建链接列表 links_section = "\n\n**相关链接:**\n\n"