From d257cbaed32a97ededa2979235316fc65981e73a Mon Sep 17 00:00:00 2001 From: Oo Date: Thu, 26 Mar 2026 09:39:07 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=91=98=E8=A6=81=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E5=8A=A0=E8=BD=BD=E4=B8=8E=E6=96=87=E6=A1=A3=E8=BE=93?= =?UTF-8?q?=E5=87=BA=E5=85=BC=E5=AE=B9=E6=80=A7=EF=BC=8C=E5=B9=B6=E8=A1=A5?= =?UTF-8?q?=E5=85=85=E6=9C=AC=E5=9C=B0=E9=85=8D=E7=BD=AE=E5=BF=BD=E7=95=A5?= =?UTF-8?q?=E8=A7=84=E5=88=99=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 通过引入 config.yaml.example 和环境变量覆盖提升可配置性,同时统一 Word 默认中文字体并忽略本地 config.yaml,避免敏感信息误提交。 Made-with: Cursor --- .gitignore | 14 +++++++ config.yaml.example | 8 ++++ zeroerr_crawler/base_crawler.py | 17 +++++++-- zeroerr_crawler/extract_abstract.py | 58 +++++++++++++++++++++++++---- 4 files changed, 86 insertions(+), 11 deletions(-) create mode 100755 config.yaml.example diff --git a/.gitignore b/.gitignore index f15ec3d..3cddd34 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,17 @@ output_post/ .DS_Store Thumbs.db +# 测试与工具缓存 +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ + +# 本地环境变量 +.env +.env.* + +# 项目本地配置(示例文件应保留跟踪) +config.yaml + diff --git a/config.yaml.example b/config.yaml.example new file mode 100755 index 0000000..413767b --- /dev/null +++ b/config.yaml.example @@ -0,0 +1,8 @@ +# 零差云控爬虫配置模板 +# 复制此文件为 config.yaml 并填入实际值 + +llm: + base_url: "https://yiming.zeroerr.team/v1" + api_key: "your-api-key-here" + model: "minimax-2.5" + max_tokens: 40960 diff --git a/zeroerr_crawler/base_crawler.py b/zeroerr_crawler/base_crawler.py index 56ddd1a..76e5bd9 100644 --- a/zeroerr_crawler/base_crawler.py +++ b/zeroerr_crawler/base_crawler.py @@ -11,8 +11,9 @@ import requests from bs4 import BeautifulSoup import markdownify from docx import Document -from docx.shared import Inches +from docx.shared import Inches, Pt from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.oxml.ns import qn from urllib.parse import 
urljoin from abc import ABC, abstractmethod @@ -22,6 +23,16 @@ from .extract_abstract import generate_abstract from .post_process import post_process_docx_headings +def _new_doc() -> Document: + """创建 Document 并设置中文字体,确保 WPS/Word 均能正确显示中文。""" + doc = Document() + normal = doc.styles['Normal'] + normal.font.name = 'Calibri' + normal.font.size = Pt(10.5) + normal.element.rPr.rFonts.set(qn('w:eastAsia'), '宋体') + return doc + + class BaseCrawler(ABC): """ 基础爬虫类 @@ -546,7 +557,7 @@ class BaseCrawler(ABC): # 保存 Word docx_path = os.path.join(self.output_dir, f"{safe_title}.docx") - doc = Document() + doc = _new_doc() doc.add_heading(title, 0) p = doc.add_paragraph() p.add_run(f"原文链接: {page_data['url']}").italic = True @@ -720,7 +731,7 @@ class BaseCrawler(ABC): print(f" Word 文档无需更新: {docx_path}") else: # 新建Word文档 - doc = Document() + doc = _new_doc() doc.add_heading(f'{output_dir_name}全集', level=1) # 添加摘要(只在新建时生成,复用Markdown部分生成的摘要) diff --git a/zeroerr_crawler/extract_abstract.py b/zeroerr_crawler/extract_abstract.py index f73ffae..3dd12df 100644 --- a/zeroerr_crawler/extract_abstract.py +++ b/zeroerr_crawler/extract_abstract.py @@ -3,12 +3,49 @@ 使用大模型生成文档摘要 """ +import os +import re from openai import OpenAI -# API 配置 -API_BASE_URL = "https://yiming.zeroerr.team/v1" -API_KEY = "sk-LX1g8KkG61S6eUaVD567C0C187D4452c90F9E6985cDf3586" -MODEL = "Yiming" + +def _load_llm_config() -> dict: + """从 config.yaml 加载 LLM 配置,环境变量可覆盖。""" + config = { + "base_url": "https://yiming.zeroerr.team/v1", + "api_key": "", + "model": "minimax-2.5", + "max_tokens": 40960, + } + + # 尝试从项目根目录 config.yaml 读取(与 0209 一致) + config_path = os.path.join(os.path.dirname(__file__), "..", "config.yaml") + if os.path.exists(config_path): + try: + import yaml + + with open(config_path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + llm = data.get("llm", {}) + config.update({k: v for k, v in llm.items() if v}) + except Exception: + # 保持静默降级,继续使用默认值/环境变量 + pass + + # 环境变量优先级更高 + 
config["base_url"] = os.environ.get("ZEROERR_LLM_BASE_URL", config["base_url"]) + config["api_key"] = os.environ.get("ZEROERR_LLM_API_KEY", config["api_key"]) + config["model"] = os.environ.get("ZEROERR_LLM_MODEL", config["model"]) + config["max_tokens"] = int( + os.environ.get("ZEROERR_LLM_MAX_TOKENS", config["max_tokens"]) + ) + return config + + +_LLM_CONFIG = _load_llm_config() +API_BASE_URL = _LLM_CONFIG["base_url"] +API_KEY = _LLM_CONFIG["api_key"] +MODEL = _LLM_CONFIG["model"] +MAX_TOKENS = _LLM_CONFIG["max_tokens"] def generate_abstract(all_pages: list[dict], category_name: str, index_url: str = None) -> str: @@ -26,6 +63,10 @@ def generate_abstract(all_pages: list[dict], category_name: str, index_url: str if not all_pages: return "" + if not API_KEY: + print(" 警告: 未设置 ZEROERR_LLM_API_KEY,跳过摘要生成") + return "" + try: # 构建文档内容(用于生成摘要) # 只使用标题和部分内容,避免内容过长 @@ -62,12 +103,13 @@ def generate_abstract(all_pages: list[dict], category_name: str, index_url: str response = client.chat.completions.create( model=MODEL, temperature=0.3, # 使用较低的温度值,保证摘要的准确性 - messages=[ - {"role": "user", "content": prompt} - ] + max_tokens=MAX_TOKENS, + messages=[{"role": "user", "content": prompt}], ) - + abstract_text = response.choices[0].message.content.strip() + # 过滤掉 <think>...</think> 推理过程 + abstract_text = re.sub(r"<think>.*?</think>\s*", "", abstract_text, flags=re.DOTALL).strip() # 构建链接列表 links_section = "\n\n**相关链接:**\n\n"