优化摘要配置加载与文档输出兼容性,并补充本地配置忽略规则。
通过引入 config.yaml.example 和环境变量覆盖提升可配置性,同时统一 Word 默认中文字体并忽略本地 config.yaml,避免敏感信息误提交。 Made-with: Cursor
This commit is contained in:
14
.gitignore
vendored
14
.gitignore
vendored
@@ -43,3 +43,17 @@ output_post/
|
|||||||
.DS_Store
|
.DS_Store
|
||||||
Thumbs.db
|
Thumbs.db
|
||||||
|
|
||||||
|
# 测试与工具缓存
|
||||||
|
.pytest_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.ruff_cache/
|
||||||
|
.coverage
|
||||||
|
htmlcov/
|
||||||
|
|
||||||
|
# 本地环境变量
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
|
||||||
|
# 项目本地配置(示例文件应保留跟踪)
|
||||||
|
config.yaml
|
||||||
|
|
||||||
|
|||||||
8
config.yaml.example
Executable file
8
config.yaml.example
Executable file
@@ -0,0 +1,8 @@
|
|||||||
|
# 零差云控爬虫配置模板
|
||||||
|
# 复制此文件为 config.yaml 并填入实际值
|
||||||
|
|
||||||
|
llm:
|
||||||
|
base_url: "https://yiming.zeroerr.team/v1"
|
||||||
|
api_key: "your-api-key-here"
|
||||||
|
model: "minimax-2.5"
|
||||||
|
max_tokens: 40960
|
||||||
@@ -11,8 +11,9 @@ import requests
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import markdownify
|
import markdownify
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.shared import Inches
|
from docx.shared import Inches, Pt
|
||||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||||
|
from docx.oxml.ns import qn
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
@@ -22,6 +23,16 @@ from .extract_abstract import generate_abstract
|
|||||||
from .post_process import post_process_docx_headings
|
from .post_process import post_process_docx_headings
|
||||||
|
|
||||||
|
|
||||||
|
def _new_doc() -> Document:
    """Create a blank Word document whose Normal style can render Chinese text.

    The Normal style is set to Calibri at 10.5 pt, and its East Asian font
    slot is set to 宋体 so that CJK characters are drawn with that typeface.

    Returns:
        The newly created python-docx document.
    """
    document = Document()
    base_style = document.styles['Normal']

    base_font = base_style.font
    base_font.name = 'Calibri'
    base_font.size = Pt(10.5)

    # The East Asian typeface is a separate attribute on the style's rFonts
    # element, so it is written directly. This must stay after the font.name
    # assignment above, as in the original statement order.
    run_fonts = base_style.element.rPr.rFonts
    run_fonts.set(qn('w:eastAsia'), '宋体')
    return document
|
||||||
|
|
||||||
|
|
||||||
class BaseCrawler(ABC):
|
class BaseCrawler(ABC):
|
||||||
"""
|
"""
|
||||||
基础爬虫类
|
基础爬虫类
|
||||||
@@ -546,7 +557,7 @@ class BaseCrawler(ABC):
|
|||||||
|
|
||||||
# 保存 Word
|
# 保存 Word
|
||||||
docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
|
docx_path = os.path.join(self.output_dir, f"{safe_title}.docx")
|
||||||
doc = Document()
|
doc = _new_doc()
|
||||||
doc.add_heading(title, 0)
|
doc.add_heading(title, 0)
|
||||||
p = doc.add_paragraph()
|
p = doc.add_paragraph()
|
||||||
p.add_run(f"原文链接: {page_data['url']}").italic = True
|
p.add_run(f"原文链接: {page_data['url']}").italic = True
|
||||||
@@ -720,7 +731,7 @@ class BaseCrawler(ABC):
|
|||||||
print(f" Word 文档无需更新: {docx_path}")
|
print(f" Word 文档无需更新: {docx_path}")
|
||||||
else:
|
else:
|
||||||
# 新建Word文档
|
# 新建Word文档
|
||||||
doc = Document()
|
doc = _new_doc()
|
||||||
doc.add_heading(f'{output_dir_name}全集', level=1)
|
doc.add_heading(f'{output_dir_name}全集', level=1)
|
||||||
|
|
||||||
# 添加摘要(只在新建时生成,复用Markdown部分生成的摘要)
|
# 添加摘要(只在新建时生成,复用Markdown部分生成的摘要)
|
||||||
|
|||||||
@@ -3,12 +3,49 @@
|
|||||||
使用大模型生成文档摘要
|
使用大模型生成文档摘要
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
|
|
||||||
# API 配置
|
|
||||||
API_BASE_URL = "https://yiming.zeroerr.team/v1"
|
def _load_llm_config() -> dict:
|
||||||
API_KEY = "sk-LX1g8KkG61S6eUaVD567C0C187D4452c90F9E6985cDf3586"
|
"""从 config.yaml 加载 LLM 配置,环境变量可覆盖。"""
|
||||||
MODEL = "Yiming"
|
config = {
|
||||||
|
"base_url": "https://yiming.zeroerr.team/v1",
|
||||||
|
"api_key": "",
|
||||||
|
"model": "minimax-2.5",
|
||||||
|
"max_tokens": 40960,
|
||||||
|
}
|
||||||
|
|
||||||
|
# 尝试从项目根目录 config.yaml 读取(与 0209 一致)
|
||||||
|
config_path = os.path.join(os.path.dirname(__file__), "..", "config.yaml")
|
||||||
|
if os.path.exists(config_path):
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
with open(config_path, "r", encoding="utf-8") as f:
|
||||||
|
data = yaml.safe_load(f) or {}
|
||||||
|
llm = data.get("llm", {})
|
||||||
|
config.update({k: v for k, v in llm.items() if v})
|
||||||
|
except Exception:
|
||||||
|
# 保持静默降级,继续使用默认值/环境变量
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 环境变量优先级更高
|
||||||
|
config["base_url"] = os.environ.get("ZEROERR_LLM_BASE_URL", config["base_url"])
|
||||||
|
config["api_key"] = os.environ.get("ZEROERR_LLM_API_KEY", config["api_key"])
|
||||||
|
config["model"] = os.environ.get("ZEROERR_LLM_MODEL", config["model"])
|
||||||
|
config["max_tokens"] = int(
|
||||||
|
os.environ.get("ZEROERR_LLM_MAX_TOKENS", config["max_tokens"])
|
||||||
|
)
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
_LLM_CONFIG = _load_llm_config()
|
||||||
|
API_BASE_URL = _LLM_CONFIG["base_url"]
|
||||||
|
API_KEY = _LLM_CONFIG["api_key"]
|
||||||
|
MODEL = _LLM_CONFIG["model"]
|
||||||
|
MAX_TOKENS = _LLM_CONFIG["max_tokens"]
|
||||||
|
|
||||||
|
|
||||||
def generate_abstract(all_pages: list[dict], category_name: str, index_url: str = None) -> str:
|
def generate_abstract(all_pages: list[dict], category_name: str, index_url: str = None) -> str:
|
||||||
@@ -26,6 +63,10 @@ def generate_abstract(all_pages: list[dict], category_name: str, index_url: str
|
|||||||
if not all_pages:
|
if not all_pages:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
if not API_KEY:
|
||||||
|
print(" 警告: 未设置 ZEROERR_LLM_API_KEY,跳过摘要生成")
|
||||||
|
return ""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 构建文档内容(用于生成摘要)
|
# 构建文档内容(用于生成摘要)
|
||||||
# 只使用标题和部分内容,避免内容过长
|
# 只使用标题和部分内容,避免内容过长
|
||||||
@@ -62,12 +103,13 @@ def generate_abstract(all_pages: list[dict], category_name: str, index_url: str
|
|||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=MODEL,
|
model=MODEL,
|
||||||
temperature=0.3, # 使用较低的温度值,保证摘要的准确性
|
temperature=0.3, # 使用较低的温度值,保证摘要的准确性
|
||||||
messages=[
|
max_tokens=MAX_TOKENS,
|
||||||
{"role": "user", "content": prompt}
|
messages=[{"role": "user", "content": prompt}],
|
||||||
]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
abstract_text = response.choices[0].message.content.strip()
|
abstract_text = response.choices[0].message.content.strip()
|
||||||
|
# 过滤掉 <think>...</think> 推理过程
|
||||||
|
abstract_text = re.sub(r"<think>.*?</think>\s*", "", abstract_text, flags=re.DOTALL).strip()
|
||||||
|
|
||||||
# 构建链接列表
|
# 构建链接列表
|
||||||
links_section = "\n\n**相关链接:**\n\n"
|
links_section = "\n\n**相关链接:**\n\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user