更新爬虫方案文档,增加摘要提取模块以生成文档摘要;优化基础爬虫类的标题提取逻辑,支持多个选择器,调整内容处理逻辑以去除重复标题。

新增文件: zeroerr_crawler/extract_abstract.py(91 行)
"""Abstract extraction module.

Uses a large language model (LLM) to generate document summaries.
"""

import os

from openai import OpenAI

# API configuration.
# SECURITY(review): the fallback API key below is committed to source
# control — it should be rotated and supplied only via the
# ZEROERR_API_KEY environment variable; the fallback exists solely for
# backward compatibility with existing deployments.
API_BASE_URL = "https://yiming.zeroerr.team/v1"
API_KEY = os.environ.get(
    "ZEROERR_API_KEY",
    "sk-LX1g8KkG61S6eUaVD567C0C187D4452c90F9E6985cDf3586",
)
MODEL = "Yiming"
def _build_links_section(all_pages: list[dict]) -> str:
    """Build the Markdown "related links" section for *all_pages*.

    Args:
        all_pages: page dicts; 'title' defaults to "未命名" and 'url'
            to "" when missing.

    Returns:
        A header followed by one numbered Markdown link per page.
    """
    parts = ["\n\n**相关链接:**\n\n"]
    for i, page in enumerate(all_pages, 1):
        title = page.get('title', '未命名')
        url = page.get('url', '')
        parts.append(f"{i}. [{title}]({url})\n")
    return "".join(parts)


def generate_abstract(all_pages: list[dict], category_name: str) -> str:
    """Generate a document-collection abstract with a large language model.

    Args:
        all_pages: list of page data dicts; each element carries fields
            such as 'title', 'url' and 'markdown'.
        category_name: document category name (e.g. "应用案例").

    Returns:
        Abstract text in Markdown followed by a numbered link list.
        Returns "" when *all_pages* is empty. On any API failure the
        link list alone is returned — this function is best-effort and
        never raises.
    """
    if not all_pages:
        return ""

    try:
        # Build the model input from each page's title plus a content
        # preview only, to keep the prompt within a reasonable length.
        content_parts = []
        for page in all_pages:
            title = page.get('title', '')
            markdown = page.get('markdown', '')
            # Slicing is safe even when the text is shorter than 500 chars.
            content_preview = markdown[:500]
            content_parts.append(f"标题:{title}\n内容预览:{content_preview}")

        document_content = "\n\n".join(content_parts)

        # Build the prompt (kept verbatim — it is part of the model contract).
        prompt = f"""面向客户售前咨询,请为以下"{category_name}"类别的文档集合生成一个简洁的摘要。

文档内容:
{document_content}

要求:
1. 摘要应概括该页面的主题和主要内容
2. 摘要长度控制在100-200字之间
3. 使用简洁、专业的语言
4. 突出该页面主题的价值和特点

请直接输出摘要内容,不要包含其他说明文字。"""

        # Call the LLM API.
        client = OpenAI(
            base_url=API_BASE_URL,
            api_key=API_KEY
        )

        response = client.chat.completions.create(
            model=MODEL,
            temperature=0.3,  # low temperature keeps the abstract factual
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        abstract_text = response.choices[0].message.content.strip()

        # Combine the abstract with the link list.
        return f"{abstract_text}{_build_links_section(all_pages)}"

    except Exception as e:
        # Best-effort fallback: log and still emit the navigable link
        # list so the crawl output stays useful when the model call fails.
        print(f" 警告: 生成摘要失败: {e}")
        return _build_links_section(all_pages)
Reference in New Issue
Block a user