#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
零差云控官网爬虫主程序
爬取网站内容生成 RAGFlow 知识库文档

使用方法:
    python main.py              # 爬取所有类型
    python main.py case issue   # 只爬取指定类型
    python main.py --list       # 列出所有可用的爬取类型
"""
|
# Standard library
import argparse
import sys
import time

# Project modules
from zeroerr_crawler.config import CRAWL_TASKS, OUTPUT_DIR
from zeroerr_crawler.base_crawler import StandardCrawler
from zeroerr_crawler.product_crawler import ProductCrawler
from zeroerr_crawler.utils import ensure_dir
|
def get_crawler_class(task_key: str):
    """Return the crawler class appropriate for a task key.

    Args:
        task_key: Key of the crawl task (see CRAWL_TASKS).

    Returns:
        ProductCrawler for product-page tasks, StandardCrawler otherwise.
    """
    # Product pages need the dedicated crawler; everything else is standard.
    product_tasks = {'erob', 'ecoder', 'tools'}
    return ProductCrawler if task_key in product_tasks else StandardCrawler
|
def run_tasks(task_keys: list[str]):
    """Execute the named crawl tasks and print a summary banner.

    Args:
        task_keys: Keys into CRAWL_TASKS; unknown keys are reported
            and skipped rather than aborting the whole batch.
    """
    ensure_dir(OUTPUT_DIR)

    started = time.time()
    total = len(task_keys)
    succeeded = 0
    banner = '#' * 60

    print(f"\n{banner}")
    print("# 零差云控官网爬虫")
    print(f"# 共 {total} 个任务待执行")
    print(banner)

    for key in task_keys:
        if key not in CRAWL_TASKS:
            print(f"\n警告: 未知的任务类型 '{key}',跳过")
            continue

        crawler_cls = get_crawler_class(key)
        try:
            # A single failing task is logged but does not stop the batch.
            crawler_cls(CRAWL_TASKS[key]).run()
            succeeded += 1
        except Exception as e:
            print(f"\n错误: 任务 '{key}' 执行失败: {e}")

    elapsed = time.time() - started
    print(f"\n{banner}")
    print("# 爬取完成!")
    print(f"# 成功: {succeeded}/{total} 个任务")
    print(f"# 耗时: {elapsed:.1f} 秒")
    print(f"# 输出目录: {OUTPUT_DIR}/")
    print(banner)
|
def list_tasks():
    """Print every available crawl task plus a short usage example."""
    divider = "-" * 40
    print("\n可用的爬取任务:")
    print(divider)
    for key, task_config in CRAWL_TASKS.items():
        # Fall back to the key itself when the config has no display name.
        label = task_config.get("name", key)
        print(f"  {key:15} - {label}")
    print(divider)
    print("\n使用示例:")
    print("  python main.py              # 爬取所有")
    print("  python main.py case issue   # 只爬取应用案例和常见问题")
|
def main():
    """Parse command-line arguments and dispatch the crawl tasks."""
    parser = argparse.ArgumentParser(
        description='零差云控官网爬虫 - 生成 RAGFlow 知识库文档',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
示例:
  python main.py              爬取所有页面类型
  python main.py case issue   只爬取应用案例和常见问题
  python main.py --list       列出所有可用的爬取类型
''',
    )
    parser.add_argument(
        'tasks',
        nargs='*',
        help='要爬取的任务类型(不指定则爬取所有)',
    )
    parser.add_argument(
        '--list', '-l',
        action='store_true',
        help='列出所有可用的爬取任务类型',
    )
    args = parser.parse_args()

    if args.list:
        list_tasks()
        return

    # No positional tasks given means "crawl everything configured".
    run_tasks(args.tasks or list(CRAWL_TASKS.keys()))
|
if __name__ == "__main__":
|
|
main()
|
|
|