Initial commit: ZeroErr official-site crawler project
main.py (new file, 134 lines)
@@ -0,0 +1,134 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Main entry point for the ZeroErr official-site crawler.

Crawls the website and generates RAGFlow knowledge-base documents.

Usage:
    python main.py              # crawl all types
    python main.py case issue   # crawl only the given types
    python main.py --list       # list all available crawl types
"""

import time
import argparse

from zeroerr_crawler.config import CRAWL_TASKS, OUTPUT_DIR
from zeroerr_crawler.base_crawler import StandardCrawler
from zeroerr_crawler.product_crawler import ProductCrawler
from zeroerr_crawler.utils import ensure_dir

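# NOTE: zeroerr_crawler.config is not shown in this diff; this script only
# relies on it exposing OUTPUT_DIR and CRAWL_TASKS. A minimal sketch of the
# assumed shape (illustrative keys and values only -- the real task table
# lives in zeroerr_crawler/config.py):
#
#     OUTPUT_DIR = "output"
#     CRAWL_TASKS = {
#         "case":  {"name": "Application cases"},
#         "issue": {"name": "FAQ"},
#         "erob":  {"name": "eRob"},
#     }
#
# The "name" field is read by list_tasks() below, falling back to the key
# itself if absent.
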
def get_crawler_class(task_key: str):
    """
    Return the crawler class for the given task type.

    Args:
        task_key: Task key name.

    Returns:
        The crawler class to instantiate.
    """
    # Product pages use a dedicated crawler
    if task_key in ['erob', 'ecoder', 'tools']:
        return ProductCrawler
    return StandardCrawler

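# NOTE: StandardCrawler and ProductCrawler are defined in other modules and
# not shown in this diff. The only contract main.py relies on is that each
# class is constructed from a task config and exposes run(). A sketch of that
# assumed interface (not the real implementation):
#
#     class StandardCrawler:
#         def __init__(self, config: dict) -> None: ...
#         def run(self) -> None: ...  # crawl pages and write output files
#
#     class ProductCrawler:
#         ...  # same interface, with product-specific page parsing
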
def run_tasks(task_keys: list[str]):
    """
    Run the given crawl tasks.

    Args:
        task_keys: List of task key names.
    """
    ensure_dir(OUTPUT_DIR)

    start_time = time.time()
    total_tasks = len(task_keys)
    completed = 0

    print(f"\n{'#'*60}")
    print("# ZeroErr official-site crawler")
    print(f"# {total_tasks} task(s) queued")
    print(f"{'#'*60}")

    for key in task_keys:
        if key not in CRAWL_TASKS:
            print(f"\nWarning: unknown task type '{key}', skipping")
            continue

        config = CRAWL_TASKS[key]
        crawler_class = get_crawler_class(key)

        # Each task runs in its own try/except, so a single failure is
        # logged and the remaining tasks still run.
        try:
            crawler = crawler_class(config)
            crawler.run()
            completed += 1
        except Exception as e:
            print(f"\nError: task '{key}' failed: {e}")

    elapsed = time.time() - start_time
    print(f"\n{'#'*60}")
    print("# Crawl finished!")
    print(f"# Succeeded: {completed}/{total_tasks} task(s)")
    print(f"# Elapsed: {elapsed:.1f} s")
    print(f"# Output directory: {OUTPUT_DIR}/")
    print(f"{'#'*60}")

def list_tasks():
    """List all available crawl tasks."""
    print("\nAvailable crawl tasks:")
    print("-" * 40)
    for key, config in CRAWL_TASKS.items():
        name = config.get("name", key)
        print(f"  {key:15} - {name}")
    print("-" * 40)
    print("\nExamples:")
    print("  python main.py              # crawl everything")
    print("  python main.py case issue   # crawl only application cases and FAQs")

def main():
    parser = argparse.ArgumentParser(
        description='ZeroErr official-site crawler - generates RAGFlow knowledge-base documents',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  python main.py              crawl all page types
  python main.py case issue   crawl only application cases and FAQs
  python main.py --list       list all available crawl types
'''
    )

    parser.add_argument(
        'tasks',
        nargs='*',
        help='task types to crawl (all types if omitted)'
    )

    parser.add_argument(
        '--list', '-l',
        action='store_true',
        help='list all available crawl task types'
    )

    args = parser.parse_args()

    if args.list:
        list_tasks()
        return

    # Decide which tasks to run
    if args.tasks:
        task_keys = args.tasks
    else:
        task_keys = list(CRAWL_TASKS.keys())

    run_tasks(task_keys)

if __name__ == "__main__":
    main()