Files
crawl4zeroerr/zeroerr_crawler/config.py
2026-01-29 17:29:14 +08:00

134 lines
4.2 KiB
Python

"""
Crawler configuration file.
Defines the configuration for all crawl tasks.
"""
# Site root URL.
BASE_URL = "https://www.zeroerr.cn"

# HTTP request headers.
HEADERS = {
    # Adjacent literals are concatenated at compile time into a single
    # User-Agent string; the split only keeps the line length reasonable.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

# Delay between requests, in seconds, to avoid overloading the server.
REQUEST_DELAY = 0.5

# Output directory.
OUTPUT_DIR = "output"
# Crawl task definitions.
# Each task specifies: a display name, an index page, link filter rules,
# a content selector and a title selector. Tasks without an index page list
# their pages explicitly under "static_pages" instead.
# Long selector lists are written one selector per line; adjacent string
# literals are joined at compile time into a single comma-separated string.
CRAWL_TASKS = {
    # Application cases
    "case": {
        "name": "应用案例",
        "index_url": "/case/index.html",
        "link_pattern": "/case/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,  # use the second h1 tag
    },

    # Frequently asked questions
    "issue": {
        "name": "常见问题",
        "index_url": "/issue/index.html",
        "link_pattern": "/issue/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },

    # Company news
    "news": {
        "name": "企业新闻",
        "index_url": "/news/index.html",
        "link_pattern": "/news/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },

    # Certifications and qualifications
    "certification": {
        "name": "认证与资质",
        "index_url": "/Certification/index.html",
        "link_pattern": "/Certification/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },

    # Robot joint products
    "erob": {
        "name": "机器人关节",
        "index_url": "/eRob/index.html",
        "link_pattern": "/eRob/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        # Product page: left-hand column / overall content area.
        "content_selector": "div.product_text_l,div.product_text",
        "title_selector": "h1",
        "title_index": 0,
    },

    # Encoder products
    "ecoder": {
        "name": "编码器",
        "index_url": "/eCoder/index.html",
        "link_pattern": "/eCoder/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        # Product page: left-hand column / overall content area.
        "content_selector": "div.product_text_l,div.product_text",
        "title_selector": "h1",
        "title_index": 0,
    },

    # Accessories
    "tools": {
        "name": "配件",
        "index_url": "/Tools/index.html",
        "link_pattern": "/Tools/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        # These pages come in several different layouts.
        "content_selector": (
            "div.product_text_l,"
            "div.product_text_l1,"
            "div.product_text,"
            "div.news_text_p,"
            "div.eLiner_banner,"
            "div.web_cable_container"
        ),
        "title_selector": "h1,h2",  # some pages put the title in an h2
        "title_index": 0,
    },

    # Static pages such as "About us"
    "about": {
        "name": "关于我们",
        "static_pages": [
            "/about/about-us.html",
            "/about/contact-us.html",
            "/about/join-us.html",
            "/about/152.html",  # agent recruitment page
        ],
        # Multi-region layout.
        "content_selector": (
            "div.about_us1,"
            "div.page-title,"
            "div.about_company,"
            "div.contact_us,"
            "div.web_contact"
        ),
        "title_selector": "h1,h2",
        "title_index": 0,
    },

    # Service and support (single page, content is scraped directly)
    "support": {
        "name": "服务与支持",
        "static_pages": [
            "/support/",  # the main page contains all of the content
        ],
        "content_selector": (
            "div.sidebar_container,"
            "div.content,"
            "div.content-section,"
            "div.news_text_p"
        ),
        "title_selector": "h2",
        "title_index": 0,
    },

    # Downloads (static page)
    "download": {
        "name": "资料下载",
        "static_pages": [
            "/download/77.html",  # download instructions page
        ],
        "content_selector": "div.news_text_p,div.news_text",
        "title_selector": "h1",
        "title_index": 0,
    },
}