"""Crawler configuration.

Defines the shared request settings and the configuration of every crawl
task.
"""

# Site root. NOTE(review): the task URLs below all start with "/" and are
# presumably joined onto this value by the crawler -- confirm against caller.
BASE_URL = "https://www.zeroerr.cn"

# Request header configuration.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

# Delay between requests, in seconds, to avoid overloading the server.
REQUEST_DELAY = 0.5

# Output directory.
OUTPUT_DIR = "output"

# Crawl task configuration.
# Each task defines: a name, an index page, link filtering rules, a content
# selector and a title selector.
#
# Two entry shapes appear below:
#   * index-driven tasks: "index_url", "link_pattern", "link_suffix" and
#     "exclude_patterns";
#   * static tasks: "static_pages", an explicit list of page paths.
# Both shapes carry "name", "content_selector", "title_selector" and
# "title_index". "title_index" looks 0-based: 1 is annotated below as
# "the second h1 tag".
CRAWL_TASKS = {
    # Application cases
    "case": {
        "name": "应用案例",
        "index_url": "/case/index.html",
        "link_pattern": "/case/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,  # use the second h1 tag
    },
    # Frequently asked questions
    "issue": {
        "name": "常见问题",
        "index_url": "/issue/index.html",
        "link_pattern": "/issue/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Company news
    "news": {
        "name": "企业新闻",
        "index_url": "/news/index.html",
        "link_pattern": "/news/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Certifications and qualifications
    "certification": {
        "name": "认证与资质",
        "index_url": "/Certification/index.html",
        "link_pattern": "/Certification/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        "content_selector": "div.news_text_p",
        "title_selector": "h1",
        "title_index": 1,
    },
    # Robot joint products
    "erob": {
        "name": "机器人关节",
        "index_url": "/eRob/index.html",
        "link_pattern": "/eRob/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        # Left-hand / overall content area of the product page.
        "content_selector": "div.product_text_l,div.product_text",
        "title_selector": "h1",
        "title_index": 0,
    },
    # Encoder products
    "ecoder": {
        "name": "编码器",
        "index_url": "/eCoder/index.html",
        "link_pattern": "/eCoder/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        # Left-hand / overall content area of the product page.
        "content_selector": "div.product_text_l,div.product_text",
        "title_selector": "h1",
        "title_index": 0,
    },
    # Accessories
    "tools": {
        "name": "配件",
        "index_url": "/Tools/index.html",
        "link_pattern": "/Tools/",
        "link_suffix": ".html",
        "exclude_patterns": ["index.html"],
        # These pages come in several different layouts.
        "content_selector": "div.product_text_l,div.product_text_l1,div.product_text,div.news_text_p,div.eLiner_banner,div.web_cable_container",
        "title_selector": "h1,h2",  # some pages use h2 for the title
        "title_index": 0,
    },
    # Static pages such as "About us"
    "about": {
        "name": "关于我们",
        "static_pages": [
            "/about/about-us.html",
            "/about/contact-us.html",
            "/about/join-us.html",
            "/about/152.html",  # agent recruitment
        ],
        # Multi-region layout.
        "content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact",
        "title_selector": "h1,h2",
        "title_index": 0,
    },
    # Service and support (single page; its content is scraped directly)
    "support": {
        "name": "服务与支持",
        "static_pages": [
            "/support/",  # the main page contains all of the content
        ],
        "content_selector": "div.sidebar_container,div.content,div.content-section,div.news_text_p",
        "title_selector": "h2",
        "title_index": 0,
    },
    # Downloads (static page)
    "download": {
        "name": "资料下载",
        "static_pages": [
            "/download/77.html",  # download instructions page
        ],
        "content_selector": "div.news_text_p,div.news_text",
        "title_selector": "h1",
        "title_index": 0,
    },
}