""" 爬虫配置文件 定义所有爬取任务的配置 """ BASE_URL = "https://www.zeroerr.cn" # 请求头配置 HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' } # 请求延迟(秒),保护服务器 REQUEST_DELAY = 0.5 # 输出目录 OUTPUT_DIR = "output" # 爬取任务配置 # 每个任务定义:名称、索引页、链接筛选规则、内容选择器、标题选择器 CRAWL_TASKS = { # 应用案例 "case": { "name": "应用案例", "index_url": "/case/index.html", "link_pattern": "/case/", "link_suffix": ".html", "exclude_patterns": ["index.html"], "content_selector": "div.news_text_p", "title_selector": "h1", "title_index": 1, # 使用第二个h1标签 }, # 常见问题 "issue": { "name": "常见问题", "index_url": "/issue/index.html", "link_pattern": "/issue/", "link_suffix": ".html", "exclude_patterns": ["index.html"], "content_selector": "div.news_text_p", "title_selector": "h1", "title_index": 1, }, # 企业新闻 "news": { "name": "企业新闻", "index_url": "/news/index.html", "link_pattern": "/news/", "link_suffix": ".html", "exclude_patterns": ["index.html"], "content_selector": "div.news_text_p", "title_selector": "h1", "title_index": 1, }, # 认证与资质 "certification": { "name": "认证与资质", "index_url": "/Certification/index.html", "link_pattern": "/Certification/", "link_suffix": ".html", "exclude_patterns": ["index.html"], "content_selector": "div.news_text_p", "title_selector": "h1", "title_index": 1, }, # 机器人关节产品 "erob": { "name": "机器人关节", "index_url": "/eRob/index.html", "link_pattern": "/eRob/", "link_suffix": ".html", "exclude_patterns": ["index.html"], "content_selector": "div.product_text_l,div.product_text", # 产品页面左侧/整体内容区 "title_selector": "h1", "title_index": 0, }, # 编码器产品 "ecoder": { "name": "编码器", "index_url": "/eCoder/index.html", "link_pattern": "/eCoder/", "link_suffix": ".html", "exclude_patterns": ["index.html"], "content_selector": "div.product_text_l,div.product_text", # 产品页面左侧/整体内容区 "title_selector": "h1", "title_index": 0, }, # 配件 "tools": { "name": "配件", "index_url": "/Tools/index.html", "link_pattern": "/Tools/", "link_suffix": ".html", "exclude_patterns": ["index.html"], "content_selector": "div.product_text_l,div.product_text_l1,div.product_text,div.news_text_p,div.eLiner_banner,div.web_cable_container", # 多种布局 "title_selector": "h1,h2", # 部分页面标题用h2 "title_index": 0, }, # 关于我们等静态页面 "about": { "name": "关于我们", "static_pages": [ "/about/about-us.html", "/about/contact-us.html", "/about/join-us.html", "/about/152.html", # 诚招代理 ], "content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact", # 多区域布局 "title_selector": "h1,h2", "title_index": 0, }, # 服务与支持(单页面,直接抓取内容) "support": { "name": "服务与支持", "static_pages": [ "/support/", # 主页面包含所有内容 ], "content_selector": "div.sidebar_container,div.content,div.content-section,div.news_text_p", "title_selector": "h2", "title_index": 0, }, # 资料下载(静态页面) "download": { "name": "资料下载", "static_pages": [ "/download/77.html", # 资料下载说明页 ], "content_selector": "div.news_text_p,div.news_text", "title_selector": "h1", "title_index": 0, }, }