更新爬虫方案文档,增加服务与支持-详细页面的输出信息;优化基础爬虫类,增强标题提取和内容去重逻辑;根据doc2md.py调整图片处理逻辑以改善Word文档生成效果。
This commit is contained in:
@@ -102,8 +102,6 @@ CRAWL_TASKS = {
|
||||
"static_pages": [
|
||||
"/about/about-us.html",
|
||||
"/about/contact-us.html",
|
||||
"/about/join-us.html",
|
||||
"/about/152.html", # 诚招代理
|
||||
],
|
||||
"content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact", # 多区域布局
|
||||
"title_selector": "h1,h2",
|
||||
@@ -129,5 +127,17 @@ CRAWL_TASKS = {
|
||||
"title_selector": "h1",
|
||||
"title_index": 0,
|
||||
},
|
||||
# 服务与支持详细页面(从索引页提取)
|
||||
"service_detail": {
|
||||
"name": "服务与支持-详细页面",
|
||||
"output_dir": "服务与支持", # 输出到同一个目录
|
||||
"index_url": "/Service/index.html",
|
||||
"link_pattern": "/Service/",
|
||||
"link_suffix": ".html",
|
||||
"exclude_patterns": ["index.html"],
|
||||
"content_selector": "div.news_text_p,div.news_text,div.content,div.content-section", # 多种布局支持
|
||||
"title_selector": "h1",
|
||||
"title_index": 1,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user