更新爬虫方案文档,增加服务与支持-详细页面的输出信息;优化基础爬虫类,增强标题提取和内容去重逻辑;根据 doc2md.py 调整图片处理逻辑以改善 Word 文档生成效果。

This commit is contained in:
oy2020
2026-01-31 09:30:33 +08:00
parent 3670129972
commit 3c625d1c3a
5 changed files with 332 additions and 58 deletions

View File

@@ -102,8 +102,6 @@ CRAWL_TASKS = {
"static_pages": [
"/about/about-us.html",
"/about/contact-us.html",
"/about/join-us.html",
"/about/152.html", # 诚招代理
],
"content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact", # 多区域布局
"title_selector": "h1,h2",
@@ -129,5 +127,17 @@ CRAWL_TASKS = {
"title_selector": "h1",
"title_index": 0,
},
# Crawl task for the "Service & Support" detail pages (links extracted from the index page).
# NOTE(review): the code that consumes these keys is not visible here; the per-key
# notes below are assumptions about how the crawler interprets them — confirm.
"service_detail": {
# Display name of the task.
"name": "服务与支持-详细页面",
"output_dir": "服务与支持", # write output into the same directory (presumably the one used by the Service & Support index task — confirm)
# Index page from which detail-page links are collected.
"index_url": "/Service/index.html",
# presumably a link is followed only if it contains link_pattern and ends with link_suffix — verify against the crawler
"link_pattern": "/Service/",
"link_suffix": ".html",
# presumably skips links matching these patterns, so the index page itself is not crawled as a detail page — confirm
"exclude_patterns": ["index.html"],
"content_selector": "div.news_text_p,div.news_text,div.content,div.content-section", # supports multiple page layouts
"title_selector": "h1",
# NOTE(review): looks like a 0-based index into the title_selector matches, i.e. the second <h1> on the page — confirm
"title_index": 1,
},
}