Update the crawler design document: add a summary-extraction module for generating document summaries; improve the base crawler class's title extraction to support multiple selectors; adjust the content-processing logic to strip duplicated titles.
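The summary-extraction module itself is not part of the hunk below. As a rough sketch only (the function name and length limit are hypothetical, not taken from this commit), such an extractor could be as simple as:

    from bs4 import BeautifulSoup

    def extract_summary(content: BeautifulSoup, max_len: int = 200) -> str:
        """Hypothetical sketch: join the first non-empty paragraphs into a short summary."""
        parts = []
        total = 0
        for p in content.find_all('p'):
            text = p.get_text(strip=True)
            if text:
                parts.append(text)
                total += len(text)
            if total >= max_len:
                break
        return ' '.join(parts)[:max_len]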
@@ -45,24 +45,97 @@ class ProductCrawler(BaseCrawler):
         Extract the title of a product page.
         Product page titles can appear in different places.
         """
-        # Try the first h1 after the breadcrumb navigation
-        h1_tags = soup.find_all('h1')
-        for h1 in h1_tags:
-            text = h1.get_text(strip=True)
-            # Skip the site name
-            if '零差云控' in text or '零误差' in text:
-                continue
-            if text:
-                return text
+        # Prefer the selector(s) from the config (h1, h2, etc. are supported)
+        selector = self.config.get("title_selector", "h1")
+        index = self.config.get("title_index", 0)
 
-        # Extract from the URL
-        return url.split('/')[-1].replace('.html', '')
+        # Multiple selectors are supported, separated by commas
+        selectors = [s.strip() for s in selector.split(',')]
+
+        # Collect every matching tag
+        all_tags = []
+        for sel in selectors:
+            if sel in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
+                # Plain tag names can be passed to find_all directly
+                all_tags.extend(soup.find_all(sel))
+            else:
+                # Anything else is treated as a CSS selector
+                all_tags.extend(soup.select(sel))
+
+        # Prefer the tag at the configured index
+        if all_tags and len(all_tags) > index:
+            title = all_tags[index].get_text(strip=True)
+            # Skip the site name
+            if title and '零差云控' not in title and '零误差' not in title:
+                return title
+        if all_tags:
+            # If the indexed tag was skipped (or the index is out of range),
+            # try the remaining tags
+            for tag in all_tags:
+                title = tag.get_text(strip=True)
+                # Skip the site name
+                if title and '零差云控' not in title and '零误差' not in title:
+                    return title
+
+        # Try the page's <title> tag
+        title_tag = soup.find('title')
+        if title_tag:
+            title = title_tag.get_text(strip=True)
+            # Strip the site-name suffix (e.g. " - 零差云控")
+            if ' - ' in title:
+                title = title.split(' - ')[0].strip()
+            if title and title.lower() not in ['about-us', 'contact-us', 'join-us']:
+                return title
+
+        # Fall back to the URL as a last resort
+        url_part = url.split('/')[-1].replace('.html', '')
+        # Replace hyphens with spaces and capitalize each word
+        if '-' in url_part:
+            url_part = ' '.join(word.capitalize() for word in url_part.split('-'))
+        return url_part
 
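For reference, the comma-separated title_selector introduced above could be exercised like this; the config values and the HTML snippet are made up for illustration and are not part of the commit:

    from bs4 import BeautifulSoup

    config = {"title_selector": "h1, h2", "title_index": 0}
    html = "<h1>零差云控</h1><h2>eCoder11</h2>"
    soup = BeautifulSoup(html, "html.parser")

    selectors = [s.strip() for s in config["title_selector"].split(',')]
    all_tags = [tag for sel in selectors for tag in soup.find_all(sel)]

    # The h1 holds only the site name, so the skip rule falls through to the h2
    title = None
    for tag in all_tags:
        text = tag.get_text(strip=True)
        if text and '零差云控' not in text and '零误差' not in text:
            title = text
            break
    print(title)  # eCoder11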
-    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str):
+    def add_content_to_docx(self, doc: Document, content: BeautifulSoup, output_dir: str, page_title: str = None):
         """
         Add the product content to the Word document.
         Handling of tables and similar elements is tuned for product pages.
 
         Args:
             doc: Document object
             content: the content region
             output_dir: output directory (used to resolve image paths)
+            page_title: page title (when given, h1/h2 tags in the content that
+                duplicate the title, or paragraphs containing it, are skipped)
         """
+        # When a page title is given, work on a copy of the content and strip
+        # the duplicated title elements from it
+        if page_title:
+            content = BeautifulSoup(str(content), 'html.parser')
+
+            # Remove the first h1 that exactly matches the title
+            first_h1 = content.find('h1')
+            if first_h1:
+                h1_text = first_h1.get_text(strip=True)
+                if h1_text == page_title:
+                    first_h1.decompose()
+
+            # Remove the first h2 that exactly matches the title
+            first_h2 = content.find('h2')
+            if first_h2:
+                h2_text = first_h2.get_text(strip=True)
+                if h2_text == page_title:
+                    first_h2.decompose()
+
+            # If the title carries a "型号:"/"型号:" (model) prefix, also remove
+            # the h2 that holds only the bare product name.
+            # Example: the title is "型号:eCoder11" and the content has an
+            # h2 reading "eCoder11".
+            if '型号:' in page_title or '型号:' in page_title:
+                product_name = page_title.replace('型号:', '').replace('型号:', '').strip()
+                if product_name:
+                    # Find the first h2 that holds only the product name
+                    for h2 in content.find_all('h2'):
+                        h2_text = h2.get_text(strip=True)
+                        if h2_text == product_name:
+                            h2.decompose()
+                            break  # only remove the first match
+
         for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table', 'div']):
             # Skip nested elements
             if element.find_parent(['table', 'li']):
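The title-deduplication path above can be pictured in isolation like this; the page_title and the HTML are invented for the example:

    from bs4 import BeautifulSoup

    page_title = "型号:eCoder11"
    content = BeautifulSoup(
        "<div><h1>型号:eCoder11</h1><h2>eCoder11</h2><p>body text</p></div>",
        "html.parser",
    )

    # Drop the h1 that repeats the page title verbatim
    first_h1 = content.find('h1')
    if first_h1 and first_h1.get_text(strip=True) == page_title:
        first_h1.decompose()

    # Drop the h2 that holds only the bare product name
    product_name = page_title.replace('型号:', '').replace('型号:', '').strip()
    for h2 in content.find_all('h2'):
        if h2.get_text(strip=True) == product_name:
            h2.decompose()
            break

    print(content.get_text(strip=True))  # body text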