Update the crawler plan document with output details for the 服务与支持-详细页面 (Service & Support detail pages); improve the base crawler class with stronger title extraction and content de-duplication; and adjust image handling in line with doc2md.py to improve the generated Word documents.
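
For orientation, a minimal illustrative sketch (not code from this commit) of the URL-based de-duplication the summary files now rely on when a task is re-run: pages whose 原文链接 already appears in the existing 汇总.md are skipped and only new pages are appended. The helper name `filter_new_pages` is hypothetical; the regex mirrors the one added in the diff below.

```python
import re

# Hypothetical helper, mirroring the behaviour added in the diff below.
URL_PATTERN = r'\*\*原文链接\*\*: (https?://[^\s\n]+)'

def filter_new_pages(existing_markdown: str, pages: list[dict]) -> list[dict]:
    """Keep only pages whose URL is not already present in the existing markdown."""
    existing_urls = set(re.findall(URL_PATTERN, existing_markdown))
    return [page for page in pages if page["url"] not in existing_urls]
```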
@@ -27,14 +27,17 @@ crawl/
 | 常见问题 | `python main.py issue` | 32篇 | ✅ 完成 |
 | 企业新闻 | `python main.py news` | 11篇 | ✅ 完成 |
 | 认证与资质 | `python main.py certification` | 10篇 | ✅ 完成 |
-| 机器人关节 | `python main.py erob` | 11篇 | ✅ 完成 |
+| 机器人关节 | `python main.py erob` | 12篇 | ✅ 完成 |
 | 编码器 | `python main.py ecoder` | 7篇 | ✅ 完成 |
 | 配件 | `python main.py tools` | 13篇 | ✅ 完成 |
 | 关于我们 | `python main.py about` | 2篇 | ✅ 完成 |
 | 服务与支持 | `python main.py support` | 1篇 | ✅ 完成 |
+| 服务与支持-详细页面 | `python main.py service_detail` | 7篇 | ✅ 完成 |
 | 资料下载 | `python main.py download` | 1篇 | ✅ 完成 |

-**总计: 102 篇文章**
+**总计: 110 篇文章**

+> 注:服务与支持和服务与支持-详细页面合并输出到同一个目录 `output/服务与支持/`
+
 ## 使用方法

@@ -5,3 +5,19 @@ markdownify>=0.11.0
 python-docx>=0.8.11
 lxml>=4.9.0

+# doc2md.py 依赖
+Pillow>=9.0.0
+matplotlib>=3.5.0  # 可选:用于渲染 LaTeX 公式
+# wand>=0.6.0  # 可选:用于 WMF/EMF 转换(需要系统安装 ImageMagick)
+# html2image>=2.0.0  # 可选:用于表格渲染为图片
+
+# test_llm.py 依赖 - RAG 方案
+openai>=1.0.0
+langchain>=0.1.0
+langchain-openai>=0.1.0
+langchain-community>=0.0.20
+faiss-cpu>=1.7.4
+tiktoken>=0.5.0
+sentence-transformers>=2.2.0
+torch>=2.0.0

@@ -6,6 +6,7 @@
 import os
 import time
 import copy
+import re
 import requests
 from bs4 import BeautifulSoup
 import markdownify

@@ -37,8 +38,12 @@ class BaseCrawler(ABC):
         self.session = requests.Session()
         self.session.headers.update(HEADERS)

-        # 输出目录
-        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(self.name))
+        # 输出目录(支持自定义)
+        if "output_dir" in task_config:
+            output_dir_name = task_config["output_dir"]
+        else:
+            output_dir_name = self.name
+        self.output_dir = os.path.join(OUTPUT_DIR, safe_filename(output_dir_name))
         self.images_dir = os.path.join(self.output_dir, "images")
         ensure_dir(self.output_dir)
         ensure_dir(self.images_dir)

@@ -123,14 +128,41 @@ class BaseCrawler(ABC):
         selector = self.config.get("title_selector", "h1")
         index = self.config.get("title_index", 0)

+        # 优先从配置的选择器提取
         tags = soup.find_all(selector)
         if tags and len(tags) > index:
-            return tags[index].get_text(strip=True)
+            title = tags[index].get_text(strip=True)
+            if title:
+                return title
         elif tags:
-            return tags[0].get_text(strip=True)
-        else:
-            # 使用URL最后一部分作为标题
-            return url.split('/')[-1].replace('.html', '')
+            title = tags[0].get_text(strip=True)
+            if title:
+                return title
+
+        # 尝试从页面 title 标签提取
+        title_tag = soup.find('title')
+        if title_tag:
+            title = title_tag.get_text(strip=True)
+            # 移除网站名称后缀(如 " - 零差云控")
+            if ' - ' in title:
+                title = title.split(' - ')[0].strip()
+            if title and title.lower() not in ['about-us', 'contact-us', 'join-us']:
+                return title
+
+        # 尝试从 h1 标签提取(即使不在配置的选择器中)
+        h1_tags = soup.find_all('h1')
+        for h1 in h1_tags:
+            title = h1.get_text(strip=True)
+            # 跳过网站名称
+            if title and '零差云控' not in title and '零误差' not in title:
+                return title
+
+        # 最后使用URL最后一部分作为标题,但进行美化
+        url_part = url.split('/')[-1].replace('.html', '')
+        # 将连字符替换为空格,并首字母大写
+        if '-' in url_part:
+            url_part = ' '.join(word.capitalize() for word in url_part.split('-'))
+        return url_part

     def extract_content(self, soup: BeautifulSoup) -> BeautifulSoup | None:
         """

@@ -155,6 +187,7 @@ class BaseCrawler(ABC):
                 # class 选择器
                 tag, class_name = sel.split('.', 1)
                 tag = tag if tag else 'div'
+                # 使用 find 只匹配第一个元素,避免重复
                 content = soup.find(tag, class_=class_name)
             else:
                 content = soup.find(sel)

@@ -170,9 +203,47 @@ class BaseCrawler(ABC):
         if len(all_contents) == 1:
             return all_contents[0]

+        # 去重:移除嵌套或重复的内容块
+        unique_contents = []
+        seen_texts = set()  # 用于记录已见过的文本内容
+
+        for content in all_contents:
+            is_duplicate = False
+            content_text = content.get_text(strip=True)
+
+            # 跳过空内容
+            if not content_text:
+                continue
+
+            # 检查是否被其他内容块包含(是其他块的子元素)
+            for other in all_contents:
+                if content is other:
+                    continue
+                # 检查当前内容是否是另一个内容块的子元素
+                parent = content.find_parent()
+                while parent:
+                    if parent is other:
+                        is_duplicate = True
+                        break
+                    parent = parent.find_parent()
+                if is_duplicate:
+                    break
+
+            # 如果内容文本完全相同,只保留第一个
+            if not is_duplicate and content_text in seen_texts:
+                is_duplicate = True
+
+            if not is_duplicate:
+                unique_contents.append(content)
+                seen_texts.add(content_text)
+
+        # 如果去重后只剩一个,直接返回
+        if len(unique_contents) == 1:
+            return unique_contents[0]
+
         # 合并多个内容区域到一个容器
         combined = soup.new_tag('div')
-        for content in all_contents:
+        for content in unique_contents:
             # 深拷贝内容以避免从原DOM中移除
             combined.append(copy.deepcopy(content))

@@ -192,6 +263,35 @@ class BaseCrawler(ABC):
         for tag in content(['script', 'style']):
             tag.decompose()

+        # 移除导航链接、空链接、锚点链接
+        for a in content.find_all('a', href=True):
+            href = a.get('href', '')
+            # 移除空链接、锚点链接、JavaScript 链接
+            if not href or href.startswith('#') or href.startswith('javascript:'):
+                # 保留链接文本,移除链接标签
+                a.unwrap()
+            # 移除导航菜单中的链接(通常包含特定 class 或 id)
+            elif a.find_parent(['nav', 'menu', 'navigation']):
+                a.decompose()
+
+        # 移除空的 div、span 等标签(只包含空白字符)
+        for tag in content.find_all(['div', 'span', 'p']):
+            text = tag.get_text(strip=True)
+            if not text and not tag.find_all(['img', 'table']):
+                # 如果没有文本内容且没有图片/表格,移除
+                tag.decompose()
+
+        # 移除注释
+        from bs4 import Comment
+        for comment in content.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # 清理多余的空白字符
+        for tag in content.find_all(['p', 'div', 'span']):
+            if tag.string:
+                # 清理段落内的多余空白
+                tag.string = ' '.join(tag.string.split())
+
         return content

     def process_images(self, content: BeautifulSoup, page_url: str) -> list[tuple[str, str]]:

@@ -249,7 +349,8 @@ class BaseCrawler(ABC):
             content: 内容区域
             output_dir: 输出目录(用于解析图片路径)
         """
-        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'table']):
+        # 按文档顺序处理元素,保持列表的连续性
+        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'ul', 'ol', 'table']):
             if element.name == 'img':
                 src = element.get('src', '')
                 # 尝试获取本地图片路径

@@ -260,29 +361,58 @@ class BaseCrawler(ABC):

                 if os.path.exists(local_path):
                     try:
+                        # 图片前添加空行
+                        doc.add_paragraph()
                         doc.add_picture(local_path, width=Inches(5))
                         doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
+                        # 图片后添加空行
+                        doc.add_paragraph()
                     except Exception as e:
                         print(f" Word插入图片失败: {local_path} - {e}")

             elif element.name.startswith('h'):
                 text = element.get_text(strip=True)
                 if text:
+                    # HTML h1-h6 直接映射到 Word Heading 1-6
+                    # 限制在 1-9 范围内(Word 支持的最大标题级别)
                     level = int(element.name[1])
-                    doc.add_heading(text, level=min(level + 1, 9))
+                    doc.add_heading(text, level=min(level, 9))
+
+            elif element.name in ['ul', 'ol']:
+                # 列表容器,跳过(列表项会单独处理)
+                continue
+
             elif element.name == 'li':
                 text = element.get_text(strip=True)
                 if text:
-                    doc.add_paragraph(text, style='List Bullet')
-            elif element.name == 'table':
-                # 简单处理表格,提取文本
-                for row in element.find_all('tr'):
-                    cells = row.find_all(['td', 'th'])
-                    row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
-                    if row_text.strip():
-                        doc.add_paragraph(row_text)
-            else:
+                    # 检查父元素是 ul 还是 ol
+                    parent = element.find_parent(['ul', 'ol'])
+                    is_ordered = parent and parent.name == 'ol'
+
+                    # 使用列表样式
+                    if is_ordered:
+                        doc.add_paragraph(text, style='List Number')
+                    else:
+                        doc.add_paragraph(text, style='List Bullet')
+
+            elif element.name == 'table':
+                # 处理表格,创建 Word 表格结构(便于 doc2md.py 解析)
+                self._add_table_to_docx(doc, element)
+
+            elif element.name == 'p':
                 text = element.get_text(strip=True)
                 if text:
-                    doc.add_paragraph(text)
+                    # 跳过空段落和只包含空白字符的段落
+                    if text.strip():
+                        # 检查是否是列表项(某些网站用 p 标签包裹列表项)
+                        parent = element.find_parent(['ul', 'ol'])
+                        if parent:
+                            is_ordered = parent.name == 'ol'
+                            if is_ordered:
+                                doc.add_paragraph(text, style='List Number')
+                            else:
+                                doc.add_paragraph(text, style='List Bullet')
+                        else:
+                            doc.add_paragraph(text)

     def crawl_page(self, url: str) -> dict | None:

@@ -357,6 +487,7 @@ class BaseCrawler(ABC):
     def save_combined_documents(self, all_pages: list[dict]):
         """
         将所有页面汇总保存为一个 md 和 docx 文件
+        如果文件已存在,会追加内容并去重(基于URL)

         Args:
             all_pages: 所有页面数据列表

@@ -364,45 +495,92 @@ class BaseCrawler(ABC):
         if not all_pages:
             return

-        safe_name = safe_filename(self.name)
-
-        # === 生成汇总 Markdown ===
-        combined_md = f"# {self.name}全集\n\n"
-        combined_md += f"**生成时间**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-        combined_md += f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。\n\n"
-        combined_md += "---\n\n"
-
-        # 添加每篇内容
-        for page in all_pages:
-            combined_md += f"## {page['title']}\n\n"
-            combined_md += f"**原文链接**: {page['url']}\n\n"
-            combined_md += page["markdown"]
-            combined_md += "\n\n---\n\n"
+        # 确定汇总文件名(使用输出目录名,而不是任务名)
+        output_dir_name = os.path.basename(self.output_dir)
+        safe_name = safe_filename(output_dir_name)

         md_path = os.path.join(self.output_dir, f"{safe_name}_汇总.md")
+        docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx")
+
+        # === 处理 Markdown ===
+        existing_urls = set()
+        existing_content = ""
+
+        # 如果文件已存在,读取现有内容并提取已存在的URL
+        if os.path.exists(md_path):
+            with open(md_path, "r", encoding="utf-8") as f:
+                existing_content = f.read()
+            # 提取已存在的URL(用于去重)
+            url_pattern = r'\*\*原文链接\*\*: (https?://[^\s\n]+)'
+            existing_urls = set(re.findall(url_pattern, existing_content))
+
+        # 过滤掉已存在的页面(基于URL去重)
+        new_pages = [page for page in all_pages if page['url'] not in existing_urls]
+
+        if not new_pages and existing_content:
+            print(f" 所有页面已存在,无需更新: {md_path}")
+            return
+
+        # 生成新内容
+        new_md_content = ""
+        for page in new_pages:
+            new_md_content += f"## {page['title']}\n\n"
+            new_md_content += f"**原文链接**: {page['url']}\n\n"
+            new_md_content += page["markdown"]
+            new_md_content += "\n\n---\n\n"
+
+        # 追加或创建文件
+        if existing_content:
+            # 追加模式:在现有内容后追加新内容
+            combined_md = existing_content.rstrip() + "\n\n" + new_md_content
+            print(f" 追加 {len(new_pages)} 篇新内容到现有文档")
+        else:
+            # 新建模式:创建新文档
+            combined_md = f"# {output_dir_name}全集\n\n" + new_md_content
+
         with open(md_path, "w", encoding="utf-8") as f:
             f.write(combined_md)
         print(f" 汇总 Markdown: {md_path}")

-        # === 生成汇总 Word 文档 ===
-        doc = Document()
-        doc.add_heading(f'{self.name}全集', 0)
-        intro = doc.add_paragraph()
-        intro.add_run(f"生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}").italic = True
-        doc.add_paragraph(f"本文档汇总了零差云控官网的所有{self.name}内容,共 {len(all_pages)} 篇。")
-
-        # 添加每篇内容
-        for page in all_pages:
-            doc.add_heading(page["title"], level=1)
-            p = doc.add_paragraph()
-            p.add_run(f"原文链接: {page['url']}").italic = True
-
-            self.add_content_to_docx(doc, page["content"], self.output_dir)
-            doc.add_page_break()
-
-        docx_path = os.path.join(self.output_dir, f"{safe_name}_汇总.docx")
-        doc.save(docx_path)
-        print(f" 汇总 Word: {docx_path}")
+        # === 处理 Word 文档 ===
+        if os.path.exists(docx_path):
+            # 如果Word文档已存在,重新生成(因为python-docx不支持追加)
+            doc = Document(docx_path)
+            # 提取已存在的URL
+            existing_doc_urls = set()
+            for para in doc.paragraphs:
+                if para.runs and "原文链接:" in para.text:
+                    url_match = re.search(r'原文链接: (https?://[^\s\n]+)', para.text)
+                    if url_match:
+                        existing_doc_urls.add(url_match.group(1))
+
+            # 过滤新页面
+            new_pages_for_doc = [page for page in all_pages if page['url'] not in existing_doc_urls]
+
+            if new_pages_for_doc:
+                # 添加新内容
+                for page in new_pages_for_doc:
+                    doc.add_heading(page["title"], level=1)
+                    p = doc.add_paragraph()
+                    p.add_run(f"原文链接: {page['url']}").italic = True
+                    self.add_content_to_docx(doc, page["content"], self.output_dir)
+                    doc.add_page_break()
+                doc.save(docx_path)
+                print(f" 追加 {len(new_pages_for_doc)} 篇新内容到 Word 文档")
+            else:
+                print(f" Word 文档无需更新: {docx_path}")
+        else:
+            # 新建Word文档
+            doc = Document()
+            doc.add_heading(f'{output_dir_name}全集', level=1)
+
+            for page in all_pages:
+                doc.add_heading(page["title"], level=1)
+                p = doc.add_paragraph()
+                p.add_run(f"原文链接: {page['url']}").italic = True
+
+                self.add_content_to_docx(doc, page["content"], self.output_dir)
+                doc.add_page_break()
+
+            doc.save(docx_path)
+            print(f" 汇总 Word: {docx_path}")

@@ -451,6 +629,67 @@ class BaseCrawler(ABC):
         print(f"输出目录: {self.output_dir}")

+    def _add_table_to_docx(self, doc: Document, table_element: BeautifulSoup):
+        """
+        将 HTML 表格添加到 Word 文档
+
+        Args:
+            doc: Document 对象
+            table_element: 表格元素
+        """
+        rows = table_element.find_all('tr')
+        if not rows:
+            return
+
+        # 获取最大列数(考虑 colspan)
+        max_cols = 0
+        for row in rows:
+            cells = row.find_all(['td', 'th'])
+            col_count = 0
+            for cell in cells:
+                colspan = int(cell.get('colspan', 1))
+                col_count += colspan
+            max_cols = max(max_cols, col_count)
+
+        if max_cols == 0:
+            return
+
+        # 创建 Word 表格
+        try:
+            word_table = doc.add_table(rows=len(rows), cols=max_cols)
+            word_table.style = 'Table Grid'
+
+            for i, row in enumerate(rows):
+                cells = row.find_all(['td', 'th'])
+                col_idx = 0
+                for cell in cells:
+                    if col_idx >= max_cols:
+                        break
+                    text = cell.get_text(strip=True)
+                    colspan = int(cell.get('colspan', 1))
+                    rowspan = int(cell.get('rowspan', 1))
+
+                    # 设置单元格文本
+                    word_table.rows[i].cells[col_idx].text = text
+
+                    # 处理合并单元格(python-docx 的合并需要特殊处理)
+                    # 注意:python-docx 的合并单元格功能有限,这里先简单处理
+                    if colspan > 1 or rowspan > 1:
+                        # 对于合并单元格,python-docx 需要手动合并
+                        # 这里先标记,后续可以改进
+                        pass
+
+                    col_idx += colspan
+        except Exception as e:
+            # 如果表格创建失败,降级为文本
+            print(f" 表格创建失败,降级为文本: {e}")
+            for row in rows:
+                cells = row.find_all(['td', 'th'])
+                row_text = ' | '.join([cell.get_text(strip=True) for cell in cells])
+                if row_text.strip():
+                    doc.add_paragraph(row_text)
+
+
 class StandardCrawler(BaseCrawler):
     """
     标准爬虫类

@@ -102,8 +102,6 @@ CRAWL_TASKS = {
         "static_pages": [
             "/about/about-us.html",
             "/about/contact-us.html",
-            "/about/join-us.html",
-            "/about/152.html",  # 诚招代理
         ],
         "content_selector": "div.about_us1,div.page-title,div.about_company,div.contact_us,div.web_contact",  # 多区域布局
         "title_selector": "h1,h2",

@@ -129,5 +127,17 @@ CRAWL_TASKS = {
         "title_selector": "h1",
         "title_index": 0,
     },
+    # 服务与支持详细页面(从索引页提取)
+    "service_detail": {
+        "name": "服务与支持-详细页面",
+        "output_dir": "服务与支持",  # 输出到同一个目录
+        "index_url": "/Service/index.html",
+        "link_pattern": "/Service/",
+        "link_suffix": ".html",
+        "exclude_patterns": ["index.html"],
+        "content_selector": "div.news_text_p,div.news_text,div.content,div.content-section",  # 多种布局支持
+        "title_selector": "h1",
+        "title_index": 1,
+    },
 }

@@ -77,16 +77,22 @@ class ProductCrawler(BaseCrawler):

                 if os.path.exists(local_path):
                     try:
+                        # 图片前添加空行
+                        doc.add_paragraph()
                         doc.add_picture(local_path, width=Inches(4.5))
                         doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
+                        # 图片后添加空行
+                        doc.add_paragraph()
                     except Exception as e:
                         print(f" Word插入图片失败: {local_path} - {e}")

             elif element.name.startswith('h'):
                 text = element.get_text(strip=True)
                 if text and '零差云控' not in text:
+                    # HTML h1-h6 直接映射到 Word Heading 1-6
+                    # 限制在 1-9 范围内(Word 支持的最大标题级别)
                     level = int(element.name[1])
-                    doc.add_heading(text, level=min(level + 1, 9))
+                    doc.add_heading(text, level=min(level, 9))

             elif element.name == 'table':
                 # 处理表格
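
As a quick check of the heading-level change (HTML h1-h6 now map directly to Word Heading 1-6 instead of being shifted down one level), a small self-contained python-docx snippet; this is illustrative only and not part of the repository.

```python
from docx import Document

# Illustrative only: HTML hN now maps to Word Heading N (previously N+1).
doc = Document()
for tag_name, text in [("h1", "产品概述"), ("h2", "技术参数"), ("h3", "接口定义")]:
    level = int(tag_name[1])                    # 'h2' -> 2
    doc.add_heading(text, level=min(level, 9))  # previously min(level + 1, 9)
doc.save("heading_demo.docx")                   # output file name is illustrative
```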