新增标题层级处理规则:1. 新增主页链接;2.新增docx后处理,合并同一层级的标题;3. 优化层级,h1不重复
This commit is contained in:
164
zeroerr_crawler/post_process.py
Normal file
164
zeroerr_crawler/post_process.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
Word document post-processing module.

Cleans up the formatting of generated Word documents.
"""
|
||||
|
||||
import re
|
||||
from docx import Document
|
||||
|
||||
|
||||
def post_process_docx_headings(docx_path: str) -> None:
    """
    Post-process a Word document by collapsing consecutive same-level headings.

    Two headings are candidates when they share the same ``Heading N`` level
    and only empty, image-free, non-heading paragraphs sit between them. Then:

    1. If one heading's text contains the other's, the longer one is kept.
    2. Otherwise both texts are joined with a single space into the first
       heading.
    3. A dropped heading is emptied and restyled as ``Normal``; the paragraph
       itself stays in the document.

    The surviving heading is compared again with the heading that follows it,
    so a run of three or more headings collapses without losing any text.

    The document is saved back to ``docx_path`` only when something changed.
    This is a best-effort step: any exception is reported with ``print`` and
    swallowed, so the function never raises.

    Args:
        docx_path: Path of the Word document to rewrite in place.
    """
    try:
        doc = Document(docx_path)
        paragraphs = doc.paragraphs

        headings = _collect_headings(paragraphs)
        if len(headings) < 2:
            return  # at least two headings are needed for a merge

        to_remove = set()  # paragraph indices that will be blanked out
        to_modify = {}     # paragraph index -> merged heading text

        i = 0
        while i < len(headings) - 1:
            current = headings[i]
            next_heading = headings[i + 1]

            mergeable = (
                current['level'] == next_heading['level']
                and not _has_content_between(
                    paragraphs,
                    current['index'] + 1,
                    next_heading['index'],
                    to_remove,
                )
            )
            if not mergeable:
                i += 1
                continue

            current_text = current['text']
            next_text = next_heading['text']

            if current_text in next_text:
                # The next heading already says everything the current one
                # does: drop the current one and carry on from the next.
                to_remove.add(current['index'])
                # A merge recorded earlier for this paragraph is now moot.
                to_modify.pop(current['index'], None)
                print(f" 标题优化: 删除 '{current_text}'(包含在 '{next_text}' 中)")
                i += 1
                continue

            if next_text in current_text:
                print(f" 标题优化: 删除 '{next_text}'(包含在 '{current_text}' 中)")
            else:
                merged_text = f"{current_text} {next_text}"
                to_modify[current['index']] = merged_text
                print(f" 标题优化: 合并 '{current_text}' 和 '{next_text}' → '{merged_text}'")
                # Later comparisons must see the merged text.
                current['text'] = merged_text

            # The next heading has been absorbed. Remove it from the work list
            # and do NOT advance ``i``: the surviving heading has to be checked
            # against whatever follows. Advancing would make the absorbed
            # heading the new "current" and could blank out a later heading's
            # text together with it.
            to_remove.add(next_heading['index'])
            del headings[i + 1]

        if not to_remove and not to_modify:
            return

        # Rewrite the headings that received merged text.
        for idx, merged_text in to_modify.items():
            para = paragraphs[idx]
            para.clear()
            para.add_run(merged_text)

        # "Remove" absorbed headings: empty them and strip the heading style so
        # they no longer show up as headings.
        normal_style = doc.styles['Normal']
        for idx in to_remove:
            para = paragraphs[idx]
            para.clear()
            para.style = normal_style

        doc.save(docx_path)
        total_changes = len(to_remove) + len(to_modify)
        print(f" 标题优化完成: 处理了 {total_changes} 个标题(删除 {len(to_remove)} 个,合并 {len(to_modify)} 个)")

    except Exception as e:
        # Deliberate best-effort boundary: a failed clean-up must not abort the
        # caller, so the error is only reported.
        print(f" 警告: 标题后处理失败: {e}")


def _collect_headings(paragraphs) -> list:
    """Return one ``{'index', 'level', 'text'}`` dict per non-empty ``Heading N`` paragraph."""
    headings = []
    for index, para in enumerate(paragraphs):
        style_name = para.style.name
        if not style_name.startswith('Heading'):
            continue
        # "Heading 1" -> 1, "Heading 2" -> 2, ...; styles without a number are skipped.
        level_match = re.search(r'Heading\s+(\d+)', style_name)
        text = para.text.strip()
        if level_match and text:
            headings.append({
                'index': index,
                'level': int(level_match.group(1)),
                'text': text,
            })
    return headings


def _has_content_between(paragraphs, start_idx: int, end_idx: int, skipped) -> bool:
    """
    Tell whether ``paragraphs[start_idx:end_idx]`` holds anything that must keep two headings apart.

    A heading-styled paragraph, an image, or any non-blank text counts as
    content. Indices listed in ``skipped`` are ignored; they are headings
    already scheduled for removal.

    NOTE(review): only the entries of ``paragraphs`` are inspected. A table
    between two headings is presumably not part of ``doc.paragraphs`` and
    would therefore not block a merge - confirm against python-docx.
    """
    for j in range(start_idx, end_idx):
        if j in skipped:
            continue
        para = paragraphs[j]
        # Another heading in between means the pair is not really adjacent.
        if para.style.name.startswith('Heading'):
            return True
        if _paragraph_has_image(para):
            return True
        if para.text.strip():
            return True
    return False


def _paragraph_has_image(para) -> bool:
    """Return True if the paragraph's XML mentions a ``drawing`` element, or if that cannot be determined."""
    try:
        element = para._element
        para_xml = element.xml if hasattr(element, 'xml') else str(element)
        return 'drawing' in para_xml.lower()
    except Exception:
        # Be conservative: if the XML cannot be inspected, assume there is an
        # image so the surrounding headings are left untouched.
        return True
|
||||
|
||||
Reference in New Issue
Block a user