165 lines
7.3 KiB
Python
165 lines
7.3 KiB
Python
"""
|
||
Word 文档后处理模块
|
||
优化生成的 Word 文档格式
|
||
"""
|
||
|
||
import re
|
||
from docx import Document
|
||
|
||
|
||
# Extracts the numeric level from a heading style name ("Heading 2" -> 2).
_HEADING_LEVEL_RE = re.compile(r'Heading\s+(\d+)')

# Local (namespace-stripped) tag names that mark an embedded picture:
# DrawingML <w:drawing> and legacy VML <w:pict>.
_IMAGE_TAGS = frozenset({'drawing', 'pict'})


def _style_name(para):
    """Return the paragraph's style name, or '' when it has no usable name."""
    style = para.style
    return (style.name if style is not None else None) or ''


def _paragraph_has_image(para):
    """Return True if the paragraph's XML subtree contains a picture element.

    Element tags are inspected rather than the serialized XML text: a
    substring test for 'drawing' on the serialized form can also match
    namespace URIs (for example '.../wordprocessingDrawing') that are
    declared on paragraphs containing no picture at all.
    """
    element = getattr(para, '_element', None)
    if element is None:
        return False
    try:
        for node in element.iter():
            tag = node.tag
            # Comments / processing instructions have a non-string tag.
            if isinstance(tag, str) and tag.rpartition('}')[2] in _IMAGE_TAGS:
                return True
    except Exception:
        # Be conservative: if the paragraph cannot be inspected, treat it as
        # containing a picture so the surrounding headings are left alone.
        return True
    return False


def _has_content_between(paragraphs, start, end, skipped):
    """Return True if any paragraph in [start, end) blocks a heading merge.

    A paragraph blocks the merge when it is itself a heading, contains a
    picture, or has non-blank text. Indices in ``skipped`` (headings already
    scheduled for removal) are ignored.
    """
    for position in range(start, end):
        if position in skipped:
            continue
        para = paragraphs[position]
        if _style_name(para).startswith('Heading'):
            return True
        if _paragraph_has_image(para):
            return True
        if para.text.strip():
            return True
    return False


def post_process_docx_headings(docx_path: str) -> None:
    """Collapse redundant consecutive same-level headings in a Word document.

    Two numbered headings ("Heading N" styles, non-empty text) of the same
    level are considered adjacent when no text, picture or other heading
    lies between them. For each adjacent pair:

    1. If one heading's text contains the other's, the longer one is kept
       (for identical texts the later one is kept).
    2. Otherwise the two are merged into a single heading whose text is
       "<first> <second>".
    3. Pairs separated by a picture or by text are left untouched.

    The surviving heading is then compared with the heading that follows,
    so runs of three or more headings collapse correctly. A dropped heading
    is not deleted from the document; its paragraph is emptied and restyled
    as a normal paragraph. The file is saved in place only when something
    changed.

    Any exception is caught and reported with a printed warning; in that
    case the document is not saved.

    Args:
        docx_path: Path of the Word document to process (overwritten in
            place when changes are made).
    """
    try:
        doc = Document(docx_path)
        paragraphs = doc.paragraphs

        # Collect every non-empty numbered heading with its paragraph index.
        headings = []
        for idx, para in enumerate(paragraphs):
            name = _style_name(para)
            if not name.startswith('Heading'):
                continue
            level_match = _HEADING_LEVEL_RE.search(name)
            text = para.text.strip()
            if level_match and text:
                headings.append({
                    'index': idx,
                    'level': int(level_match.group(1)),
                    'text': text,
                })

        if len(headings) < 2:
            return  # nothing to compare

        to_remove = set()   # paragraph indices of headings to blank out
        to_modify = {}      # paragraph index -> merged heading text

        i = 0
        while i < len(headings) - 1:
            current = headings[i]
            next_heading = headings[i + 1]

            if current['level'] != next_heading['level'] or _has_content_between(
                paragraphs, current['index'] + 1, next_heading['index'], to_remove
            ):
                i += 1
                continue

            current_text = current['text']
            next_text = next_heading['text']

            if current_text in next_text:
                # The next heading already says everything the current one
                # does: drop the current one and carry on from the next.
                to_remove.add(current['index'])
                # A merge recorded earlier for this heading is now moot.
                to_modify.pop(current['index'], None)
                print(f"  标题优化: 删除 '{current_text}'(包含在 '{next_text}' 中)")
                i += 1
                continue

            if next_text in current_text:
                to_remove.add(next_heading['index'])
                print(f"  标题优化: 删除 '{next_text}'(包含在 '{current_text}' 中)")
            else:
                merged_text = f"{current_text} {next_text}"
                to_modify[current['index']] = merged_text
                to_remove.add(next_heading['index'])
                print(f"  标题优化: 合并 '{current_text}' 和 '{next_text}' → '{merged_text}'")
                current['text'] = merged_text

            # The next heading is gone. Remove it from the work list and do
            # NOT advance, so the surviving heading is compared with whatever
            # follows instead of the removed heading becoming "current".
            del headings[i + 1]

        if to_remove or to_modify:
            for idx, merged_text in to_modify.items():
                para = paragraphs[idx]
                para.clear()
                para.add_run(merged_text)

            if to_remove:
                try:
                    normal_style = doc.styles['Normal']
                except KeyError:
                    # No style named "Normal": assigning None drops the
                    # heading style and falls back to the document default.
                    normal_style = None
                for idx in sorted(to_remove, reverse=True):
                    para = paragraphs[idx]
                    # Blank the paragraph and strip its heading style so it
                    # no longer appears as a heading.
                    para.clear()
                    para.style = normal_style

            doc.save(docx_path)
            total_changes = len(to_remove) + len(to_modify)
            print(f"  标题优化完成: 处理了 {total_changes} 个标题(删除 {len(to_remove)} 个,合并 {len(to_modify)} 个)")

    except Exception as e:
        # Best-effort step: report the failure and let the caller continue.
        print(f"  警告: 标题后处理失败: {e}")
|
||
|