|
|
|
@ -1,71 +1,138 @@
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
from Util.SplitDocxUtil import SplitDocxUtil
|
|
|
|
|
import docx
|
|
|
|
|
|
|
|
|
|
from Util.WordImageUtil import extract_images_from_docx
|
|
|
|
|
|
|
|
|
|
# Module-level constants: block-start keywords and output directories
|
|
|
|
|
BLOCK_START_KEYWORDS = ('问题', '话题')
|
|
|
|
|
OUTPUT_IMAGE_DIR = "D:\\dsWork\\dsProject\\dsRag\\static\\Images"
|
|
|
|
|
OUTPUT_TXT_DIR = "D:\\dsWork\\dsProject\\dsRag\\Txt"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_docx(file_path):
    """Read a .docx file and return its non-empty paragraphs.

    Args:
        file_path: Path of the document handed to ``docx.Document``.

    Returns:
        A list of ``(index, text)`` tuples, where ``index`` is the paragraph's
        position in ``doc.paragraphs`` and ``text`` is its text. Paragraphs
        whose text is an empty string are left out, so the indices may have
        gaps. An empty list is returned when anything goes wrong while
        reading.
    """
    collected = []
    try:
        for position, paragraph in enumerate(docx.Document(file_path).paragraphs):
            content = paragraph.text
            if content:
                collected.append((position, content))
    except Exception as e:
        # Best effort: report the problem and let the caller see "no content".
        print(f"读取docx文件出错: {str(e)}")
        return []
    return collected
|
|
|
|
|
|
|
|
|
|
def split_into_blocks(text):
|
|
|
|
|
"""按行遍历文本,发现'问题X'或'话题X'时开始分割,只移除前缀但保留整行内容"""
|
|
|
|
|
|
|
|
|
|
def is_new_block_start(para_text):
    """Tell whether a paragraph opens a new block.

    A paragraph counts as a block start when it begins with one of
    ``BLOCK_START_KEYWORDS`` and at least one of its first five characters
    is a digit.

    Args:
        para_text: Text of the paragraph to inspect.

    Returns:
        ``True`` if the paragraph starts a new block, otherwise ``False``.
    """
    if not para_text.startswith(BLOCK_START_KEYWORDS):
        return False
    for ch in para_text[:5]:
        if ch.isdigit():
            return True
    return False
|
|
|
|
|
|
|
|
|
|
def split_into_blocks(paragraphs):
|
|
|
|
|
"""按段落遍历文本,发现'问题X'或'话题X'时开始分割,保留段落索引和内容"""
|
|
|
|
|
blocks = []
|
|
|
|
|
current_block = []
|
|
|
|
|
current_indices = []
|
|
|
|
|
in_block = False
|
|
|
|
|
|
|
|
|
|
for line in text.splitlines():
|
|
|
|
|
if line.startswith(('问题', '话题')) and any(c.isdigit() for c in line[:5]):
|
|
|
|
|
|
|
|
|
|
for para_idx, para_text in paragraphs:
|
|
|
|
|
if is_new_block_start(para_text):
|
|
|
|
|
if in_block:
|
|
|
|
|
blocks.append('\n'.join(current_block))
|
|
|
|
|
current_block = []
|
|
|
|
|
blocks.append((current_indices, '\n'.join(current_block)))
|
|
|
|
|
print(
|
|
|
|
|
f"问题/话题 {len(blocks)} 管辖段落范围: {current_indices[0]} - {current_indices[-1]} (段落数: {len(current_block)})")
|
|
|
|
|
print(f"当前段落块内容: {''.join(current_block)}")
|
|
|
|
|
print(f"当前段落块索引: {current_indices}")
|
|
|
|
|
current_block = [para_text]
|
|
|
|
|
current_indices = [para_idx]
|
|
|
|
|
in_block = True
|
|
|
|
|
# 循环移除问题和话题前缀后的数字
|
|
|
|
|
while line and (line.startswith(('问题', '话题')) or (line and line and line[0].isdigit())):
|
|
|
|
|
if line.startswith(('问题', '话题')):
|
|
|
|
|
line = line[2:] if len(line) > 2 else line
|
|
|
|
|
elif line and line[0].isdigit():
|
|
|
|
|
line = line[1:] if len(line) > 1 else line
|
|
|
|
|
line = line.strip()
|
|
|
|
|
if in_block and line: # 只添加非空行
|
|
|
|
|
current_block.append(line)
|
|
|
|
|
|
|
|
|
|
elif not current_block and (para_text.strip() or para_idx == 0):
|
|
|
|
|
# 处理文档开头没有明确标记的问题/话题
|
|
|
|
|
current_block = [para_text]
|
|
|
|
|
current_indices = [para_idx]
|
|
|
|
|
|
|
|
|
|
if in_block and para_text.strip(): # 只添加非空段落
|
|
|
|
|
# 检查当前段落是否与前一个段落重复,避免重复添加
|
|
|
|
|
if not current_block or para_text.strip() != current_block[-1].strip():
|
|
|
|
|
current_block.append(para_text)
|
|
|
|
|
# 确保 current_indices 包含所有属于当前块的段落索引
|
|
|
|
|
if para_idx not in current_indices:
|
|
|
|
|
current_indices.append(para_idx)
|
|
|
|
|
|
|
|
|
|
if current_block:
|
|
|
|
|
blocks.append('\n'.join(current_block))
|
|
|
|
|
|
|
|
|
|
return [(i+1, block) for i, block in enumerate(blocks)]
|
|
|
|
|
blocks.append((current_indices, '\n'.join(current_block)))
|
|
|
|
|
print(
|
|
|
|
|
f"问题/话题 {len(blocks)} 管辖段落范围: {current_indices[0]} - {current_indices[-1]} (段落数: {len(current_block)})")
|
|
|
|
|
print(f"当前段落块内容: {''.join(current_block)}")
|
|
|
|
|
print(f"当前段落块索引: {current_indices}")
|
|
|
|
|
|
|
|
|
|
def process_document(input_path, output_dir):
|
|
|
|
|
"""处理文档主函数"""
|
|
|
|
|
text = SplitDocxUtil.read_docx(input_path)
|
|
|
|
|
if not text:
|
|
|
|
|
print("无法读取输入文件内容")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
# 提取图片
|
|
|
|
|
image_dir = "D:\\dsWork\\dsProject\\dsRag\\static\\Images"
|
|
|
|
|
os.makedirs(image_dir, exist_ok=True)
|
|
|
|
|
images = extract_images_from_docx(input_path, image_dir)
|
|
|
|
|
|
|
|
|
|
# 清空目录操作已移到process_directory函数中
|
|
|
|
|
chunks = split_into_blocks(text)
|
|
|
|
|
print(f"共分割出{len(chunks)}个段落块")
|
|
|
|
|
return blocks
|
|
|
|
|
|
|
|
|
|
saved_count = 0
|
|
|
|
|
# 从输入文件名中提取MATH_1部分
|
|
|
|
|
file_prefix = os.path.basename(input_path).split('.')[0].split('_')[-2] + '_' + os.path.basename(input_path).split('.')[0].split('_')[-1]
|
|
|
|
|
|
|
|
|
|
for chunk_num, chunk in chunks:
|
|
|
|
|
chunk = chunk.strip() # 确保去除空白字符
|
|
|
|
|
|
|
|
|
|
# 添加关联图片信息
|
|
|
|
|
for img in images:
|
|
|
|
|
if img['location']['paragraph_index'] == chunk_num:
|
|
|
|
|
chunk += f"\n<img src='{img['image_path']}'>"
|
|
|
|
|
|
|
|
|
|
def process_images(images, blocks):
    """Attach every extracted image to a paragraph block, in place.

    For each image a line ``[图片: <file name>, 路径: <path>]`` is appended to
    the text of the block chosen in this order:

    1. the first block whose index list contains the image's paragraph index;
    2. otherwise the block with the greatest first index that is not after
       the image's paragraph index (the block whose range covers the image);
    3. otherwise the last block.

    Step 2 matters because block index lists only hold paragraphs that have
    text, while an image usually sits in a paragraph of its own; without it
    such images would all end up in the last block.
    NOTE(review): this assumes ``paragraph_index`` counts paragraphs the same
    way the block index lists do -- confirm against extract_images_from_docx.

    Args:
        images: Iterable of dicts providing ``img['image_path']`` and
            ``img['location']['paragraph_index']``.
        blocks: List of ``(paragraph_indices, block_text)`` tuples. Entries
            are replaced in place with the extended text.

    Returns:
        None. When ``blocks`` is empty a message is printed per image and
        nothing is modified.
    """
    for img in images:
        img_para_idx = img['location']['paragraph_index']
        img_path = img['image_path']
        img_name = os.path.basename(img_path)

        if not blocks:
            print(f"图片 {img_name} 未找到匹配的段落块,且没有可用的段落块。")
            continue

        target, exact = _find_owning_block(blocks, img_para_idx)
        if exact:
            print(f"图片 {img_name} 属于段落块 {target + 1} (段落索引: {blocks[target][0]})")
        elif target is not None:
            print(f"图片 {img_name} 位于段落块 {target + 1} 的段落范围内,已添加到该段落块。")
        else:
            # Nothing starts at or before the image: keep the historical
            # behaviour of attaching it to the last block.
            target = len(blocks) - 1
            print(f"图片 {img_name} 未找到精确匹配的段落块,已添加到最后一个段落块。")

        para_indices, block_text = blocks[target]
        blocks[target] = (para_indices, block_text + f"\n[图片: {img_name}, 路径: {img_path}]")


def _find_owning_block(blocks, para_idx):
    """Return ``(position, exact)`` for the block that should own ``para_idx``.

    ``position`` is a 0-based index into ``blocks`` or ``None`` when no block
    qualifies; ``exact`` is ``True`` only when ``para_idx`` is listed in that
    block's paragraph indices.
    """
    enclosing = None
    enclosing_start = None
    # A non-integer index (e.g. None) can still be tested for membership but
    # cannot be compared against block ranges.
    comparable = isinstance(para_idx, int)
    for position, (para_indices, _) in enumerate(blocks):
        if para_idx in para_indices:
            return position, True
        if comparable and para_indices:
            start = para_indices[0]
            if start <= para_idx and (enclosing_start is None or start > enclosing_start):
                enclosing, enclosing_start = position, start
    return enclosing, False
|
|
|
|
|
|
|
|
|
|
def save_blocks_to_txt(blocks, file_prefix, output_dir):
    """Write each paragraph block to its own TXT file.

    Files are named ``<file_prefix>_<n>.txt`` with ``n`` starting at 1 and
    are written through ``save_to_txt``. Block text is stripped of
    surrounding whitespace before it is written.

    Args:
        blocks: Iterable of ``(paragraph_indices, block_text)`` tuples; only
            the text is used here.
        file_prefix: Leading part of every output file name.
        output_dir: Directory that receives the files. It is created when it
            does not exist yet.

    Returns:
        ``True`` if at least one file was saved, otherwise ``False`` (also
        when the output directory cannot be created).
    """
    # Make sure the target directory exists before writing into it.
    # NOTE(review): save_to_txt's body is not fully visible here; creating the
    # directory up front is harmless if it already does so.
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print(f"创建输出目录{output_dir}时出错: {str(e)}")
        return False

    saved_count = 0
    for block_idx, (_, block_text) in enumerate(blocks, 1):
        output_file = os.path.join(output_dir, f"{file_prefix}_{block_idx}.txt")
        if save_to_txt(block_text.strip(), output_file):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0
|
|
|
|
|
|
|
|
|
|
def process_document(input_path):
    """Split one .docx file into paragraph blocks and save them as TXT files.

    Steps: read the paragraphs, extract the images into ``OUTPUT_IMAGE_DIR``,
    split the paragraphs into blocks, attach the images to the blocks, then
    write the blocks to ``OUTPUT_TXT_DIR``.

    Args:
        input_path: Path of the .docx file to process.

    Returns:
        ``False`` when no paragraphs could be read; otherwise the result of
        ``save_blocks_to_txt`` (``True`` if at least one file was saved).
    """
    paragraphs = read_docx(input_path)

    print("段落块信息:" + str(paragraphs))

    if not paragraphs:
        print("无法读取输入文件内容")
        return False

    os.makedirs(OUTPUT_IMAGE_DIR, exist_ok=True)
    print(f"图片将保存到目录: {OUTPUT_IMAGE_DIR}")
    images = extract_images_from_docx(input_path, OUTPUT_IMAGE_DIR)
    print(f"共提取到{len(images)}张图片")
    for img in images:
        print(f"图片保存至: {img['image_path']}")
        loc = img['location']
        print(f"位置信息: 段落 {loc['paragraph_index']}")

    blocks = split_into_blocks(paragraphs)
    print(f"共分割出{len(blocks)}个段落块")

    # Attach the extracted images to the blocks (modifies ``blocks`` in place).
    process_images(images, blocks)

    file_prefix = _build_file_prefix(input_path)
    return save_blocks_to_txt(blocks, file_prefix, OUTPUT_TXT_DIR)


def _build_file_prefix(input_path):
    """Return the output prefix taken from the input file name.

    The prefix is made of the last two underscore-separated parts of the
    file name up to its first dot (e.g. ``..._MATH_1.docx`` gives
    ``MATH_1``). A name without any underscore is used as the prefix
    unchanged instead of raising ``IndexError``.
    """
    stem = os.path.basename(input_path).split('.')[0]
    return '_'.join(stem.split('_')[-2:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 保留原有的save_to_txt函数
|
|
|
|
|
def save_to_txt(content, file_path, mode='w'):
|
|
|
|
|
"""将内容保存到文本文件"""
|
|
|
|
@ -77,34 +144,25 @@ def save_to_txt(content, file_path, mode='w'):
|
|
|
|
|
print(f"保存文件{file_path}时出错: {str(e)}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def process_directory(input_dir, output_dir):
|
|
|
|
|
"""处理目录下所有docx文件"""
|
|
|
|
|
if not os.path.exists(input_dir):
|
|
|
|
|
print(f"输入目录不存在: {input_dir}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
# 确保输出目录存在并清空目录(只需一次)
|
|
|
|
|
if os.path.exists(output_dir):
|
|
|
|
|
for file in os.listdir(output_dir):
|
|
|
|
|
os.remove(os.path.join(output_dir, file))
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_directory(input_dir):
    """Process every .docx file found directly inside ``input_dir``.

    Each matching file (extension compared case-insensitively) is handed to
    ``process_document``; sub-directories are not searched.

    Args:
        input_dir: Directory to scan for .docx files.

    Returns:
        ``True`` if at least one file was processed successfully. ``False``
        when ``input_dir`` is not an existing directory, when it contains no
        .docx files, or when every file failed.
    """
    # Guard first: os.listdir would raise on a missing or non-directory path.
    if not os.path.isdir(input_dir):
        print(f"输入目录不存在: {input_dir}")
        return False

    docx_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.docx')]
    if not docx_files:
        print(f"目录中没有找到docx文件: {input_dir}")
        return False

    success_count = 0
    for docx_file in docx_files:
        input_path = os.path.join(input_dir, docx_file)
        print(f"正在处理文件: {docx_file}")
        if process_document(input_path):
            success_count += 1

    print(f"处理完成,共处理{success_count}/{len(docx_files)}个文件")
    return success_count > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
input_dir = '../static/Test'
|
|
|
|
|
output_dir = '../static/Images'
|
|
|
|
|
process_directory(input_dir, output_dir)
|
|
|
|
|
process_directory(input_dir)
|
|
|
|
|