main
HuangHai 4 weeks ago
parent 57a2ed2788
commit 35d113b86f

@ -8,10 +8,4 @@ if __name__ == "__main__":
word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx"
output_dir = "../static/Images" # 图片输出目录
os.makedirs(output_dir, exist_ok=True)
images = extract_images_from_docx(word_path, output_dir)
# 打印结果
for img in images:
print(f"图片保存至: {img['image_path']}")
loc = img['location']
print(f"位置信息: 段落 {loc['paragraph_index']}")
extract_images_from_docx(word_path, output_dir)

@ -1,6 +1,6 @@
import os
import docx
from docx.oxml.ns import nsmap
def read_word_content(docx_path):

@ -0,0 +1,99 @@
import os
from Util.SplitDocxUtil import SplitDocxUtil
def split_into_blocks(text):
    """Split text into numbered blocks, one per '问题N' / '话题N' heading line.

    A line is treated as a heading when it starts with '问题' or '话题' and
    has a digit among its first five characters. The leading run of
    '问题' / '话题' prefixes and digits is removed from the heading line, the
    remainder is whitespace-stripped, and it becomes the first line of the
    new block if anything is left. Lines before the first heading are
    ignored; falsy (empty) lines are never added to a block.

    Args:
        text: The full document text; it is split with ``str.splitlines``.

    Returns:
        A list of ``(block_number, block_text)`` tuples, numbered from 1 in
        heading order. A heading with no content before the next heading
        yields an empty ``block_text``; an empty trailing block is dropped.
    """
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        is_heading = (
            line.startswith(_HEADING_PREFIXES)
            and any(ch.isdigit() for ch in line[:5])
        )
        if is_heading:
            if in_block:
                # Close the block opened by the previous heading.
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_heading_prefix(line).strip()

        if in_block and line:  # skip empty lines and any preamble text
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return [(number, block) for number, block in enumerate(blocks, start=1)]


# Prefixes that mark the start of a new block ("question" / "topic").
_HEADING_PREFIXES = ('问题', '话题')


def _strip_heading_prefix(line):
    """Remove the leading run of heading prefixes and digits from *line*.

    The result may be the empty string (e.g. for a line that is only
    '问题1'). Consuming the whole line is what guarantees termination: the
    previous implementation kept the last characters in that case and
    therefore looped forever.
    """
    while line:
        if line.startswith(_HEADING_PREFIXES):
            line = line[2:]  # both prefixes are two characters long
        elif line[0].isdigit():
            line = line[1:]
        else:
            break
    return line
def process_document(input_path, output_dir):
    """Split one document into blocks and save each block as a text file.

    The text is obtained from ``SplitDocxUtil.read_docx``, split with
    ``split_into_blocks``, and every block is written to
    ``<output_dir>/<prefix>_<block_number>.txt`` via ``save_to_txt``.
    ``<prefix>`` is built from the last two underscore-separated parts of
    the file name up to its first dot (e.g. the ``MATH_1`` part); a name
    with no underscore is used as-is instead of raising ``IndexError``.

    The output directory is not cleared here; ``process_directory`` does
    that once before processing files.

    Args:
        input_path: Path of the document to read.
        output_dir: Directory that receives the generated ``.txt`` files.

    Returns:
        True if at least one block file was saved, otherwise False
        (including when no text could be read from the input).
    """
    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    # Keep at most the last two '_'-separated parts of the stem; slicing
    # (rather than indexing [-2]) tolerates stems without an underscore.
    stem = os.path.basename(input_path).split('.')[0]
    file_prefix = '_'.join(stem.split('_')[-2:])

    saved_count = 0
    for chunk_num, chunk in chunks:
        output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
        # Strip surrounding whitespace so files do not start or end with blanks.
        if save_to_txt(chunk.strip(), output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0
# 保留原有的save_to_txt函数
def save_to_txt(content, file_path, mode='w'):
    """Write *content* to *file_path* as UTF-8 text.

    Args:
        content: The text to write.
        file_path: Destination file path.
        mode: Mode passed to ``open`` (defaults to ``'w'``).

    Returns:
        True when the write succeeded; False when any exception was raised,
        in which case the error is printed rather than propagated.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as out:
            out.write(content)
    except Exception as exc:
        print(f"保存文件{file_path}时出错: {str(exc)}")
        return False
    return True
def process_directory(input_dir, output_dir):
    """Process every ``.docx`` file in *input_dir* into text blocks.

    The output directory is created if needed and cleared once before any
    file is processed: regular files and symlinks directly inside it are
    removed, subdirectories are left untouched. Each ``.docx`` file
    (case-insensitive extension) is then passed to ``process_document`` in
    sorted name order.

    Args:
        input_dir: Directory containing the ``.docx`` files.
        output_dir: Directory that receives the generated files. Must not be
            the same directory as *input_dir*, because its files are
            deleted before processing.

    Returns:
        True if at least one file was processed successfully; False if the
        input directory is missing, equals the output directory, contains
        no ``.docx`` file, or no file could be processed.
    """
    if not os.path.isdir(input_dir):
        print(f"输入目录不存在: {input_dir}")
        return False

    # Clearing the output directory in place would delete the inputs.
    if os.path.abspath(input_dir) == os.path.abspath(output_dir):
        print(f"输出目录不能与输入目录相同: {output_dir}")
        return False

    # Make sure the output directory exists, then clear it (only once).
    os.makedirs(output_dir, exist_ok=True)
    for entry in os.listdir(output_dir):
        entry_path = os.path.join(output_dir, entry)
        # os.remove cannot delete directories, so only files/links are removed.
        if os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.remove(entry_path)

    docx_files = sorted(
        name for name in os.listdir(input_dir) if name.lower().endswith('.docx')
    )
    if not docx_files:
        print(f"目录中没有找到docx文件: {input_dir}")
        return False

    success_count = 0
    for docx_file in docx_files:
        input_path = os.path.join(input_dir, docx_file)
        print(f"正在处理文件: {docx_file}")
        if process_document(input_path, output_dir):
            success_count += 1

    print(f"处理完成,共处理{success_count}/{len(docx_files)}个文件")
    return success_count > 0
if __name__ == "__main__":
    # Script entry point: split every .docx under ../static/Txt and write
    # the resulting text blocks to ../Txt (paths are relative to the
    # current working directory).
    process_directory(input_dir='../static/Txt', output_dir='../Txt')

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

After

Width:  |  Height:  |  Size: 62 KiB

@ -28,9 +28,10 @@ def extract_images_from_docx(docx_path, output_folder):
# 加载主文档
doc = Document(docx_path)
image_data = []
img_counter = 1
idx = 0
# 遍历所有段落
for para_idx, paragraph in enumerate(doc.paragraphs):
for run_idx, run in enumerate(paragraph.runs):
@ -57,24 +58,13 @@ def extract_images_from_docx(docx_path, output_folder):
# 创建输出文件名
ext = os.path.splitext(src_path)[1]
# 名称为uuid
img_name = f"{uuid.uuid4().hex}{ext}"
idx = idx + 1
img_name = f"{idx}{ext}"
dest_path = os.path.join(output_folder, img_name)
# 复制图片
shutil.copy(src_path, dest_path)
# 记录位置信息
location = {
"paragraph_index": para_idx
}
image_data.append({
"image_path": dest_path,
"location": location
})
img_counter += 1
# 清理临时目录
shutil.rmtree(temp_dir)
return image_data

Before

Width:  |  Height:  |  Size: 47 KiB

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

Loading…
Cancel
Save