main
HuangHai 4 weeks ago
parent cd6697f5a7
commit afc76083b3

@ -0,0 +1,37 @@
import os
import docx
from docx.oxml.ns import nsmap
def read_word_content(docx_path):
    """Print each paragraph of a Word document as text or as an image marker.

    Paragraphs are visited in document order. When any run of a paragraph
    holds a child element whose tag ends with ``drawing``, only the image
    marker is printed for that paragraph (its text is not printed).
    Otherwise the stripped paragraph text is printed if it is non-empty.

    Any exception raised while opening or walking the document is caught
    and reported on stdout; nothing is returned.
    """
    try:
        document = docx.Document(docx_path)
        for para in document.paragraphs:
            # any() stops at the first drawing element, like the nested
            # break-out loops it replaces.
            contains_drawing = any(
                child.tag.endswith('drawing')
                for run in para.runs
                for child in run._element
            )
            if contains_drawing:
                print("【图片】")
                continue
            text = para.text.strip()
            if text:
                print(text)
    except Exception as e:
        print(f"处理Word文档时出错: {str(e)}")
if __name__ == "__main__":
    # Example usage: replace this path with the Word document to inspect.
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
    if not os.path.exists(word_document_path):
        print(f"文件不存在: {word_document_path}")
    else:
        read_word_content(word_document_path)

@ -1,168 +0,0 @@
import os
import docx
from Util.WordImageUtil import extract_images_from_docx
# Constants
# Prefixes that is_new_block_start() tests a paragraph against.
BLOCK_START_KEYWORDS = ('问题', '话题')
# Directory handed to extract_images_from_docx() as the image output location.
OUTPUT_IMAGE_DIR = "D:\\dsWork\\dsProject\\dsRag\\static\\Images"
# Directory process_document() passes to save_blocks_to_txt() for the .txt files.
OUTPUT_TXT_DIR = "D:\\dsWork\\dsProject\\dsRag\\Txt"
def read_docx(file_path):
    """Read a .docx file and return its non-empty paragraphs with indices.

    Returns:
        A list of ``(index, text)`` tuples, where ``index`` is the
        paragraph's position in ``doc.paragraphs``. Paragraphs whose text is
        empty are left out, so the indices may have gaps. If anything goes
        wrong the error is printed and an empty list is returned.
    """
    try:
        document = docx.Document(file_path)
        indexed_paragraphs = []
        for position, paragraph in enumerate(document.paragraphs):
            text = paragraph.text
            if text:
                indexed_paragraphs.append((position, text))
        return indexed_paragraphs
    except Exception as e:
        print(f"读取docx文件出错: {str(e)}")
        return []
def is_new_block_start(para_text):
    """Return True when a paragraph opens a new block.

    A paragraph qualifies when it starts with one of BLOCK_START_KEYWORDS
    and at least one of its first five characters is a digit.
    """
    if not para_text.startswith(BLOCK_START_KEYWORDS):
        return False
    leading_chars = para_text[:5]
    return any(ch.isdigit() for ch in leading_chars)
def _emit_block(blocks, indices, lines):
    """Append one finished block to *blocks* and log its span and content."""
    blocks.append((indices, '\n'.join(lines)))
    print(
        f"问题/话题 {len(blocks)} 管辖段落范围: {indices[0]} - {indices[-1]} (段落数: {len(lines)})")
    print(f"当前段落块内容: {''.join(lines)}")
    print(f"当前段落块索引: {indices}")


def split_into_blocks(paragraphs):
    """Group paragraphs into blocks that start at a question/topic marker.

    Args:
        paragraphs: Iterable of ``(paragraph_index, text)`` tuples, as
            produced by read_docx().

    Returns:
        A list of ``(indices, text)`` tuples in document order. ``indices``
        is the list of paragraph indices that belong to the block and
        ``text`` is the block's paragraphs joined with newlines.

    A new block begins at every paragraph accepted by is_new_block_start().
    Text that appears before the first marker forms a block of its own.
    Previously only the first such paragraph was captured, and it was
    silently dropped as soon as a marked block began; it is now kept in
    full and emitted like any other block.

    Whitespace-only paragraphs are not added to a block, and a paragraph
    whose stripped text equals the previous one in the block is added once.
    """
    blocks = []
    current_block = []
    current_indices = []

    for para_idx, para_text in paragraphs:
        if is_new_block_start(para_text):
            # Close whatever was being collected, including unmarked
            # leading text, before opening the next block.
            if current_block:
                _emit_block(blocks, current_indices, current_block)
            current_block = [para_text]
            current_indices = [para_idx]
            continue

        stripped = para_text.strip()

        if not current_block:
            # Document starts without an explicit marker: open a block at
            # the first paragraph that has content (or at index 0).
            if stripped or para_idx == 0:
                current_block = [para_text]
                current_indices = [para_idx]
            continue

        if not stripped:
            continue

        # Skip a paragraph that repeats the previous one in this block.
        if stripped != current_block[-1].strip():
            current_block.append(para_text)
        # Record the index even when the text was skipped as a repeat, so
        # the block's index list covers every paragraph it spans.
        if para_idx not in current_indices:
            current_indices.append(para_idx)

    if current_block:
        _emit_block(blocks, current_indices, current_block)
    return blocks
def _preceding_block_position(blocks, para_idx):
    """Return the position of the block whose span covers *para_idx*.

    That is the block with the greatest first paragraph index that is still
    <= *para_idx*. Returns None when there is no such block or when
    *para_idx* is not an integer.
    """
    if not isinstance(para_idx, int):
        return None
    best = None
    for position, (indices, _) in enumerate(blocks):
        if not indices or indices[0] > para_idx:
            continue
        if best is None or indices[0] >= blocks[best][0][0]:
            best = position
    return best


def process_images(images, blocks):
    """Attach each extracted image to a block by appending a marker line.

    Args:
        images: Iterable of dicts with ``image_path`` and
            ``location['paragraph_index']`` entries.
            NOTE(review): assumes paragraph_index uses the same numbering as
            the indices stored in *blocks* -- confirm against
            extract_images_from_docx().
        blocks: List of ``(paragraph_indices, text)`` tuples; modified in
            place. Nothing is returned.

    Matching order for every image:
      1. The first block whose index list contains the image's paragraph.
      2. Otherwise the block whose span the paragraph falls into (the
         nearest block starting at or before it). Blocks only list
         paragraphs that have text, so an image sitting in a text-less
         paragraph never matches step 1; without this step all such images
         ended up on the last block regardless of where they were.
      3. Otherwise (the image precedes every block) the last block, as
         before.
    When *blocks* is empty the image is only reported.
    """
    for img in images:
        img_para_idx = img['location']['paragraph_index']
        img_path = img['image_path']
        img_name = os.path.basename(img_path)

        target = next(
            (position for position, (indices, _) in enumerate(blocks)
             if img_para_idx in indices),
            None,
        )
        if target is not None:
            print(f"图片 {img_name} 属于段落块 {target + 1} (段落索引: {blocks[target][0]})")
        else:
            target = _preceding_block_position(blocks, img_para_idx)
            if target is not None:
                print(f"图片 {img_name} 位于段落块 {target + 1} 的段落范围内 (图片段落索引: {img_para_idx})")
            elif blocks:
                target = len(blocks) - 1
                print(f"图片 {img_name} 未找到精确匹配的段落块,已添加到最后一个段落块。")
            else:
                print(f"图片 {img_name} 未找到匹配的段落块,且没有可用的段落块。")
                continue

        para_indices, block_text = blocks[target]
        blocks[target] = (para_indices, block_text + f"\n[图片: {img_name}, 路径: {img_path}]")
def save_blocks_to_txt(blocks, file_prefix, output_dir):
    """Write each block to its own text file and report how many were saved.

    Block number ``n`` (counting from 1) is stripped of surrounding
    whitespace and written to ``<output_dir>/<file_prefix>_<n>.txt`` through
    save_to_txt().

    Returns:
        True when at least one file was saved, otherwise False.
    """
    outcomes = []
    for number, (_, text) in enumerate(blocks, 1):
        cleaned = text.strip()
        target = os.path.join(output_dir, f"{file_prefix}_{number}.txt")
        outcomes.append(save_to_txt(cleaned, target))
    saved_count = sum(1 for ok in outcomes if ok)
    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0
def _derive_file_prefix(input_path):
    """Build the output-file prefix from the last two '_'-separated name parts.

    Only the part of the base name before its first '.' is used. Names with
    fewer than two parts yield whatever parts exist, e.g. ``report.docx``
    gives ``report``.
    """
    stem = os.path.basename(input_path).split('.')[0]
    return '_'.join(stem.split('_')[-2:])


def process_document(input_path):
    """Split one .docx file into blocks and save them as text files.

    Steps: read the paragraphs, extract the images into OUTPUT_IMAGE_DIR,
    split the paragraphs into blocks, attach the images to the blocks, and
    write every block to OUTPUT_TXT_DIR.

    The output prefix is taken from the last two '_'-separated parts of the
    file name. A name without an underscore used to raise IndexError here;
    it now falls back to the parts that are available.

    Returns:
        False when no paragraphs could be read; otherwise the result of
        save_blocks_to_txt() (True when at least one file was written).
    """
    paragraphs = read_docx(input_path)
    print("段落块信息:" + str(paragraphs))
    if not paragraphs:
        print("无法读取输入文件内容")
        return False

    os.makedirs(OUTPUT_IMAGE_DIR, exist_ok=True)
    print(f"图片将保存到目录: {OUTPUT_IMAGE_DIR}")
    images = extract_images_from_docx(input_path, OUTPUT_IMAGE_DIR)
    print(f"共提取到{len(images)}张图片")
    for img in images:
        print(f"图片保存至: {img['image_path']}")
        loc = img['location']
        print(f"位置信息: 段落 {loc['paragraph_index']}")

    blocks = split_into_blocks(paragraphs)
    print(f"共分割出{len(blocks)}个段落块")

    # Append the image markers to the blocks they belong to.
    process_images(images, blocks)

    file_prefix = _derive_file_prefix(input_path)
    return save_blocks_to_txt(blocks, file_prefix, OUTPUT_TXT_DIR)
# Kept from the earlier version of this module.
def save_to_txt(content, file_path, mode='w'):
    """Write *content* to a UTF-8 text file.

    Args:
        content: Text to write.
        file_path: Destination path. Missing parent directories are created
            first, so a first run against an output folder that does not
            exist yet no longer fails on every file.
        mode: File mode passed to open(); defaults to ``'w'`` (overwrite).

    Returns:
        True on success. On any failure the error is printed and False is
        returned; no exception is propagated to the caller.
    """
    try:
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content)
        return True
    except Exception as e:
        print(f"保存文件{file_path}时出错: {str(e)}")
        return False
def process_directory(input_dir):
    """Run process_document() on every .docx file directly inside a directory.

    File names are matched case-insensitively on the ``.docx`` suffix;
    sub-directories are not searched.

    Returns:
        True when at least one file was processed successfully. False when
        the directory holds no .docx files or none succeeded.
    """
    docx_files = []
    for entry in os.listdir(input_dir):
        if entry.lower().endswith('.docx'):
            docx_files.append(entry)

    if not docx_files:
        print(f"目录中没有找到docx文件: {input_dir}")
        return False

    total = len(docx_files)
    succeeded = 0
    for name in docx_files:
        full_path = os.path.join(input_dir, name)
        print(f"正在处理文件: {name}")
        succeeded += 1 if process_document(full_path) else 0

    print(f"处理完成,共处理{succeeded}/{total}个文件")
    return succeeded > 0
if __name__ == "__main__":
    # Process every .docx in the test folder; the path is relative to the
    # current working directory.
    input_dir = '../static/Test'
    process_directory(input_dir=input_dir)

@ -1,4 +1,5 @@
问题1 教学建议与意义
在教学过程中,引导学生构建和理解模型,不仅能提升他们分析和解决问题的能力,还能激发他们发现问题和提出问题的意识。例如,在认识路程模型时,教师可通过生活化情境让学生理解速度的概念及其单位表示。
模型思想是《义务教育数学课程标准》中强调的核心素养之一,它帮助学生建立从现实世界抽象出数学问题的能力,并通过数学语言进行描述和解释。
因此,在“综合与实践”类教学内容中,应加强模型的应用训练,以培养学生应用数学知识解决实际问题的能力。
因此,在“综合与实践”类教学内容中,应加强模型的应用训练,以培养学生应用数学知识解决实际问题的能力。
【图片】

@ -1,3 +1 @@
问题2 我随便写点什么
[图片: edc7cfaa93dc4759b2adbd73fb764989.png, 路径: D:\dsWork\dsProject\dsRag\static\Images\edc7cfaa93dc4759b2adbd73fb764989.png]
[图片: 373866967dbc4c1fbc23c71caa4357e1.png, 路径: D:\dsWork\dsProject\dsRag\static\Images\373866967dbc4c1fbc23c71caa4357e1.png]
问题2 我随便写点什么

Binary file not shown.

After

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Loading…
Cancel
Save