You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

98 lines
3.5 KiB

4 weeks ago
import os
4 weeks ago
import docx
4 weeks ago
from Util.WordImageUtil import extract_images_from_docx
4 weeks ago
4 weeks ago
def _paragraph_has_image(paragraph):
    """Return True if any run in *paragraph* has a child element whose tag ends with 'drawing'."""
    return any(
        element.tag.endswith('drawing')
        for run in paragraph.runs
        for element in run._element
    )


def read_word_content(docx_path):
    """Walk the paragraphs of a Word document and emit text or image markers.

    Each paragraph contributes at most one entry to the result:

    * a paragraph containing a drawing element contributes an image marker
      carrying a running, 1-based image counter (the paragraph's own text is
      not emitted in that case);
    * any other paragraph contributes its stripped text, if non-empty.

    Args:
        docx_path: Path handed to ``docx.Document``.

    Returns:
        The collected entries, each preceded by a newline (so a non-empty
        result starts with ``"\\n"``), or ``None`` if any exception was raised
        while processing; the error is printed in that case.
    """
    pieces = []
    image_count = 0
    try:
        doc = docx.Document(docx_path)
        for paragraph in doc.paragraphs:
            if _paragraph_has_image(paragraph):
                image_count += 1
                # NOTE(review): the marker has no closing "】"; kept exactly as
                # in the original in case downstream code matches on it - confirm.
                pieces.append("【图片" + str(image_count))
                continue
            text = paragraph.text.strip()
            if text:
                pieces.append(text)
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"处理Word文档时出错: {str(e)}")
        return None
    # Join once instead of growing a string paragraph by paragraph.
    return "".join("\n" + piece for piece in pieces)
4 weeks ago
4 weeks ago
_HEADING_MARKERS = ('问题', '话题')


def _strip_heading_prefix(line):
    """Remove leading '问题'/'话题' markers and digits from a heading line.

    Markers are dropped two characters at a time and digits one at a time,
    stripping surrounding whitespace after every step. A step is never allowed
    to consume the whole line; when that is all that is left, the remainder is
    returned unchanged instead of being retried forever.
    """
    while line and (line.startswith(_HEADING_MARKERS) or line[0].isdigit()):
        if line.startswith(_HEADING_MARKERS):
            shortened = line[2:] if len(line) > 2 else line
        else:
            shortened = line[1:] if len(line) > 1 else line
        shortened = shortened.strip()
        if shortened == line:
            # Nothing could be removed (e.g. only "1" remains): stop here,
            # otherwise the loop condition would stay true indefinitely.
            break
        line = shortened
    return line


def split_into_blocks(text):
    """Split text into blocks that start at '问题X' / '话题X' heading lines.

    A line is a heading when it starts with '问题' or '话题' and has a digit
    among its first five characters. The heading's marker and number are
    removed and the rest of the line becomes the first line of the new block.
    Lines before the first heading and empty lines are dropped.

    Args:
        text: The text to split; processed line by line.

    Returns:
        A list of ``(number, block)`` tuples, numbered from 1 in order of
        appearance, where ``block`` is the block's lines joined with newlines.
    """
    blocks = []
    current_block = []
    in_block = False
    for line in text.splitlines():
        if line.startswith(_HEADING_MARKERS) and any(c.isdigit() for c in line[:5]):
            if in_block:
                # A new heading closes the block collected so far.
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_heading_prefix(line)
        if in_block and line:  # only keep non-empty lines
            current_block.append(line)
    if current_block:
        blocks.append('\n'.join(current_block))
    return list(enumerate(blocks, start=1))
4 weeks ago
# 保留原有的save_to_txt函数
def save_to_txt(content, file_path, mode='w'):
    """Write *content* to a UTF-8 text file and report whether it worked.

    Args:
        content: The text to write.
        file_path: Destination file path.
        mode: Mode passed to ``open`` (defaults to ``'w'``).

    Returns:
        ``True`` when the write completed; ``False`` when any exception was
        raised, in which case the error is printed.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as handle:
            handle.write(content)
    except Exception as err:
        print(f"保存文件{file_path}时出错: {str(err)}")
        return False
    else:
        return True
4 weeks ago
if __name__ == "__main__":
    # Demo run: extract the images, split the document into blocks, and write
    # every block to "<number>.txt" next to the source document.
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
    output_dir = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\"

    extract_images_from_docx(word_document_path, output_dir)

    res = read_word_content(word_document_path)
    if res is None:
        # read_word_content already printed the error; without text there is
        # nothing to split, so stop instead of failing inside splitlines().
        raise SystemExit("读取Word文档失败,无法继续处理")

    chunks = split_into_blocks(res)
    for _, block in chunks:
        print("===段落开始:===")
        # The first line of a block is its title, the rest is its content.
        title_line, _, remainder = block.partition("\n")
        first_line = title_line.strip()
        content = remainder.strip()
        print("firstLine=" + first_line)
        print("content=" + content)
        print("===段落结束:===\n")

    saved_count = 0
    for chunk_num, chunk in chunks:
        chunk = chunk.strip()  # drop surrounding whitespace before saving
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        if save_to_txt(chunk, output_file, mode='w'):
            saved_count += 1
    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")