|
|
|
@ -2,6 +2,7 @@ import docx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_word_content(docx_path):
|
|
|
|
|
res = ""
|
|
|
|
|
idx = 0
|
|
|
|
|
"""遍历Word文档的每个段落,输出文字或图片标识"""
|
|
|
|
|
try:
|
|
|
|
@ -21,14 +22,45 @@ def read_word_content(docx_path):
|
|
|
|
|
|
|
|
|
|
if has_image:
|
|
|
|
|
idx = idx + 1
|
|
|
|
|
print("【图片" + str(idx) + "】")
|
|
|
|
|
res = res + "\n" + "【图片" + str(idx) + "】"
|
|
|
|
|
elif paragraph.text.strip():
|
|
|
|
|
print(paragraph.text.strip())
|
|
|
|
|
|
|
|
|
|
res = res + "\n" + paragraph.text.strip()
|
|
|
|
|
return res
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"处理Word文档时出错: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_into_blocks(text):
    """Split text into numbered blocks at '问题N' / '话题N' header lines.

    The text is scanned line by line. A line that starts with '问题' or
    '话题' and has a digit within its first five characters opens a new
    block. The leading prefix and digits are stripped from that header
    line, and whatever remains of it becomes the first line of the block.
    Lines before the first header are discarded, as are empty lines.

    Args:
        text: The full text to split. ``None`` or an empty string yields
            an empty result.

    Returns:
        A list of ``(index, block)`` tuples, where ``index`` starts at 1
        and ``block`` is the block's lines joined with newlines.
    """
    if not text:
        return []

    prefixes = ('问题', '话题')
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        is_header = (
            line.startswith(prefixes)
            and any(ch.isdigit() for ch in line[:5])
        )
        if is_header:
            # Close the previous block; a header with no content produces
            # no block, matching the end-of-text handling below.
            if current_block:
                blocks.append('\n'.join(current_block))
            current_block = []
            in_block = True

            # Peel the prefix and its number off the front of the line.
            # The line is allowed to shrink to the empty string: keeping
            # the last character (as an earlier version did) makes a
            # header such as '问题1' loop forever.
            while True:
                if line.startswith(prefixes):
                    line = line[2:]
                elif line[:1].isdigit():
                    line = line[1:]
                else:
                    break
                line = line.strip()

        # Only non-empty lines that belong to a block are kept.
        if in_block and line:
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return [(i + 1, block) for i, block in enumerate(blocks)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Sample document for a manual run of the read -> split pipeline.
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"

    # Read the document once and keep the result; a second call would
    # parse the file again and print every paragraph twice.
    res = read_word_content(word_document_path)

    # Fall back to an empty string in case the reader returned nothing
    # (for example after it reported an error), so splitting still works.
    q = split_into_blocks(res or "")
    for _, block in q:
        print(block)
|
|
|
|
|