You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

128 lines
4.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from Config.Config import MODEL_API_KEY, MODEL_NAME
from openai import OpenAI
import docx
import os
# Initialize the Qwen (Tongyi Qianwen) client through Alibaba's
# OpenAI-compatible DashScope endpoint; the key comes from project config.
client = OpenAI(
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    api_key=MODEL_API_KEY,
)
def call_qwen_plus(prompt, stream_callback=None):
    """Call the Qwen chat-completions API with streaming enabled.

    Args:
        prompt: User message sent to the model.
        stream_callback: Optional callable invoked with each streamed text
            fragment as it arrives (used for live console echo).

    Returns:
        The full concatenated response text, or "" if the API call fails.
    """
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )
        full_response = ""
        for chunk in response:
            # FIX: some streamed chunks (e.g. a trailing usage/metadata chunk
            # from OpenAI-compatible endpoints) carry an empty `choices` list;
            # indexing [0] on them raised IndexError and dropped the whole
            # response via the except below.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                full_response += content
                if stream_callback:
                    stream_callback(content)
        return full_response
    except Exception as e:
        print(f"调用通义千问API出错: {str(e)}")
        return ""
def read_docx(file_path):
    """Extract the text of a .docx file.

    Args:
        file_path: Path to the Word document.

    Returns:
        All non-empty paragraphs joined by newlines, or "" on read failure.
    """
    try:
        document = docx.Document(file_path)
        parts = []
        for paragraph in document.paragraphs:
            if paragraph.text:
                parts.append(paragraph.text)
        return "\n".join(parts)
    except Exception as exc:
        print(f"读取docx文件出错: {str(exc)}")
        return ""
def save_to_txt(content, file_path, mode='a'):
    """Write *content* plus a trailing newline to a UTF-8 text file.

    Args:
        content: Text to write.
        file_path: Destination path.
        mode: File-open mode; 'a' (append, default) or 'w' (overwrite).

    Returns:
        True on success, False if the write raised an exception.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as handle:
            handle.write(content + "\n")
        return True
    except Exception as exc:
        print(f"保存文件出错: {str(exc)}")
        return False
def split_text(text, chunk_size=6000):
    """Split *text* into chunks of roughly *chunk_size* characters.

    Breaks are made only at blank-line paragraph boundaries, so a single
    paragraph longer than *chunk_size* becomes one oversized chunk.

    Args:
        text: The full text to split.
        chunk_size: Soft upper bound on chunk length in characters.

    Returns:
        List of text chunks (empty list for empty input).
    """
    pieces = []
    buffer = ""
    for paragraph in text.split('\n\n'):
        if buffer and len(buffer) + len(paragraph) > chunk_size:
            # Adding this paragraph would overflow: flush and start anew.
            pieces.append(buffer)
            buffer = paragraph
        elif buffer:
            buffer += '\n\n' + paragraph
        else:
            buffer = paragraph
    if buffer:
        pieces.append(buffer)
    return pieces
# Prompt sent to the LLM for each chunk: asks it to re-segment math teaching
# content into logically coherent paragraphs, separated by blank lines, with
# no markdown or extra markup. `{text_chunk}` is filled in by ask_llm().
PROMPT_TEMPLATE = """
请分析以下数学教学内容,直接返回处理后的文本:
1. 根据每个段落间的逻辑关系,判断是不是强相关,一致内容的划分为同一个段落,否则视为两个段落。
2. 不同段落间用两个换行符分隔
3. 不要添加任何额外格式或标记绝对不要使用markdown格式返回。
待处理内容:
{text_chunk}
"""
def ask_llm(text_chunk, is_final=False):
    """Format the prompt for *text_chunk* and stream the model's answer.

    Args:
        text_chunk: The chunk of document text to re-segment.
        is_final: Passed to str.format; NOTE(review) the template has no
            {is_final} placeholder, so this flag currently has no effect.

    Returns:
        The model's full response text, or None on error.
    """
    prompt = PROMPT_TEMPLATE.format(text_chunk=text_chunk, is_final=is_final)

    def _echo(fragment):
        """Echo each streamed fragment to the console immediately."""
        print(fragment, end='', flush=True)

    try:
        return call_qwen_plus(prompt, stream_callback=_echo)
    except Exception as exc:
        print(f"调用大模型出错: {str(exc)}")
        return None
def process_document(input_path, output_dir):
    """Run the full pipeline: read a .docx, chunk it, process each chunk
    with the LLM, and save each result as <index>.txt in *output_dir*.

    Args:
        input_path: Path to the source .docx document.
        output_dir: Directory for the per-chunk output files (created if
            missing).

    Returns:
        False if the input could not be read, True otherwise.
    """
    text = read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False
    # Make sure the output directory exists before writing results.
    os.makedirs(output_dir, exist_ok=True)
    chunks = split_text(text)
    total = len(chunks)
    for index, chunk in enumerate(chunks, start=1):
        print(f"正在处理第{index}/{total}个分块...")
        response = ask_llm(chunk)
        if response:
            destination = os.path.join(output_dir, f"{index}.txt")
            save_to_txt(response, destination, mode='w')
    print(f"处理完成,结果已保存到目录: {output_dir}")
    return True
if __name__ == "__main__":
    # Source document and destination directory for the processed chunks.
    input_file = '../Txt/小学数学(史校长).docx'
    output_dir = '../Txt/processed_chunks'
    process_document(input_file, output_dir)