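"""Read a .docx document, split it into ~6000-character chunks, have Qwen
re-segment each chunk into coherent paragraphs, and save the results as
numbered .txt files."""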
from Config.Config import MODEL_API_KEY, MODEL_NAME
from openai import OpenAI
import docx
import os


# Initialize the Qwen (Tongyi Qianwen) client via the OpenAI-compatible DashScope endpoint
client = OpenAI(
    api_key=MODEL_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)


def call_qwen_plus(prompt, stream_callback=None):
    """Call the Qwen API with streaming enabled and return the full response text."""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                # System prompt (Chinese): "You are a professional document analysis assistant"
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )
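        # Accumulate the streamed deltas into a single string, forwarding each
        # piece to the optional callback so callers can display output in real time.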
        full_response = ""
        for chunk in response:
            content = chunk.choices[0].delta.content
            if content:
                full_response += content
                if stream_callback:
                    stream_callback(content)
        return full_response
    except Exception as e:
        print(f"Error calling the Qwen API: {str(e)}")
        return ""


def read_docx(file_path):
"""读取docx文件内容"""
try:
1 month ago
doc = docx.Document(file_path)
return "\n".join([para.text for para in doc.paragraphs if para.text])
1 month ago
except Exception as e:
print(f"读取docx文件出错: {str(e)}")
return ""
def save_to_txt(content, file_path, mode='a'):
    """Write content to a txt file (appends by default)."""
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content + "\n")
        return True
    except Exception as e:
        print(f"Error saving file: {str(e)}")
        return False


def split_text(text, chunk_size=6000):
"""按约6000字符分割文本优先在段落结束处分隔"""
chunks = []
current_chunk = ""
paragraphs = text.split('\n\n')
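    # Greedily pack paragraphs into the current chunk and start a new chunk once the
    # limit would be exceeded; a single oversized paragraph still becomes its own chunk.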
    for para in paragraphs:
        if len(current_chunk) + len(para) > chunk_size and current_chunk:
            chunks.append(current_chunk)
            current_chunk = para
        else:
            if current_chunk:
                current_chunk += '\n\n' + para
            else:
                current_chunk = para
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


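# Prompt sent to the model (in Chinese, matching the source documents): it asks the model
# to group strongly related content into the same paragraph, separate distinct paragraphs
# with two newlines, and return plain text without any extra formatting or Markdown.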
PROMPT_TEMPLATE = """
请分析以下数学教学内容,直接返回处理后的文本:
1. 根据每个段落间的逻辑关系判断是不是强相关,一致内容的划分为同一个段落,否则视为两个段落
2. 不同段落间用两个换行符分隔
3. 不要添加任何额外格式或标记,绝对不要使用markdown格式返回

待处理内容:
{text_chunk}
"""


def ask_llm(text_chunk, is_final=False):
    """Call the LLM on one text chunk and return its response (is_final is currently unused)."""
    prompt = PROMPT_TEMPLATE.format(text_chunk=text_chunk)

    def stream_callback(chunk):
        """Streaming callback: print each piece of the response as it arrives."""
        print(chunk, end='', flush=True)

    try:
        response = call_qwen_plus(prompt, stream_callback=stream_callback)
        return response
    except Exception as e:
        print(f"Error calling the LLM: {str(e)}")
        return None


def process_document(input_path, output_dir):
"""处理文档主函数"""
text = read_docx(input_path)
if not text:
print("无法读取输入文件内容")
return False
# 确保输出目录存在
os.makedirs(output_dir, exist_ok=True)
chunks = split_text(text)
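    # Send each chunk to the LLM and write the processed text to a numbered file
    # (1.txt, 2.txt, ...) in the output directory; chunks with empty responses are skipped.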
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        response = ask_llm(chunk)
        if response:
            output_file = os.path.join(output_dir, f"{i+1}.txt")
            save_to_txt(response, output_file, mode='w')
    print(f"Processing complete. Results saved to directory: {output_dir}")
    return True


if __name__ == "__main__":
    input_file = '../Txt/小学数学(史校长).docx'
    output_dir = '../Txt/processed_chunks'  # output directory
    process_document(input_file, output_dir)
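    # Note: the relative paths above resolve against the current working directory,
    # so the script is presumably run from its own directory.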