from Config.Config import MODEL_API_KEY, MODEL_NAME
from openai import OpenAI
import docx
import os

# Initialize the Tongyi Qianwen (Qwen) client via the OpenAI-compatible endpoint
client = OpenAI(
    api_key=MODEL_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)


def call_qwen_plus(prompt, stream_callback=None):
    """Call the Qwen API and return the full streamed response."""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a professional document analysis assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )
        full_response = ""
        for chunk in response:
            # Some streamed chunks may carry no choices (e.g. usage-only chunks); skip them
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                full_response += content
                if stream_callback:
                    stream_callback(content)
        return full_response
    except Exception as e:
        print(f"Error calling the Qwen API: {str(e)}")
        return ""


def read_docx(file_path):
    """Read the text content of a docx file."""
    try:
        doc = docx.Document(file_path)
        # Join with blank lines so split_text() can detect paragraph boundaries
        return "\n\n".join([para.text for para in doc.paragraphs if para.text])
    except Exception as e:
        print(f"Error reading docx file: {str(e)}")
        return ""


def save_to_txt(content, file_path, mode='a'):
    """Save content to a txt file."""
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content + "\n")
        return True
    except Exception as e:
        print(f"Error saving file: {str(e)}")
        return False


def split_text(text, chunk_size=6000):
    """Split text into chunks of roughly 6000 characters, preferring paragraph boundaries."""
    chunks = []
    current_chunk = ""
    paragraphs = text.split('\n\n')
    for para in paragraphs:
        if len(current_chunk) + len(para) > chunk_size and current_chunk:
            chunks.append(current_chunk)
            current_chunk = para
        else:
            if current_chunk:
                current_chunk += '\n\n' + para
            else:
                current_chunk = para
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


PROMPT_TEMPLATE = """
Please analyze the following mathematics teaching content and return the processed text directly:
1. Based on the logical relationship between paragraphs, decide whether they are strongly related; merge content that belongs together into a single paragraph, otherwise treat it as separate paragraphs.
2. Separate different paragraphs with two newline characters.
3. Do not add any extra formatting or markers, and absolutely do not return markdown.

Content to process:
{text_chunk}
"""


def ask_llm(text_chunk):
    """Send a chunk to the LLM and return its response."""
    prompt = PROMPT_TEMPLATE.format(text_chunk=text_chunk)
    return call_qwen_plus(prompt)


def process_document(input_path, output_dir):
    """Main document-processing routine."""
    text = read_docx(input_path)
    if not text:
        print("Unable to read the input file content")
        return False

    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    chunks = split_text(text)
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}/{len(chunks)}...")
        response = ask_llm(chunk)
        if response:
            output_file = os.path.join(output_dir, f"{i + 1}.txt")
            save_to_txt(response, output_file, mode='w')

    print(f"Processing complete, results saved to directory: {output_dir}")
    return True


if __name__ == "__main__":
    input_file = '../Txt/小学数学(史校长).docx'
    output_dir = '../Txt/processed_chunks'  # output directory for processed chunks
    process_document(input_file, output_dir)
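
# Note: the import at the top assumes a local Config/Config.py module that provides
# the DashScope credentials. That module is not part of this file; a minimal sketch
# of what it is expected to contain is shown below. The values are assumptions
# (placeholder key; model name suggested only by the call_qwen_plus() function name):
#
#     # Config/Config.py
#     MODEL_API_KEY = "sk-..."    # DashScope API key (placeholder)
#     MODEL_NAME = "qwen-plus"    # assumed model identifier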