from openai import OpenAI
from Config import Config


class DocxProcessor:
    """Document-analysis helper that owns its own DeepSeek client."""

    def __init__(self):
        # Independently initialized DeepSeek client (separate from the
        # module-level one so instances can be configured in isolation).
        self.client = OpenAI(
            api_key=Config.DEEPSEEK_API_KEY,
            base_url=Config.DEEPSEEK_URL
        )

    def call_deepseek(self, prompt):
        """Call the DeepSeek API (non-streaming) and return the reply text.

        Returns "" on any API error so callers never receive None.
        """
        try:
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "你是一个专业的文档分析助手"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"调用DeepSeek API出错: {str(e)}")
            return ""

    def ask_llm(self, text_chunk):
        """Build the analysis prompt for text_chunk and query the model."""
        prompt = f"""请分析以下文本段落并返回有价值的内容:
要求:
1. 保持原文关键信息
2. 用清晰的格式返回
3. 可包含简要总结

文本段落内容:
{text_chunk}"""
        return self.call_deepseek(prompt)


# Module-level DeepSeek client used by the standalone functions below.
client = OpenAI(
    api_key=Config.DEEPSEEK_API_KEY,
    base_url=Config.DEEPSEEK_URL
)


def split_text(text, chunk_size=6000):
    """Split text on paragraph (newline) boundaries into chunks of at most
    ~chunk_size characters.

    A single paragraph longer than chunk_size still becomes its own
    oversized chunk — paragraphs are never broken mid-text.
    """
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    chunks = []
    current_chunk = []
    current_length = 0

    for para in paragraphs:
        para_length = len(para)
        # Flush the accumulated chunk before it would exceed the limit.
        if current_length + para_length > chunk_size and current_chunk:
            chunks.append('\n'.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(para)
        current_length += para_length

    if current_chunk:
        chunks.append('\n'.join(current_chunk))
    return chunks


def ask_llm(text_chunk):
    """Build the analysis prompt for text_chunk and query the model
    via the streaming API wrapper."""
    prompt = f"""请分析以下文本段落并返回有价值的内容:
要求:
1. 保持原文关键信息
2. 用清晰的格式返回
3. 可包含简要总结

文本段落内容:
{text_chunk}"""
    return call_deepseek_api(prompt)


def process_document(input_file, output_file):
    """Main document pipeline: read the .docx, split it into chunks,
    send each chunk to the LLM, and append each response to output_file.

    Per-chunk failures are recorded in the output file instead of
    aborting the whole run.
    """
    text = read_docx(input_file)
    chunks = split_text(text)

    for i, chunk in enumerate(chunks, 1):
        print(f"正在处理第{i}个段落...")
        try:
            response = ask_llm(chunk)
            save_to_txt(f"段落{i}响应:\n{response}", output_file)
        except Exception as e:
            save_to_txt(f"段落{i}处理失败: {str(e)}", output_file)

    print(f"处理完成,结果已保存到 {output_file}")


def read_docx(file_path):
    """Read a .docx file and return its non-empty paragraphs joined by
    newlines; returns "" on any read error."""
    from docx import Document
    try:
        doc = Document(file_path)
        return '\n'.join([para.text for para in doc.paragraphs if para.text.strip()])
    except Exception as e:
        print(f"读取docx文件出错: {str(e)}")
        return ""


def save_to_txt(content, file_path, mode='a'):
    """Append (or write, per mode) content to a UTF-8 text file,
    separating entries with a blank line. Errors are printed, not raised."""
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content + '\n\n')
    except Exception as e:
        print(f"保存到txt文件出错: {str(e)}")

# NOTE: when calling process_document, make sure output_file is a full file path.


def call_deepseek_api(prompt, stream_callback=None):
    """Streaming call to the DeepSeek API.

    Accumulates the streamed delta contents into one string; if
    stream_callback is given it is invoked with each content fragment
    as it arrives. Returns "" on any API error (previously this fell
    through and returned None, which process_document would then write
    as the literal text "None" into the output file).
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )

        full_response = ""
        for chunk in response:
            if chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                full_response += content
                if stream_callback:
                    stream_callback(content)

        return full_response
    except Exception as e:
        print(f"调用DeepSeek API出错: {str(e)}")
        return ""


if __name__ == "__main__":
    input_file = '../Txt/小学数学(史校长).docx'
    output_file = '../Txt/小学数学(史校长).txt'
    process_document(input_file, output_file)