from openai import OpenAI
from Config import Config


class DocxProcessor:
    """Thin wrapper around a DeepSeek chat client used to analyse text chunks."""

    def __init__(self):
        # Each processor owns its own DeepSeek client, independent of the
        # module-level ``client`` defined below.
        self.client = OpenAI(
            api_key=Config.DEEPSEEK_API_KEY,
            base_url=Config.DEEPSEEK_URL
        )

    def call_deepseek(self, prompt):
        """Send ``prompt`` to DeepSeek (non-streaming) and return the reply text.

        Returns an empty string if the API call raises or the reply has no
        content; the error is printed rather than propagated.
        """
        try:
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "你是一个专业的文档分析助手"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )
            # ``content`` may be None; normalise so callers always get a str.
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"调用DeepSeek API出错: {e}")
            return ""

    def ask_llm(self, text_chunk):
        """Ask the model to analyse ``text_chunk`` and return its reply text."""
        prompt = (
            "请分析以下文本段落并返回有价值的内容:\n"
            "要求:\n"
            "1. 保持原文关键信息\n"
            "2. 用清晰的格式返回\n"
            "3. 可包含简要总结\n"
            "文本段落内容:\n"
            f"{text_chunk}"
        )
        return self.call_deepseek(prompt)


# Module-level DeepSeek client used by ``call_deepseek_api``.
client = OpenAI(
    api_key=Config.DEEPSEEK_API_KEY,
    base_url=Config.DEEPSEEK_URL
)


def split_text(text, chunk_size=6000):
    """Split ``text`` into chunks of roughly ``chunk_size`` characters.

    The text is split on newlines; blank lines are dropped and each remaining
    paragraph is stripped. Paragraphs are never cut: a chunk is closed as soon
    as adding the next paragraph would push it past ``chunk_size``, so a single
    paragraph longer than ``chunk_size`` becomes a chunk of its own.

    Returns a list of chunk strings (paragraphs re-joined with newlines).
    """
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    chunks = []
    current_chunk = []
    current_length = 0
    for para in paragraphs:
        para_length = len(para)
        # Only flush a non-empty chunk, so an oversized first paragraph
        # does not produce an empty chunk.
        if current_chunk and current_length + para_length > chunk_size:
            chunks.append('\n'.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(para)
        current_length += para_length
    if current_chunk:
        chunks.append('\n'.join(current_chunk))
    return chunks


def ask_llm(text_chunk, is_final=False):
    """Ask the model to regroup ``text_chunk`` into themed paragraphs.

    The reply is expected in the ``## topic`` / body format that
    ``process_llm_response`` parses. ``is_final`` is accepted for interface
    compatibility and is currently unused.
    """
    prompt = (
        "请将以下文本按内容相关性划分为段落,要求:\n"
        "1. 意思一致或强相关的内容放在同一段落\n"
        "2. 每个段落有明确的主题\n"
        "3. \n"
        "输出格式为:## 段落主题\n段落内容\n\n"
        # The text to segment must be part of the prompt; without it the
        # model receives instructions but nothing to work on.
        f"文本内容:\n{text_chunk}"
    )
    return call_deepseek_api(prompt)


def process_document(input_file, output_file):
    """Run the full pipeline: read a docx, chunk it, query the model, save results.

    Each chunk is processed independently; a failure on one chunk is written
    to ``output_file`` as an error note and processing continues with the next.
    """
    text = read_docx(input_file)
    chunks = split_text(text)
    # One processor (and therefore one API client) is enough for all chunks.
    processor = DocxProcessor()
    for i, chunk in enumerate(chunks, 1):
        print(f"正在处理第{i}个段落...")
        try:
            response = processor.ask_llm(chunk)
            # Parsing and saving are module-level functions, not methods of
            # DocxProcessor.
            paragraphs = process_llm_response(response)
            for para in paragraphs:
                save_to_txt(para, output_file)
        except Exception as e:
            save_to_txt(f"段落{i}处理失败: {e}", output_file)
    print(f"处理完成,结果已保存到 {output_file}")


def read_docx(file_path):
    """Return the non-blank paragraphs of a docx file joined by newlines.

    Returns an empty string (after printing the error) if the file cannot be
    read.
    """
    from docx import Document
    try:
        doc = Document(file_path)
        return '\n'.join([para.text for para in doc.paragraphs if para.text.strip()])
    except Exception as e:
        print(f"读取docx文件出错: {e}")
        return ""


def save_to_txt(content, file_path, mode='a'):
    """Write ``content`` followed by a blank line to ``file_path`` (UTF-8).

    ``mode`` is passed to ``open``; the default appends. Errors are printed,
    not raised. When called from ``process_document``, make sure the output
    file argument is a complete file path.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content + '\n\n')
    except Exception as e:
        print(f"保存到txt文件出错: {e}")


def call_deepseek_api(prompt, stream_callback=None):
    """Send ``prompt`` to DeepSeek in streaming mode and return the full reply.

    If ``stream_callback`` is given it is called with each text fragment as it
    arrives. Returns an empty string (after printing the error) if the call
    fails, matching ``DocxProcessor.call_deepseek``.
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )
        pieces = []
        for chunk in response:
            # Some stream chunks carry no choices; skip them instead of
            # failing on the index.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                pieces.append(content)
                if stream_callback:
                    stream_callback(content)
        return "".join(pieces)
    except Exception as e:
        print(f"调用DeepSeek API出错: {e}")
        return ""


def process_llm_response(response):
    """Split a model reply into paragraphs using ``## `` heading lines.

    Every line starting with ``## `` begins a new paragraph (the marker itself
    is removed); following lines are appended to it. Text before the first
    heading forms its own paragraph. Blank paragraphs are dropped, and an empty
    or ``None`` response yields an empty list.
    """
    if not response:
        return []
    paragraphs = []
    current_para = ""
    for line in response.split('\n'):
        if line.startswith('## '):
            if current_para.strip():
                paragraphs.append(current_para.strip())
            current_para = line[3:] + '\n'  # drop the "## " marker
        else:
            current_para += line + '\n'
    if current_para.strip():
        paragraphs.append(current_para.strip())
    return paragraphs


if __name__ == "__main__":
    input_file = '../Txt/小学数学(史校长).docx'
    output_file = '../Txt/小学数学(史校长).txt'
    process_document(input_file, output_file)