'commit'

4 weeks ago · 2a77361492
parent 14c5906c3f
commit 2a77361492
2 changed files with 144 additions and 0 deletions
--- a/dsRag/Tools/T4_DocxProcessor.py
+++ b/dsRag/Tools/T4_DocxProcessor.py
@ -0,0 +1,144 @@
+from openai import OpenAI
+from Config import Config
+
+class DocxProcessor:
+    """Small wrapper around a DeepSeek chat client for analysing document text.
+
+    Each instance builds its own client from ``Config.DEEPSEEK_API_KEY`` and
+    ``Config.DEEPSEEK_URL``.
+    """
+
+    def __init__(self):
+        # Create a client dedicated to this instance.
+        self.client = OpenAI(
+            api_key=Config.DEEPSEEK_API_KEY,
+            base_url=Config.DEEPSEEK_URL
+        )
+
+    def call_deepseek(self, prompt):
+        """Send ``prompt`` to the ``deepseek-chat`` model and return the reply text.
+
+        Args:
+            prompt: User message content, sent after the fixed system message.
+
+        Returns:
+            The text of the first choice. An empty string is returned when the
+            request fails (the error is printed) or when the reply carries no
+            text, so the result is always a ``str``.
+        """
+        try:
+            response = self.client.chat.completions.create(
+                model="deepseek-chat",
+                messages=[
+                    {"role": "system", "content": "你是一个专业的文档分析助手"},
+                    {"role": "user", "content": prompt}
+                ],
+                temperature=0.3
+            )
+            # ``message.content`` may be None; normalise it so callers get
+            # the same empty-string result as on the error path.
+            return response.choices[0].message.content or ""
+        except Exception as e:
+            # Best-effort: report the problem and fall back to an empty reply.
+            print(f"调用DeepSeek API出错: {str(e)}")
+            return ""
+
+    def ask_llm(self, text_chunk):
+        """Ask the model to analyse one chunk of text.
+
+        Args:
+            text_chunk: Text inserted at the end of the fixed analysis prompt.
+
+        Returns:
+            Whatever ``call_deepseek`` returns for the built prompt.
+        """
+        prompt = f"""请分析以下文本段落并返回有价值的内容:
+要求:
+1. 保持原文关键信息
+2. 用清晰的格式返回
+3. 可包含简要总结
+
+文本段落内容:
+{text_chunk}"""
+        return self.call_deepseek(prompt)
+
+# Module-level DeepSeek client, used by call_deepseek_api below.
+client = OpenAI(base_url=Config.DEEPSEEK_URL, api_key=Config.DEEPSEEK_API_KEY)
+
+def split_text(text, chunk_size=6000):
+    """Split ``text`` into chunks of roughly ``chunk_size`` characters.
+
+    The text is split on newlines; blank lines are dropped and each paragraph
+    is stripped. Paragraphs are packed greedily into a chunk (joined with
+    ``'\\n'``) until adding the next one would push the summed paragraph
+    length past ``chunk_size``. Separator newlines are not counted.
+
+    A single paragraph longer than ``chunk_size`` is cut into
+    ``chunk_size``-character pieces instead of being emitted as one oversized
+    chunk; its last piece starts the next chunk.
+
+    Args:
+        text: Source text.
+        chunk_size: Target maximum paragraph characters per chunk. When it is
+            not positive, no hard splitting is done and every paragraph ends
+            up in its own chunk.
+
+    Returns:
+        A list of chunk strings; empty when ``text`` has no non-blank lines.
+    """
+    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for para in paragraphs:
+        if chunk_size > 0 and len(para) > chunk_size:
+            # Oversized paragraph: flush what is pending, then emit the full
+            # pieces directly and let the remainder seed the next chunk.
+            if current_chunk:
+                chunks.append('\n'.join(current_chunk))
+                current_chunk = []
+                current_length = 0
+            pieces = [
+                para[start:start + chunk_size]
+                for start in range(0, len(para), chunk_size)
+            ]
+            chunks.extend(pieces[:-1])
+            para = pieces[-1]
+
+        para_length = len(para)
+        if current_length + para_length > chunk_size and current_chunk:
+            chunks.append('\n'.join(current_chunk))
+            current_chunk = []
+            current_length = 0
+        current_chunk.append(para)
+        current_length += para_length
+
+    if current_chunk:
+        chunks.append('\n'.join(current_chunk))
+    return chunks
+
+def ask_llm(text_chunk):
+    """Build the fixed analysis prompt around ``text_chunk`` and send it.
+
+    Returns whatever ``call_deepseek_api`` returns for that prompt.
+    """
+    prompt_lines = (
+        "请分析以下文本段落并返回有价值的内容:",
+        "要求:",
+        "1. 保持原文关键信息",
+        "2. 用清晰的格式返回",
+        "3. 可包含简要总结",
+        "",
+        "文本段落内容:",
+        f"{text_chunk}",
+    )
+    return call_deepseek_api("\n".join(prompt_lines))
+
+def process_document(input_file, output_file):
+    """Read a .docx file, analyse it chunk by chunk, and append results to a text file.
+
+    The document text is read with ``read_docx``, split with ``split_text``,
+    and each chunk is sent through ``ask_llm``. One entry per chunk is written
+    to ``output_file`` with ``save_to_txt``.
+
+    The API helper reports errors by printing them and returning no text
+    rather than raising, so an empty or ``None`` reply is recorded as a
+    failure instead of being written out as if it were a real response.
+
+    Args:
+        input_file: Path of the .docx file to read.
+        output_file: Path of the text file that receives the results.
+    """
+    text = read_docx(input_file)
+    chunks = split_text(text)
+
+    for i, chunk in enumerate(chunks, 1):
+        print(f"正在处理第{i}个段落...")
+        try:
+            response = ask_llm(chunk)
+        except Exception as e:
+            save_to_txt(f"段落{i}处理失败: {str(e)}", output_file)
+            continue
+
+        if response:
+            save_to_txt(f"段落{i}响应:\n{response}", output_file)
+        else:
+            # No text came back: log it as a failure, not as a reply.
+            save_to_txt(f"段落{i}处理失败: 模型未返回内容", output_file)
+
+    print(f"处理完成，结果已保存到 {output_file}")
+
+
+def read_docx(file_path):
+    """Return the non-blank paragraphs of a .docx file joined by newlines.
+
+    On any error while opening or reading the document, the error is printed
+    and an empty string is returned.
+    """
+    from docx import Document
+    try:
+        kept = []
+        for paragraph in Document(file_path).paragraphs:
+            # Skip paragraphs that contain only whitespace.
+            if paragraph.text.strip():
+                kept.append(paragraph.text)
+        return '\n'.join(kept)
+    except Exception as err:
+        print(f"读取docx文件出错: {str(err)}")
+        return ""
+
+def save_to_txt(content, file_path, mode='a'):
+    """Write ``content`` followed by a blank line to a UTF-8 text file.
+
+    ``mode`` is passed to ``open`` (append by default). Any error is printed
+    and otherwise ignored; nothing is returned.
+    """
+    try:
+        with open(file_path, mode, encoding='utf-8') as handle:
+            # Two newlines separate consecutive entries.
+            payload = content + '\n\n'
+            handle.write(payload)
+    except Exception as err:
+        print(f"保存到txt文件出错: {str(err)}")
+
+# NOTE: when calling process_document, pass output_file as a complete file path.
+
+
+def call_deepseek_api(prompt, stream_callback=None):
+    """Call the DeepSeek chat API in streaming mode and return the full reply.
+
+    Args:
+        prompt: User message content, sent after the fixed system message.
+        stream_callback: Optional callable invoked with each non-empty text
+            fragment as it arrives.
+
+    Returns:
+        The concatenated reply text. If anything fails (including an error
+        raised mid-stream or by ``stream_callback``), the error is printed and
+        an empty string is returned, so the result is always a ``str``.
+    """
+    fragments = []
+    try:
+        response = client.chat.completions.create(
+            model="deepseek-chat",
+            messages=[
+                {"role": "system", "content": "你是一个专业的文档分析助手"},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.3,
+            stream=True
+        )
+
+        for chunk in response:
+            # A stream chunk may carry an empty ``choices`` list; skip it
+            # rather than let an IndexError discard the whole reply.
+            if not chunk.choices:
+                continue
+            content = chunk.choices[0].delta.content
+            if content:
+                fragments.append(content)
+                if stream_callback:
+                    stream_callback(content)
+    except Exception as e:
+        print(f"调用DeepSeek API出错: {str(e)}")
+        # Explicit empty reply instead of an implicit None.
+        return ""
+
+    return "".join(fragments)
+
+
+if __name__ == "__main__":
+    # Script entry point: analyse the sample document and write the replies
+    # to a text file with the same base name.
+    process_document(
+        '../Txt/小学数学（史校长）.docx',
+        '../Txt/小学数学（史校长）.txt',
+    )
--- a/dsRag/Txt/小学数学（史校长）.docx
+++ b/dsRag/Txt/小学数学（史校长）.docx