'commit'

4 weeks ago · c2dac849dd
parent e55ab1d90e
commit c2dac849dd
1 changed file with 84 additions and 121 deletions
--- a/dsRag/Tools/T4_DocxProcessor.py
+++ b/dsRag/Tools/T4_DocxProcessor.py
@@ -1,154 +1,117 @@
+from Config.Config import MODEL_API_KEY, MODEL_NAME
 from openai import OpenAI
-from Config import Config
+import docx
+import os

-# 初始化DeepSeek客户端
+# 初始化通义千问客户端
 client = OpenAI(
-    api_key=Config.DEEPSEEK_API_KEY,
-    base_url=Config.DEEPSEEK_URL
+    api_key=MODEL_API_KEY,
+    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
 )

-def call_deepseek(prompt):
-    """调用DeepSeek API"""
+def call_qwen_plus(prompt, stream_callback=None):
+    """调用通义千问API"""
    try:
        response = client.chat.completions.create(
-            model="deepseek-chat",
+            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
-            temperature=0.3
+            temperature=0.3,
+            stream=True
        )
-        return response.choices[0].message.content
+        
+        full_response = ""
+        for chunk in response:
+            content = chunk.choices[0].delta.content
+            if content:
+                full_response += content
+                if stream_callback:
+                    stream_callback(content)
+        
+        return full_response
    except Exception as e:
-        print(f"调用DeepSeek API出错: {str(e)}")
+        print(f"调用通义千问API出错: {str(e)}")
        return ""

-def ask_llm(text_chunk):
-    """向大模型提问并获取响应"""
-    PROMPT_TEMPLATE = """
-    请分析以下数学教学内容，直接返回处理后的文本：
-    1. 根据每个段落间的逻辑关系，判断是不是强相关，一致内容的划分为同一个段落，否则视为两个段落。 
-    2. 不同段落间用两个换行符分隔
-    3. 不要添加任何额外格式或标记，绝对不要使用markdown格式返回。 
-    
-    待处理内容：
-    {text_chunk}
-    """
-    
-    prompt = PROMPT_TEMPLATE.format(text_chunk=text_chunk)
-    return call_deepseek(prompt)
-
-# 修改后的处理函数
-def process_response(response):
-    # 直接返回原始响应内容，不做格式处理
-    return response
-
-
-def split_text(text, chunk_size=6000):
-    """按段落分割文本，确保每个块接近6000字"""
-    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for para in paragraphs:
-        para_length = len(para)
-        if current_length + para_length > chunk_size and current_chunk:
-            chunks.append('\n'.join(current_chunk))
-            current_chunk = []
-            current_length = 0
-        current_chunk.append(para)
-        current_length += para_length
-
-    if current_chunk:
-        chunks.append('\n'.join(current_chunk))
-    return chunks
-
-
-def process_document(input_file, output_file):
-    """处理文档主流程"""
-    text = read_docx(input_file)
-    chunks = split_text(text)
-
-    for i, chunk in enumerate(chunks, 1):
-        print(f"正在处理第{i}个段落...")
-        try:
-            response = ask_llm(chunk)
-            paragraphs = process_llm_response(response)
-            for para in paragraphs:
-                save_to_txt(para, output_file)
-        except Exception as e:
-            save_to_txt(f"段落{i}处理失败: {str(e)}", output_file)
-
-    print(f"处理完成，结果已保存到 {output_file}")
-
-
 def read_docx(file_path):
    """读取docx文件内容"""
-    from docx import Document
    try:
-        doc = Document(file_path)
-        return '\n'.join([para.text for para in doc.paragraphs if para.text.strip()])
+        doc = docx.Document(file_path)
+        return "\n".join([para.text for para in doc.paragraphs if para.text])
    except Exception as e:
        print(f"读取docx文件出错: {str(e)}")
        return ""

-
 def save_to_txt(content, file_path, mode='a'):
-    """将内容保存到txt文件"""
+    """保存内容到txt文件"""
    try:
        with open(file_path, mode, encoding='utf-8') as f:
-            f.write(content + '\n\n')
+            f.write(content + "\n")
+        return True
    except Exception as e:
-        print(f"保存到txt文件出错: {str(e)}")
+        print(f"保存文件出错: {str(e)}")
+        return False

+def split_text(text, chunk_size=6000):
+    """按约6000字符分割文本，优先在段落结束处分隔"""
+    chunks = []
+    current_chunk = ""
+    
+    paragraphs = text.split('\n\n')
+    for para in paragraphs:
+        if len(current_chunk) + len(para) > chunk_size and current_chunk:
+            chunks.append(current_chunk)
+            current_chunk = para
+        else:
+            if current_chunk:
+                current_chunk += '\n\n' + para
+            else:
+                current_chunk = para
+    
+    if current_chunk:
+        chunks.append(current_chunk)
+    
+    return chunks

-# 在process_document方法中调用时，请确保output_file参数是完整的文件路径
-
-
-def call_deepseek_api(prompt, stream_callback=None):
-    """流式调用DeepSeek API"""
-    try:
-        response = client.chat.completions.create(
-            model="deepseek-chat",
-            messages=[
-                {"role": "system", "content": "你是一个专业的文档分析助手"},
-                {"role": "user", "content": prompt}
-            ],
-            temperature=0.3,
-            stream=True
-        )
-
-        full_response = ""
-        for chunk in response:
-            if chunk.choices[0].delta.content:
-                content = chunk.choices[0].delta.content
-                full_response += content
-                if stream_callback:
-                    stream_callback(content)
-
-        return full_response
-    except Exception as e:
-        print(f"调用DeepSeek API出错: {str(e)}")
-
+PROMPT_TEMPLATE = """
+请分析以下数学教学内容，直接返回处理后的文本：
+1. 根据每个段落间的逻辑关系，判断是不是强相关，一致内容的划分为同一个段落，否则视为两个段落。 
+2. 不同段落间用两个换行符分隔
+3. 不要添加任何额外格式或标记，绝对不要使用markdown格式返回。 

-def process_llm_response(response):
-    """处理大模型的段落划分响应"""
-    paragraphs = []
-    current_para = ""
-    for line in response.split('\n'):
-        if line.startswith('## '):
-            if current_para:
-                paragraphs.append(current_para.strip())
-            current_para = line[3:] + '\n'  # 去掉##标记
-        else:
-            current_para += line + '\n'
-    if current_para:
-        paragraphs.append(current_para.strip())
-    return paragraphs
+待处理内容：
+{text_chunk}
+"""

+def ask_llm(text_chunk):
+    """向大模型提问并获取响应"""
+    prompt = PROMPT_TEMPLATE.format(text_chunk=text_chunk)
+    return call_qwen_plus(prompt)
+
+def process_document(input_path, output_dir):
+    """处理文档主函数"""
+    text = read_docx(input_path)
+    if not text:
+        print("无法读取输入文件内容")
+        return False
+    
+    # 确保输出目录存在
+    os.makedirs(output_dir, exist_ok=True)
+    
+    chunks = split_text(text)
+    for i, chunk in enumerate(chunks):
+        print(f"正在处理第{i+1}/{len(chunks)}个分块...")
+        response = ask_llm(chunk)
+        if response:
+            output_file = os.path.join(output_dir, f"{i+1}.txt")
+            save_to_txt(response, output_file, mode='w')
+    
+    print(f"处理完成，结果已保存到目录: {output_dir}")
+    return True

 if __name__ == "__main__":
    input_file = '../Txt/小学数学（史校长）.docx'
-    output_file = '../Txt/小学数学（史校长）.txt'
-    process_document(input_file, output_file)
+    output_dir = '../Txt/processed_chunks'  # 改为输出目录
+    process_document(input_file, output_dir)