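"""Read a .docx document, split it into ~6000-character chunks, have Qwen
re-segment each chunk into coherent paragraphs, and save the results as
numbered .txt files."""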
from Config.Config import MODEL_API_KEY, MODEL_NAME
from openai import OpenAI
import docx
import os


# Initialize the Qwen (Tongyi Qianwen) client via the OpenAI-compatible DashScope endpoint
client = OpenAI(
    api_key=MODEL_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)


def call_qwen_plus(prompt, stream_callback=None):
    """Call the Qwen API with streaming enabled and return the full response text."""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                # System prompt (Chinese): "You are a professional document analysis assistant"
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )
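        # Accumulate the streamed deltas into a single string, forwarding each
        # piece to the optional callback so callers can display output in real time.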
        full_response = ""
        for chunk in response:
            content = chunk.choices[0].delta.content
            if content:
                full_response += content
                if stream_callback:
                    stream_callback(content)
        return full_response
    except Exception as e:
        print(f"Error calling the Qwen API: {str(e)}")
        return ""


def read_docx(file_path):
"""读取docx文件内容"""
try:
1 month ago
doc = docx.Document(file_path)
return "\n".join([para.text for para in doc.paragraphs if para.text])
1 month ago
except Exception as e:
print(f"读取docx文件出错: {str(e)}")
return ""
def save_to_txt(content, file_path, mode='a'):
    """Write content to a txt file (appends by default)."""
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content + "\n")
        return True
    except Exception as e:
        print(f"Error saving file: {str(e)}")
        return False


def split_text(text, chunk_size=6000):
"""按约6000字符分割文本优先在段落结束处分隔"""
chunks = []
current_chunk = ""
paragraphs = text.split('\n\n')
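    # Greedily pack paragraphs into the current chunk and start a new chunk once the
    # limit would be exceeded; a single oversized paragraph still becomes its own chunk.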
    for para in paragraphs:
        if len(current_chunk) + len(para) > chunk_size and current_chunk:
            chunks.append(current_chunk)
            current_chunk = para
        else:
            if current_chunk:
                current_chunk += '\n\n' + para
            else:
                current_chunk = para
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


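# Prompt sent to the model (in Chinese, matching the source documents): it asks the model
# to group strongly related content into the same paragraph, separate distinct paragraphs
# with two newlines, and return plain text without any extra formatting or Markdown.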
PROMPT_TEMPLATE = """
请分析以下数学教学内容,直接返回处理后的文本:
1. 根据每个段落间的逻辑关系判断是不是强相关,一致内容的划分为同一个段落,否则视为两个段落
2. 不同段落间用两个换行符分隔
3. 不要添加任何额外格式或标记,绝对不要使用markdown格式返回

待处理内容:
{text_chunk}
"""


def ask_llm(text_chunk, is_final=False):
    """Call the LLM on one text chunk and return its response (is_final is currently unused)."""
    prompt = PROMPT_TEMPLATE.format(text_chunk=text_chunk)

    def stream_callback(chunk):
        """Streaming callback: print each piece of the response as it arrives."""
        print(chunk, end='', flush=True)

    try:
        response = call_qwen_plus(prompt, stream_callback=stream_callback)
        return response
    except Exception as e:
        print(f"Error calling the LLM: {str(e)}")
        return None


def process_document(input_path, output_dir):
"""处理文档主函数"""
text = read_docx(input_path)
if not text:
print("无法读取输入文件内容")
return False
# 确保输出目录存在
os.makedirs(output_dir, exist_ok=True)
chunks = split_text(text)
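    # Send each chunk to the LLM and write the processed text to a numbered file
    # (1.txt, 2.txt, ...) in the output directory; chunks with empty responses are skipped.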
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        response = ask_llm(chunk)
        if response:
            output_file = os.path.join(output_dir, f"{i+1}.txt")
            save_to_txt(response, output_file, mode='w')
    print(f"Processing complete. Results saved to directory: {output_dir}")
    return True


if __name__ == "__main__":
    input_file = '../Txt/小学数学(史校长).docx'
    output_dir = '../Txt/processed_chunks'  # output directory
    process_document(input_file, output_dir)
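    # Note: the relative paths above resolve against the current working directory,
    # so the script is presumably run from its own directory.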