"""Re-segment a .docx document into logical paragraphs with the Qwen model.

Pipeline: read the document text, split it into size-bounded chunks, send
each chunk to the model through the OpenAI-compatible endpoint, and write
each model response to ``<output_dir>/<n>.txt``.
"""
import os

import docx
from openai import OpenAI

from Config.Config import MODEL_API_KEY, MODEL_NAME

# Initialise the Qwen (Tongyi Qianwen) client via the OpenAI-compatible endpoint.
client = OpenAI(
    api_key=MODEL_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)


def call_qwen_plus(prompt, stream_callback=None):
    """Send ``prompt`` to the model and return the complete streamed reply.

    Args:
        prompt: User message sent to the model.
        stream_callback: Optional callable invoked with each non-empty text
            fragment as it arrives.

    Returns:
        The concatenated reply text, or ``""`` if the request or the stream
        fails (the error is printed, never raised).
    """
    fragments = []
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )

        for chunk in response:
            # A stream chunk may carry no choices at all (e.g. a usage-only
            # chunk); indexing [0] on it would abort the whole reply.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                fragments.append(content)
                if stream_callback:
                    stream_callback(content)

        return "".join(fragments)
    except Exception as e:
        # Best-effort boundary: callers treat "" as "no answer".
        print(f"调用通义千问API出错: {str(e)}")
        return ""


def read_docx(file_path):
    """Return the text of a .docx file, one non-empty paragraph per line.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        Paragraph texts joined with a single newline, or ``""`` if the file
        cannot be read (the error is printed, never raised).
    """
    try:
        doc = docx.Document(file_path)
        texts = [para.text for para in doc.paragraphs]
        return "\n".join(text for text in texts if text)
    except Exception as e:
        print(f"读取docx文件出错: {str(e)}")
        return ""


def save_to_txt(content, file_path, mode='a'):
    """Write ``content`` plus a trailing newline to a UTF-8 text file.

    Args:
        content: Text to write.
        file_path: Destination file path.
        mode: File open mode; defaults to append (``'a'``).

    Returns:
        ``True`` on success, ``False`` if writing failed (the error is
        printed, never raised).
    """
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content + "\n")
        return True
    except Exception as e:
        print(f"保存文件出错: {str(e)}")
        return False


def split_text(text, chunk_size=6000):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Paragraphs are delimited by newlines (``read_docx`` joins paragraphs with
    a single newline; blank lines are ignored). Whole paragraphs are packed
    greedily into a chunk and joined with a blank line. A single paragraph
    longer than ``chunk_size`` is cut into ``chunk_size``-sized pieces so that
    no chunk ever exceeds the limit.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        List of chunk strings, in document order. Empty if ``text`` holds no
        non-blank paragraph.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    separator = "\n\n"
    chunks = []
    pending = []      # paragraphs collected for the chunk being built
    pending_len = 0   # length of separator.join(pending)

    for para in text.split("\n"):
        if not para.strip():
            continue
        # Paragraphs within the limit yield exactly one piece (themselves).
        for start in range(0, len(para), chunk_size):
            piece = para[start:start + chunk_size]
            added = len(piece) + (len(separator) if pending else 0)
            if pending and pending_len + added > chunk_size:
                chunks.append(separator.join(pending))
                pending = [piece]
                pending_len = len(piece)
            else:
                pending.append(piece)
                pending_len += added

    if pending:
        chunks.append(separator.join(pending))

    return chunks


# Prompt sent to the model for every chunk; ``{text_chunk}`` is filled in by
# ``ask_llm``.
PROMPT_TEMPLATE = """
请分析以下数学教学内容,直接返回处理后的文本:
1. 根据每个段落间的逻辑关系,判断是不是强相关,一致内容的划分为同一个段落,否则视为两个段落。
2. 不同段落间用两个换行符分隔
3. 不要添加任何额外格式或标记,绝对不要使用markdown格式返回。

待处理内容:
{text_chunk}
"""


def ask_llm(text_chunk):
    """Ask the model to re-segment ``text_chunk`` and return its reply.

    Returns:
        The model's reply text, or ``""`` if the call failed.
    """
    prompt = PROMPT_TEMPLATE.format(text_chunk=text_chunk)
    return call_qwen_plus(prompt)


def process_document(input_path, output_dir):
    """Process a .docx file chunk by chunk and save each model reply.

    The reply for chunk ``n`` (1-based) is written to
    ``<output_dir>/<n>.txt``, overwriting any existing file.

    Args:
        input_path: Path of the .docx file to process.
        output_dir: Directory that receives the per-chunk text files; it is
            created if missing.

    Returns:
        ``True`` only if the document was read and every chunk was processed
        and saved; ``False`` otherwise.
    """
    text = read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # Make sure the output directory exists.
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print(f"创建输出目录出错: {str(e)}")
        return False

    chunks = split_text(text)
    if not chunks:
        # The document contained only blank paragraphs.
        print("无法读取输入文件内容")
        return False

    failed = 0
    for index, chunk in enumerate(chunks, start=1):
        print(f"正在处理第{index}/{len(chunks)}个分块...")
        response = ask_llm(chunk)
        if not response:
            print(f"第{index}个分块处理失败: 模型未返回内容")
            failed += 1
            continue
        output_file = os.path.join(output_dir, f"{index}.txt")
        # save_to_txt prints its own error message on failure.
        if not save_to_txt(response, output_file, mode='w'):
            failed += 1

    if failed:
        print(f"处理结束,{failed}/{len(chunks)}个分块失败,已完成的结果保存在目录: {output_dir}")
        return False

    print(f"处理完成,结果已保存到目录: {output_dir}")
    return True


if __name__ == "__main__":
    input_file = '../Txt/小学数学(史校长).docx'
    output_dir = '../Txt/processed_chunks'  # output directory, one file per chunk
    process_document(input_file, output_dir)