|
|
|
@ -1,154 +1,117 @@
|
|
|
|
|
from Config.Config import MODEL_API_KEY, MODEL_NAME
|
|
|
|
|
from openai import OpenAI
|
|
|
|
|
from Config import Config
|
|
|
|
|
import docx
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
# Initialise the OpenAI-compatible client for Tongyi Qianwen (DashScope).
# Two revisions of this call were interleaved here, which repeated the
# ``api_key`` and ``base_url`` keyword arguments inside a single call (a
# SyntaxError). Only the DashScope settings are kept; the superseded ones were
# ``Config.DEEPSEEK_API_KEY`` and ``Config.DEEPSEEK_URL``.
client = OpenAI(
    api_key=MODEL_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
|
|
|
|
|
|
|
|
|
|
def call_deepseek(prompt):
    """Send ``prompt`` as one non-streaming chat completion and return the reply.

    The request goes through the module-level ``client`` with the hard-coded
    model name ``deepseek-chat``.

    Args:
        prompt: Text placed in the user message.

    Returns:
        The content of the first choice, or ``""`` if the request raised
        (the error is printed, not re-raised).
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort contract: callers treat an empty string as failure.
        print(f"调用DeepSeek API出错: {str(e)}")
        return ""


def call_qwen_plus(prompt, stream_callback=None):
    """Stream a chat completion for ``prompt`` and return the full reply text.

    The request goes through the module-level ``client`` using ``MODEL_NAME``
    with ``stream=True``.

    Args:
        prompt: Text placed in the user message.
        stream_callback: Optional callable invoked with each non-empty text
            fragment as it arrives.

    Returns:
        All fragments concatenated, or ``""`` if the request or the stream
        raised (the error is printed and any partial text is discarded).
    """
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            stream=True,
        )

        fragments = []
        for chunk in response:
            # A streamed chunk may carry an empty ``choices`` list (for
            # example a usage-only chunk); indexing it would raise and throw
            # away everything received so far.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                fragments.append(content)
                if stream_callback:
                    stream_callback(content)

        return "".join(fragments)
    except Exception as e:
        # Best-effort contract: callers treat an empty string as failure.
        print(f"调用通义千问API出错: {str(e)}")
        return ""
|
|
|
|
|
|
|
|
|
|
def ask_llm(text_chunk):
    """Wrap ``text_chunk`` in the paragraph-regrouping prompt and query the model.

    Args:
        text_chunk: The text to insert into the prompt's ``{text_chunk}`` slot.

    Returns:
        Whatever ``call_deepseek`` returns for the formatted prompt.
    """
    template = """
请分析以下数学教学内容,直接返回处理后的文本:
1. 根据每个段落间的逻辑关系,判断是不是强相关,一致内容的划分为同一个段落,否则视为两个段落。
2. 不同段落间用两个换行符分隔
3. 不要添加任何额外格式或标记,绝对不要使用markdown格式返回。

待处理内容:
{text_chunk}
"""
    return call_deepseek(template.format(text_chunk=text_chunk))
|
|
|
|
|
|
|
|
|
|
# Revised response handler.
def process_response(response):
    """Return the model reply exactly as received.

    This is a pass-through hook: no parsing or reformatting is applied.

    Args:
        response: The reply to hand back.

    Returns:
        The same object that was passed in.
    """
    untouched = response
    return untouched
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_text(text, chunk_size=6000):
|
|
|
|
|
"""按段落分割文本,确保每个块接近6000字"""
|
|
|
|
|
paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
|
|
|
|
|
chunks = []
|
|
|
|
|
current_chunk = []
|
|
|
|
|
current_length = 0
|
|
|
|
|
|
|
|
|
|
for para in paragraphs:
|
|
|
|
|
para_length = len(para)
|
|
|
|
|
if current_length + para_length > chunk_size and current_chunk:
|
|
|
|
|
chunks.append('\n'.join(current_chunk))
|
|
|
|
|
current_chunk = []
|
|
|
|
|
current_length = 0
|
|
|
|
|
current_chunk.append(para)
|
|
|
|
|
current_length += para_length
|
|
|
|
|
|
|
|
|
|
if current_chunk:
|
|
|
|
|
chunks.append('\n'.join(current_chunk))
|
|
|
|
|
return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_document(input_file, output_file):
    """Read a .docx file, regroup its text with the model, and append the result to one file.

    The document text is split with ``split_text``; each chunk is sent through
    ``ask_llm``, the reply is parsed by ``process_llm_response``, and every
    resulting paragraph is written to ``output_file`` with ``save_to_txt``
    (append mode by default). If a chunk raises, a failure note for that chunk
    is written to the output file instead and processing continues.

    Args:
        input_file: Path of the .docx file to read.
        output_file: Path of the text file that receives the paragraphs.

    Returns:
        None.
    """
    text = read_docx(input_file)
    if not text:
        # read_docx returns "" on failure; without this guard the function
        # would report success although nothing was processed or saved.
        print("无法读取输入文件内容")
        return

    chunks = split_text(text)

    for i, chunk in enumerate(chunks, 1):
        print(f"正在处理第{i}个段落...")
        try:
            response = ask_llm(chunk)
            paragraphs = process_llm_response(response)
            for para in paragraphs:
                save_to_txt(para, output_file)
        except Exception as e:
            # Record the failure in the output so the gap is visible there.
            save_to_txt(f"段落{i}处理失败: {str(e)}", output_file)

    print(f"处理完成,结果已保存到 {output_file}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_docx(file_path):
    """Return the text of the paragraphs in a .docx file, one per line.

    Paragraphs whose text is empty or whitespace-only are skipped; the rest
    are joined with a single ``"\\n"``.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The joined paragraph text, or ``""`` if opening or reading the
        document raised (the error is printed, not re-raised).
    """
    # Local import keeps this function usable on its own; python-docx is a
    # third-party package.
    from docx import Document

    try:
        doc = Document(file_path)
        return '\n'.join(
            para.text for para in doc.paragraphs if para.text.strip()
        )
    except Exception as e:
        # Best-effort contract: callers treat an empty string as failure.
        print(f"读取docx文件出错: {str(e)}")
        return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_to_txt(content, file_path, mode='a'):
    """Write ``content`` plus a trailing newline to a UTF-8 text file.

    Args:
        content: The string to write.
        file_path: Destination file path.
        mode: Mode passed to ``open``; ``'a'`` (default) appends and ``'w'``
            overwrites.

    Returns:
        True if the write succeeded, False if it raised (the error is
        printed, not re-raised).
    """
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content + "\n")
        return True
    except Exception as e:
        # Best-effort contract: report the problem and signal it via False.
        print(f"保存文件出错: {str(e)}")
        return False
|
|
|
|
|
|
|
|
|
|
def _paragraph_units(para, chunk_size):
    """Break one blank-line-delimited paragraph into ``(separator, text)`` units.

    A paragraph that fits within ``chunk_size`` (or any paragraph when
    ``chunk_size <= 0``) is returned as a single unit. An oversized paragraph
    is split on single newlines, and any line that is still too long is cut
    into ``chunk_size``-character slices. The separator is the string to put
    before the unit when it is appended to a non-empty chunk.
    """
    if chunk_size <= 0 or len(para) <= chunk_size:
        return [('\n\n', para)]

    units = []
    for line in para.split('\n'):
        if len(line) <= chunk_size:
            pieces = [line]
        else:
            pieces = [
                line[start:start + chunk_size]
                for start in range(0, len(line), chunk_size)
            ]
        for index, piece in enumerate(pieces):
            # Slices of the same line carry no separator between them.
            units.append(('\n' if index == 0 else '', piece))

    # The first unit still starts a new paragraph relative to what precedes it.
    units[0] = ('\n\n', units[0][1])
    return units


def split_text(text, chunk_size=6000):
    """Split ``text`` into chunks of about ``chunk_size`` characters.

    Paragraphs (separated by a blank line, i.e. ``"\\n\\n"``) are packed
    greedily and re-joined with ``"\\n\\n"``; separators are not counted
    against ``chunk_size``, so a chunk can exceed it slightly. A paragraph
    that is itself longer than ``chunk_size`` is subdivided on single
    newlines, then by hard slices. Without this, text containing no blank
    lines (which is what ``read_docx`` in this file produces) would come back
    as one chunk of unbounded size.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of characters per chunk. Values
            ``<= 0`` disable the subdivision of oversized paragraphs.

    Returns:
        A list of chunk strings; empty when ``text`` is empty.
    """
    chunks = []
    current_chunk = ""

    for para in text.split('\n\n'):
        for separator, piece in _paragraph_units(para, chunk_size):
            if current_chunk and len(current_chunk) + len(piece) > chunk_size:
                chunks.append(current_chunk)
                current_chunk = piece
            elif current_chunk:
                current_chunk += separator + piece
            else:
                current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk)

    return chunks
|
|
|
|
|
|
|
|
|
|
# NOTE: when calling the process_document(input_file, output_file) variant, pass output_file as a complete file path.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_deepseek_api(prompt, stream_callback=None):
    """Stream a ``deepseek-chat`` completion for ``prompt`` and return the full reply.

    The request goes through the module-level ``client`` with ``stream=True``.

    Args:
        prompt: Text placed in the user message.
        stream_callback: Optional callable invoked with each non-empty text
            fragment as it arrives.

    Returns:
        All fragments concatenated, or ``""`` if the request or the stream
        raised (the error is printed and any partial text is discarded).
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            stream=True,
        )

        fragments = []
        for chunk in response:
            # A streamed chunk may carry an empty ``choices`` list; indexing
            # it would raise and discard everything received so far.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                fragments.append(content)
                if stream_callback:
                    stream_callback(content)

        return "".join(fragments)
    except Exception as e:
        print(f"调用DeepSeek API出错: {str(e)}")
        # Return an empty string rather than falling through to None, so the
        # result is always a str like the other API helpers in this file.
        return ""
|
|
|
|
|
|
|
|
|
|
# Prompt sent to the model; ``{text_chunk}`` is filled in with str.format.
PROMPT_TEMPLATE = """
请分析以下数学教学内容,直接返回处理后的文本:
1. 根据每个段落间的逻辑关系,判断是不是强相关,一致内容的划分为同一个段落,否则视为两个段落。
2. 不同段落间用两个换行符分隔
3. 不要添加任何额外格式或标记,绝对不要使用markdown格式返回。

待处理内容:
{text_chunk}
"""


def process_llm_response(response):
    """Split a model reply into paragraphs at lines that start with ``'## '``.

    Each ``'## '`` line opens a new paragraph (the marker itself is removed);
    all following lines belong to it until the next marker. Text before the
    first marker forms its own paragraph. Every paragraph is stripped, and
    paragraphs that end up empty are dropped, so an empty reply yields ``[]``
    rather than ``['']``.

    Args:
        response: The reply text; ``None`` or ``""`` is treated as no content.

    Returns:
        A list of non-empty paragraph strings in their original order.
    """
    if not response:
        return []

    paragraphs = []
    buffered = []
    for line in response.split('\n'):
        if line.startswith('## '):
            paragraphs.append('\n'.join(buffered).strip())
            buffered = [line[3:]]  # drop the '## ' marker
        else:
            buffered.append(line)
    paragraphs.append('\n'.join(buffered).strip())

    # Leading blank lines or empty headings would otherwise produce ''.
    return [para for para in paragraphs if para]
|
|
|
|
|
|
|
|
|
|
def ask_llm(text_chunk):
    """Fill ``PROMPT_TEMPLATE`` with ``text_chunk`` and query the model.

    Args:
        text_chunk: The text to insert into the template's ``{text_chunk}`` slot.

    Returns:
        Whatever ``call_qwen_plus`` returns for the formatted prompt.
    """
    return call_qwen_plus(PROMPT_TEMPLATE.format(text_chunk=text_chunk))
|
|
|
|
|
|
|
|
|
|
def process_document(input_path, output_dir):
    """Read a .docx file, regroup its text with the model, and write one file per chunk.

    The document text is split with ``split_text``; each chunk is sent through
    ``ask_llm`` and a non-empty reply is written to ``<output_dir>/<n>.txt``
    (numbered from 1, overwriting any existing file). Chunks whose reply is
    empty, or whose file could not be written, are reported at the end instead
    of being skipped silently.

    Args:
        input_path: Path of the .docx file to read.
        output_dir: Directory for the per-chunk files; created if missing.

    Returns:
        False if no text could be read from ``input_path``; True otherwise,
        including when individual chunks failed.
    """
    text = read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    chunks = split_text(text)
    total = len(chunks)
    failed = []
    for number, chunk in enumerate(chunks, 1):
        print(f"正在处理第{number}/{total}个分块...")
        response = ask_llm(chunk)
        if not response:
            failed.append(number)
            continue
        output_file = os.path.join(output_dir, f"{number}.txt")
        if not save_to_txt(response, output_file, mode='w'):
            failed.append(number)

    if failed:
        print(f"以下分块处理失败,未生成输出: {failed}")
    print(f"处理完成,结果已保存到目录: {output_dir}")
    return True
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: process the sample document into per-chunk files.
    # Two revisions of this call were interleaved here; running both would
    # process the document twice, once with a ``.txt`` path used as the output
    # location. Only the directory-based call is kept.
    input_file = '../Txt/小学数学(史校长).docx'
    output_dir = '../Txt/processed_chunks'  # output directory, not a single file
    process_document(input_file, output_dir)
|
|
|
|
|