You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

155 lines
4.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from openai import OpenAI
from Config import Config
# Module-level DeepSeek client, built with the OpenAI SDK class; the API key
# and base URL are both taken from the project Config.
client = OpenAI(api_key=Config.DEEPSEEK_API_KEY, base_url=Config.DEEPSEEK_URL)
def call_deepseek(prompt):
    """Send *prompt* to the DeepSeek chat model and return the reply text.

    The request is non-streaming, targets the ``deepseek-chat`` model with a
    fixed system prompt, and uses a temperature of 0.3.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The text content of the first choice. An empty string is returned
        when the call raises or when the reply carries no text content, so
        callers always receive a ``str``.
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )
        # ``message.content`` is nullable in the chat-completions schema;
        # normalise it so downstream ``.split`` calls never receive None.
        return response.choices[0].message.content or ""
    except Exception as e:
        # Deliberately best-effort: report the failure and fall back to "".
        print(f"调用DeepSeek API出错: {str(e)}")
        return ""
def ask_llm(text_chunk):
    """Wrap *text_chunk* in the paragraph-regrouping prompt and query the model.

    The prompt asks the model to merge strongly related paragraphs of the
    math teaching content, separate distinct paragraphs with two newlines,
    and return plain text without markdown.

    Args:
        text_chunk: The text to be analysed; substituted into the prompt.

    Returns:
        Whatever ``call_deepseek`` returns for the completed prompt.
    """
    # Joined with "\n", these lines reproduce the prompt template exactly,
    # including its leading and trailing newline.
    template_lines = [
        "",
        "请分析以下数学教学内容,直接返回处理后的文本:",
        "1. 根据每个段落间的逻辑关系,判断是不是强相关,一致内容的划分为同一个段落,否则视为两个段落。",
        "2. 不同段落间用两个换行符分隔",
        "3. 不要添加任何额外格式或标记绝对不要使用markdown格式返回。",
        "待处理内容:",
        "{text_chunk}",
        "",
    ]
    filled_prompt = "\n".join(template_lines).format(text_chunk=text_chunk)
    return call_deepseek(filled_prompt)
def process_response(response):
    """Return *response* unchanged.

    This is the revised handler: the raw model response is passed straight
    through and no formatting is applied.
    """
    return response
def split_text(text, chunk_size=6000):
    """Pack the non-empty lines of *text* into chunks of about *chunk_size*.

    Each line is stripped and blank lines are dropped. Lines are added to the
    current chunk until the next one would push the summed line lengths past
    ``chunk_size``; the chunk is then closed and a new one started. A single
    line longer than ``chunk_size`` is kept whole in its own chunk. Newline
    separators are not counted towards the length.

    Args:
        text: Source text whose paragraphs are separated by ``\\n``.
        chunk_size: Target upper bound on the summed line lengths per chunk.

    Returns:
        A list of chunks, each being its lines joined with ``\\n``.
    """
    pieces = []
    pending = []
    pending_len = 0

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        size = len(line)
        # Close the running chunk only if it already holds something;
        # otherwise an oversized line would produce an empty chunk.
        if pending and pending_len + size > chunk_size:
            pieces.append('\n'.join(pending))
            pending, pending_len = [], 0

        pending.append(line)
        pending_len += size

    if pending:
        pieces.append('\n'.join(pending))
    return pieces
def process_document(input_file, output_file):
    """Run the full pipeline: read a .docx, regroup paragraphs, save as text.

    The document text is read with ``read_docx``, split into chunks with
    ``split_text``, and each chunk is sent to the model through ``ask_llm``.
    The paragraphs extracted by ``process_llm_response`` are appended to
    *output_file* with ``save_to_txt``.

    A chunk whose request raises, or for which the model returns no content,
    is recorded in the output file as a failure line so the gap stays visible.

    Args:
        input_file: Path of the .docx file to process.
        output_file: Path of the text file that results are appended to.
    """
    text = read_docx(input_file)
    chunks = split_text(text)

    # read_docx returns "" on failure, which yields no chunks; do not claim
    # that results were saved when nothing was processed.
    if not chunks:
        print(f"未读取到可处理的内容: {input_file}")
        return

    for i, chunk in enumerate(chunks, 1):
        print(f"正在处理第{i}个段落...")
        try:
            response = ask_llm(chunk)
            # An empty response means the API call failed or returned
            # nothing; log it instead of silently dropping the chunk.
            if not response:
                save_to_txt(f"段落{i}处理失败: 大模型未返回内容", output_file)
                continue
            for para in process_llm_response(response):
                save_to_txt(para, output_file)
        except Exception as e:
            save_to_txt(f"段落{i}处理失败: {str(e)}", output_file)
    print(f"处理完成,结果已保存到 {output_file}")
def read_docx(file_path):
    """Return the text of a .docx file, one non-blank paragraph per line.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The texts of all paragraphs that are not blank, joined with ``\\n``.
        If opening or reading the document raises, the error is printed and
        an empty string is returned.
    """
    # Imported here, outside the try, so a missing python-docx package still
    # surfaces as an ImportError rather than being reported as a read error.
    from docx import Document

    try:
        document = Document(file_path)
        non_blank = (p.text for p in document.paragraphs if p.text.strip())
        return '\n'.join(non_blank)
    except Exception as err:
        print(f"读取docx文件出错: {str(err)}")
        return ""
def save_to_txt(content, file_path, mode='a'):
    """Write *content* followed by a blank line to a UTF-8 text file.

    Args:
        content: Text to write; two newlines are appended after it.
        file_path: Path of the target file.
        mode: File mode passed to ``open``; the default ``'a'`` appends.

    Returns:
        None. Any error while opening or writing is printed, not raised.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as out:
            out.write(content + '\n\n')
    except Exception as err:
        print(f"保存到txt文件出错: {str(err)}")
# When calling this from process_document, make sure the output_file argument
# is a complete file path.
def call_deepseek_api(prompt, stream_callback=None):
    """Send *prompt* to the DeepSeek chat model in streaming mode.

    Uses the same model, system prompt and temperature as ``call_deepseek``,
    but with ``stream=True``. Each non-empty text fragment is passed to
    *stream_callback* as it arrives.

    Args:
        prompt: Text sent as the user message.
        stream_callback: Optional callable invoked with each text fragment.

    Returns:
        The concatenation of all received fragments. An empty string is
        returned if the call raises, matching ``call_deepseek``.
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )
        fragments = []
        for chunk in response:
            # Some stream chunks carry no choices; indexing [0] on them
            # would raise and discard the whole reply.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                fragments.append(content)
                if stream_callback:
                    stream_callback(content)
        return "".join(fragments)
    except Exception as e:
        # Deliberately best-effort: report the failure and fall back to ""
        # so callers always receive a str.
        print(f"调用DeepSeek API出错: {str(e)}")
        return ""
def process_llm_response(response):
    """Split a model response into paragraphs at ``## `` header lines.

    A line starting with ``"## "`` opens a new paragraph, and the marker is
    removed from that line. All other lines are appended to the current
    paragraph. Text before the first header forms a paragraph of its own.
    Each paragraph is stripped of surrounding whitespace.

    Args:
        response: The model's reply text. ``None`` or an empty string is
            treated as "no content".

    Returns:
        A list of non-empty paragraph strings. Paragraphs that are empty
        after stripping are omitted, so callers never write blank entries.
    """
    if not response:
        return []

    paragraphs = []
    current_lines = []
    for line in response.split('\n'):
        if line.startswith('## '):
            finished = '\n'.join(current_lines).strip()
            if finished:
                paragraphs.append(finished)
            current_lines = [line[3:]]  # drop the "## " marker
        else:
            current_lines.append(line)

    # Flush whatever is left after the final header.
    finished = '\n'.join(current_lines).strip()
    if finished:
        paragraphs.append(finished)
    return paragraphs
if __name__ == "__main__":
    # Script entry point: convert the sample .docx into a text file that sits
    # next to it in ../Txt.
    process_document(
        '../Txt/小学数学(史校长).docx',
        '../Txt/小学数学(史校长).txt',
    )