You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

155 lines
4.9 KiB

1 month ago
from openai import OpenAI
from Config import Config
1 month ago
# Module-level DeepSeek client built with the OpenAI SDK class; the API key and
# endpoint URL are both read from the project Config.
client = OpenAI(base_url=Config.DEEPSEEK_URL, api_key=Config.DEEPSEEK_API_KEY)
def call_deepseek(prompt):
    """Send a single-turn chat request to DeepSeek and return the reply text.

    The request uses the ``deepseek-chat`` model with a fixed system message
    and a temperature of 0.3.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The text of the first choice. An empty string is returned when the
        request raises or when the reply carries no text content, so the
        result is always a ``str``.
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
        )
        # ``message.content`` is Optional in the SDK; normalise None to "" so
        # the success path and the failure path return the same type.
        return response.choices[0].message.content or ""
    except Exception as e:
        # Deliberately best-effort: report the error and return an empty string.
        print(f"调用DeepSeek API出错: {str(e)}")
        return ""
1 month ago
1 month ago
def ask_llm(text_chunk):
    """Ask the model to regroup the paragraphs of *text_chunk* and return its reply.

    The prompt instructs the model to merge strongly related paragraphs, to
    separate distinct paragraphs with two newlines, and to return plain text
    without markdown.

    Args:
        text_chunk: Text inserted into the prompt as the content to process.

    Returns:
        Whatever ``call_deepseek`` returns for the formatted prompt.
    """
    # The template body is sent verbatim, so its lines stay flush-left to avoid
    # adding indentation to the prompt.
    template = """
请分析以下数学教学内容直接返回处理后的文本
1. 根据每个段落间的逻辑关系判断是不是强相关一致内容的划分为同一个段落否则视为两个段落
2. 不同段落间用两个换行符分隔
3. 不要添加任何额外格式或标记绝对不要使用markdown格式返回
待处理内容
{text_chunk}
"""
    return call_deepseek(template.format(text_chunk=text_chunk))
1 month ago
1 month ago
# Revised processing function.
def process_response(response):
    """Return *response* exactly as received; no formatting is applied."""
    return response
1 month ago
def split_text(text, chunk_size=6000):
    """Split *text* into newline-joined chunks of roughly *chunk_size* characters.

    Each line of *text* is stripped and blank lines are dropped. The remaining
    paragraphs are packed in order: a chunk is closed as soon as adding the
    next paragraph would push its character count past *chunk_size* (the
    newline joiners are not counted). A single paragraph longer than
    *chunk_size* is cut into consecutive slices of at most *chunk_size*
    characters, so one very long paragraph can no longer produce an
    oversized chunk.

    Args:
        text: The full text to split; paragraphs are separated by newlines.
        chunk_size: Target maximum number of characters per chunk. A
            non-positive value disables slicing, in which case every
            paragraph becomes its own chunk.

    Returns:
        A list of chunk strings; empty when *text* has no non-blank lines.
    """
    if not text:
        return []

    # Slice width for oversized paragraphs; int() keeps range() happy if a
    # float limit is passed.
    step = int(chunk_size)

    chunks = []
    current_chunk = []
    current_length = 0

    for line in text.split('\n'):
        para = line.strip()
        if not para:
            continue

        if step > 0 and len(para) > chunk_size:
            pieces = [para[i:i + step] for i in range(0, len(para), step)]
        else:
            pieces = [para]

        for piece in pieces:
            # Close the running chunk before it would overflow, but never
            # emit an empty chunk.
            if current_chunk and current_length + len(piece) > chunk_size:
                chunks.append('\n'.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(piece)
            current_length += len(piece)

    if current_chunk:
        chunks.append('\n'.join(current_chunk))
    return chunks
def process_document(input_file, output_file):
    """Run the full pipeline: read a docx, regroup paragraphs via the LLM, save text.

    The document text is read with ``read_docx``, cut into chunks with
    ``split_text``, and each chunk is sent to ``ask_llm``. The reply is split
    by ``process_llm_response`` and every non-empty paragraph is written to
    *output_file* with ``save_to_txt``. If a chunk fails, including the case
    where the model reply is empty, a failure line for that chunk is written
    instead and processing continues with the next chunk.

    Args:
        input_file: Path of the .docx file to read.
        output_file: Path of the text file that results are written to.
    """
    text = read_docx(input_file)
    chunks = split_text(text)

    for i, chunk in enumerate(chunks, 1):
        print(f"正在处理第{i}个段落...")
        try:
            response = ask_llm(chunk)
            # ask_llm yields "" when the API call fails (the error is swallowed
            # in call_deepseek); treat that as a failure so the chunk is not
            # silently lost.
            if not response or not response.strip():
                raise ValueError("大模型返回空响应")

            for para in process_llm_response(response):
                if para:
                    save_to_txt(para, output_file)
        except Exception as e:
            # Record the failure in the output and keep going.
            save_to_txt(f"段落{i}处理失败: {str(e)}", output_file)

    print(f"处理完成,结果已保存到 {output_file}")
def read_docx(file_path):
    """Return the text of a .docx file, one non-blank paragraph per line.

    Args:
        file_path: Path of the document to open.

    Returns:
        The non-blank paragraph texts joined with newlines, or an empty string
        if opening or reading the document raises (the error is printed).
    """
    from docx import Document

    try:
        kept = []
        for paragraph in Document(file_path).paragraphs:
            # Skip paragraphs that contain only whitespace.
            if paragraph.text.strip():
                kept.append(paragraph.text)
        return '\n'.join(kept)
    except Exception as err:
        print(f"读取docx文件出错: {err!s}")
        return ""
1 month ago
1 month ago
def save_to_txt(content, file_path, mode='a'):
    """Write *content* followed by a blank line to a UTF-8 text file.

    Args:
        content: Text to write; two newlines are added after it.
        file_path: Path of the target file.
        mode: Mode passed to ``open``; defaults to append.

    Any exception raised while opening or writing is printed, not propagated.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as out:
            out.write(content + '\n\n')
    except Exception as err:
        print(f"保存到txt文件出错: {err!s}")
1 month ago
1 month ago
# When calling process_document, make sure the output_file argument is a complete file path.
def call_deepseek_api(prompt, stream_callback=None):
    """Stream a chat completion from DeepSeek and return the assembled text.

    The request uses the ``deepseek-chat`` model with a fixed system message,
    a temperature of 0.3 and ``stream=True``.

    Args:
        prompt: Text sent as the user message.
        stream_callback: Optional callable invoked with each non-empty text
            fragment as it arrives.

    Returns:
        The concatenation of all received fragments, or an empty string if
        the request or the stream raises (the error is printed). This matches
        the failure value of ``call_deepseek``.
    """
    fragments = []
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            stream=True,
        )

        for chunk in response:
            # Guard against stream events whose ``choices`` list is empty;
            # indexing [0] on them would abort the whole stream.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                fragments.append(content)
                if stream_callback:
                    stream_callback(content)

        return "".join(fragments)
    except Exception as e:
        # Deliberately best-effort: report the error and return an empty string.
        print(f"调用DeepSeek API出错: {str(e)}")
        return ""
1 month ago
def process_llm_response(response):
    """Split a model reply into paragraphs at lines starting with ``## ``.

    A line beginning with ``## `` starts a new paragraph, and the marker itself
    is removed. All other lines are appended to the current paragraph. Each
    paragraph is stripped of surrounding whitespace, and paragraphs that end
    up empty are dropped.

    Args:
        response: The reply text; ``None`` or an empty string is accepted.

    Returns:
        A list of non-empty paragraph strings, in order of appearance. A reply
        without any ``## `` line yields a single paragraph (or an empty list
        when it contains only whitespace).
    """
    if not response:
        return []

    # The first group collects any text that precedes the first heading.
    groups = [[]]
    for line in response.split('\n'):
        if line.startswith('## '):
            groups.append([line[3:]])  # drop the "## " marker
        else:
            groups[-1].append(line)

    stripped = ('\n'.join(group).strip() for group in groups)
    return [para for para in stripped if para]
1 month ago
if __name__ == "__main__":
    # Script entry point: convert the sample docx into a text file next to it.
    input_file, output_file = (
        '../Txt/小学数学(史校长).docx',
        '../Txt/小学数学(史校长).txt',
    )
    process_document(input_file, output_file)