from openai import OpenAI
from Config import Config
class DocxProcessor:
    def __init__(self):
        # Initialize a dedicated DeepSeek client for this instance
        self.client = OpenAI(
            api_key=Config.DEEPSEEK_API_KEY,
            base_url=Config.DEEPSEEK_URL
        )

    def call_deepseek(self, prompt):
        """Call the DeepSeek API using this instance's own client."""
        try:
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "你是一个专业的文档分析助手"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"DeepSeek API call failed: {str(e)}")
            return ""

    def ask_llm(self, text_chunk):
        """Ask the LLM to analyse a text chunk and return its response."""
        prompt = f"""请分析以下文本段落并返回有价值的内容:
要求:
1. 保持原文关键信息
2. 用清晰的格式返回
3. 可包含简要总结
文本段落内容:
{text_chunk}"""
        return self.call_deepseek(prompt)
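
# Usage sketch (illustrative only; nothing else in this script calls the class directly,
# the placeholder input below is hypothetical):
#
#     processor = DocxProcessor()
#     summary = processor.ask_llm("<paragraph text to analyse>")
#     print(summary)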

# Module-level DeepSeek client used by the standalone functions below
client = OpenAI(
    api_key=Config.DEEPSEEK_API_KEY,
    base_url=Config.DEEPSEEK_URL
)

def split_text(text, chunk_size=6000):
    """Split text by paragraph so that each chunk stays close to chunk_size characters."""
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    chunks = []
    current_chunk = []
    current_length = 0
    for para in paragraphs:
        para_length = len(para)
        # Start a new chunk once adding this paragraph would exceed the limit
        if current_length + para_length > chunk_size and current_chunk:
            chunks.append('\n'.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(para)
        current_length += para_length
    if current_chunk:
        chunks.append('\n'.join(current_chunk))
    return chunks
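
# Example (hypothetical numbers): a document of roughly 15,000 characters yields
# two to three chunks, each assembled from whole paragraphs and close to chunk_size:
#
#     chunks = split_text(read_docx(input_file))
#     print([len(c) for c in chunks])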

def ask_llm(text_chunk):
    """Ask the LLM to analyse a text chunk and return its response."""
    prompt = f"""请分析以下文本段落并返回有价值的内容:
要求:
1. 保持原文关键信息
2. 用清晰的格式返回
3. 可包含简要总结
文本段落内容:
{text_chunk}"""
    return call_deepseek_api(prompt)

def process_document(input_file, output_file):
    """Main pipeline: read the document, split it, query the LLM, save the results."""
    text = read_docx(input_file)
    chunks = split_text(text)
    for i, chunk in enumerate(chunks, 1):
        print(f"Processing chunk {i}...")
        try:
            response = ask_llm(chunk)
            save_to_txt(f"段落{i}响应:\n{response}", output_file)
        except Exception as e:
            save_to_txt(f"段落{i}处理失败: {str(e)}", output_file)
    print(f"Done. Results saved to {output_file}")

def read_docx(file_path):
    """Read the text content of a .docx file."""
    from docx import Document  # lazy import so python-docx is only needed here
    try:
        doc = Document(file_path)
        return '\n'.join([para.text for para in doc.paragraphs if para.text.strip()])
    except Exception as e:
        print(f"Failed to read docx file: {str(e)}")
        return ""

def save_to_txt(content, file_path, mode='a'):
    """Append content to a UTF-8 txt file."""
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content + '\n\n')
    except Exception as e:
        print(f"Failed to write to txt file: {str(e)}")

# When calling this from process_document, make sure output_file is a complete file path

def call_deepseek_api(prompt, stream_callback=None):
    """Call the DeepSeek API in streaming mode and return the full response text."""
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是一个专业的文档分析助手"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            stream=True
        )
        full_response = ""
        for chunk in response:
            # Each streamed chunk carries an incremental delta of the reply
            if chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                full_response += content
                if stream_callback:
                    stream_callback(content)
        return full_response
    except Exception as e:
        print(f"DeepSeek API call failed: {str(e)}")
        return ""

if __name__ == "__main__":
    input_file = '../Txt/小学数学(史校长).docx'
    output_file = '../Txt/小学数学(史校长).txt'
    process_document(input_file, output_file)