'commit'
This commit is contained in:
Binary file not shown.
@@ -21,7 +21,7 @@ mapping = {
|
|||||||
"properties": {
|
"properties": {
|
||||||
"embedding": {
|
"embedding": {
|
||||||
"type": "dense_vector",
|
"type": "dense_vector",
|
||||||
"dims": 200, # embedding维度为200
|
"dims": 1024, # embedding维度为1024
|
||||||
"index": True,
|
"index": True,
|
||||||
"similarity": "l2_norm" # 使用L2距离
|
"similarity": "l2_norm" # 使用L2距离
|
||||||
},
|
},
|
||||||
|
39
dsSchoolBuddy/ElasticSearch/T2_BgeM3.py
Normal file
39
dsSchoolBuddy/ElasticSearch/T2_BgeM3.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||||
|
from langchain_openai import OpenAIEmbeddings
|
||||||
|
from langchain_core.vectorstores import InMemoryVectorStore
|
||||||
|
import os
|
||||||
|
|
||||||
|
from Config.Config import EMBED_MODEL_NAME, EMBED_BASE_URL, EMBED_API_KEY
|
||||||
|
|
||||||
|
# Point the OpenAI-compatible client at the embedding service from the project config.
os.environ["OPENAI_BASE_URL"] = EMBED_BASE_URL
os.environ["OPENAI_API_KEY"] = EMBED_API_KEY

# Load the PDF; the number of loaded documents is reported as the page count.
file_path = "../langchain/data/0001.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()
print(f"文档页数:{len(docs)} 页")

# Split the documents into 500-character chunks that overlap by 100 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

# A bare `len(all_splits)` only displays in a notebook; print it so the script reports it too.
print(f"文档切片数:{len(all_splits)} 个")

# Embedding model.
# NOTE(review): if the configured endpoint is not OpenAI itself it may reject the
# token-array input this client can send; check whether
# `check_embedding_ctx_length=False` is needed — TODO confirm against the service.
embeddings = OpenAIEmbeddings(model=EMBED_MODEL_NAME)

# In-memory vector store holding the embedded chunks.
vector_store = InMemoryVectorStore(embeddings)
ids = vector_store.add_documents(documents=all_splits)

# Similarity query.
results = vector_store.similarity_search(
    "混凝土"
)

# Guard against an empty result list instead of failing with IndexError.
if results:
    print(results[0])
else:
    print("未检索到相关内容")
|
@@ -1,191 +0,0 @@
|
|||||||
import os
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
import warnings
|
|
||||||
import zipfile
|
|
||||||
|
|
||||||
from docx import Document
|
|
||||||
from docx.oxml.ns import nsmap
|
|
||||||
|
|
||||||
from Util import DocxUtil
|
|
||||||
|
|
||||||
# 抑制HTTPS相关警告
|
|
||||||
warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure')
|
|
||||||
warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host')
|
|
||||||
|
|
||||||
|
|
||||||
def extract_images_from_docx(docx_path, output_folder):
    """
    Copy the images referenced by drawings in a .docx file into ``output_folder``.

    The image prefix is taken from the last two ``_``-separated parts of the
    file name, e.g. ``小学数学教学中的若干问题_MATH_1.docx`` gives ``MATH_1``;
    images are written as ``MATH_1_<k><ext>`` with ``k`` counting from 1 in the
    order the drawings are met while walking paragraphs and runs.

    :param docx_path: path of the Word document
    :param output_folder: folder that receives the copied images (created if missing)
    :return: list of the image file names written, in document order
    """
    from xml.etree import ElementTree

    # Use the file name only, so underscores in directory names cannot shift
    # the parts; this matches how process_document derives the same prefix.
    stem = os.path.splitext(os.path.basename(docx_path))[0]
    prefix = "_".join(stem.split("_")[-2:])

    image_names = []

    # The archive is unpacked into a scratch directory inside the output folder.
    temp_dir = os.path.join(output_folder, "temp_docx")
    os.makedirs(temp_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Map relationship Id -> Target by parsing the XML, so the lookup does
        # not depend on attribute order or on the platform's default encoding.
        rels_path = os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels')
        targets = {
            rel.get('Id'): rel.get('Target', '')
            for rel in ElementTree.parse(rels_path).getroot()
        }

        doc = Document(docx_path)

        for paragraph in doc.paragraphs:
            for run in paragraph.runs:
                for element in run._element:
                    if not element.tag.endswith('drawing'):
                        continue

                    blip = element.find('.//a:blip', namespaces=nsmap)
                    if blip is None:
                        continue

                    # The blip's r:embed attribute is the relationship Id of the image.
                    target = targets.get(blip.get('{%s}embed' % nsmap['r']))
                    if not target:
                        continue

                    # Targets are resolved under the unpacked 'word' directory.
                    src_path = os.path.join(
                        temp_dir, 'word', target.replace('..', '').lstrip('/')
                    )
                    if not os.path.exists(src_path):
                        continue

                    ext = os.path.splitext(src_path)[1]
                    img_name = f"{prefix}_{len(image_names) + 1}{ext}"
                    shutil.copy(src_path, os.path.join(output_folder, img_name))
                    image_names.append(img_name)
    finally:
        # Always remove the scratch directory, even when extraction or parsing fails.
        shutil.rmtree(temp_dir, ignore_errors=True)

    return image_names
|
|
||||||
|
|
||||||
|
|
||||||
def _strip_heading_prefix(line, markers):
    """Strip leading heading markers and digits (and the whitespace around them) from ``line``."""
    while line:
        marker = next((m for m in markers if line.startswith(m)), None)
        if marker is not None:
            # Always consume the marker; leaving a bare marker in place would
            # make this loop spin forever.
            line = line[len(marker):]
        elif line[0].isdigit():
            line = line[1:]
        else:
            break
        line = line.strip()
    return line


def split_into_blocks(text):
    """
    Split ``text`` into numbered blocks, one per '问题X' / '话题X' heading line.

    A heading is a line that starts with '问题' or '话题' and has a digit among
    its first five characters. The marker and number are removed from the
    heading; the rest of that line is kept as the first line of the block.
    Lines before the first heading and empty lines are dropped.

    :param text: the full document text
    :return: list of ``(index, block_text)`` tuples, ``index`` counting from 1
    """
    heading_markers = ('问题', '话题')
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if line.startswith(heading_markers) and any(c.isdigit() for c in line[:5]):
            # A new heading closes the block collected so far.
            if in_block:
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_heading_prefix(line, heading_markers)

        # Only non-empty lines inside a block are kept.
        if in_block and line:
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return [(i + 1, block) for i, block in enumerate(blocks)]
|
|
||||||
|
|
||||||
|
|
||||||
def save_to_txt(content, file_path, mode='w'):
    """
    Write ``content`` to ``file_path`` as UTF-8 text.

    :param content: text to write
    :param file_path: destination file path
    :param mode: file open mode, ``'w'`` by default
    :return: ``True`` when the write succeeded, ``False`` when any exception
        was raised (the error is printed, not propagated)
    """
    try:
        with open(file_path, mode, encoding='utf-8') as out:
            out.write(content)
    except Exception as err:
        print(f"保存文件{file_path}时出错: {str(err)}")
        return False
    return True
|
|
||||||
|
|
||||||
|
|
||||||
class ImageReplacer:
    """
    Stateful replacement callback for ``re.sub``.

    Each call to :meth:`replace` uses up one entry of ``image_list``; once the
    list is exhausted, matches are returned unchanged.
    """

    def __init__(self, image_list):
        # Image names, consumed in order by replace().
        self.image_list = image_list
        # Position of the next image to consume.
        self.current_idx = 0

    def replace(self, match):
        """Return the replacement text for ``match``."""
        if self.current_idx >= len(self.image_list):
            # No images left: keep the placeholder as it is.
            return match.group()

        # NOTE(review): the replacement text is an empty string in this view, so
        # the placeholder is simply removed. It presumably once held a reference
        # built from image_list[current_idx] — confirm against the original file.
        replacement = ""
        self.current_idx += 1
        return replacement
|
|
||||||
|
|
||||||
|
|
||||||
def process_document(docx_file, txt_output_dir, img_output_dir):
    """
    Split one .docx file into per-block text files and extract its images.

    Images are extracted with ``extract_images_from_docx``, the document
    content is read through ``DocxUtil.get_docx_content_by_pandoc`` and split
    with ``split_into_blocks``. Each block is saved as
    ``<SUBJECT>_<N>_<block index>.txt`` where ``<SUBJECT>_<N>`` is the last two
    ``_``-separated parts of the docx file name.

    :param docx_file: path of the Word document
    :param txt_output_dir: folder that receives the text files
    :param img_output_dir: folder that receives the extracted images
    """
    # Extract the images.
    image_names = extract_images_from_docx(docx_file, img_output_dir)
    print(f"图片数量为:{len(image_names)}")

    # Read the content and split it into blocks.
    res = DocxUtil.get_docx_content_by_pandoc(docx_file)
    chunks = split_into_blocks(res)

    # Image placeholders in the text, handed one by one to the replacer.
    pattern = re.compile(r'【图片\d+】')
    replacer = ImageReplacer(image_names)

    # The output name prefix depends only on the docx file, so compute it once.
    docx_name = os.path.basename(docx_file).split('.')[0]
    subject_part = '_'.join(docx_name.split('_')[-2:])  # e.g. CHINESE_1

    saved_count = 0
    for number, block in chunks:
        # Split at the first newline rather than slicing by the stripped
        # length, which leaked part of the first line into the body whenever
        # that line had leading whitespace.
        first_raw, _, rest = block.partition("\n")
        first_line = first_raw.strip()
        content = pattern.sub(replacer.replace, rest.strip())

        output_file = os.path.join(txt_output_dir, f"{subject_part}_{number}.txt")
        if save_to_txt(f"{first_line}\n{content}", output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    txt_output_dir = "../Txt/"
    img_output_dir = "../static/Images/"
    source_dir = "../static/Txt/"

    # Start from empty output directories: remove what is there, then recreate.
    if os.path.exists(txt_output_dir):
        shutil.rmtree(txt_output_dir)
    if os.path.exists(img_output_dir):
        shutil.rmtree(img_output_dir)
    os.makedirs(txt_output_dir, exist_ok=True)
    os.makedirs(img_output_dir, exist_ok=True)

    # Process every .docx in the source directory.
    for filename in os.listdir(source_dir):
        # Skip anything that is not a Word document, and Word's "~$" lock
        # files, which are not valid zip archives.
        if not filename.lower().endswith(".docx") or filename.startswith("~$"):
            continue
        print("正在处理文件:" + filename)
        # The pipeline needs the full path of the file.
        process_document(os.path.join(source_dir, filename), txt_output_dir, img_output_dir)
|
|
69
dsSchoolBuddy/Test/G2_TeachingStudent.py
Normal file
69
dsSchoolBuddy/Test/G2_TeachingStudent.py
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
from Util import LlmUtil
|
||||||
|
|
||||||
|
|
||||||
|
def get_system_prompt(topic="勾股定理"):
    """
    Return the system prompt for the guided-teaching assistant.

    :param topic: the subject the first rule tells the model to ask the
        student about; defaults to the topic the prompt originally hard-coded
    :return: the prompt text
    """
    return f"""
你是一位平易近人且教学方法灵活的教师,通过引导学生自主学习来帮助他们掌握知识。

严格遵循以下教学规则:
1. 首先了解学生情况:在开始讲解前,询问学生的年级水平和对{topic}的了解程度。
2. 基于现有知识构建:将新思想与学生已有的知识联系起来。
3. 引导而非灌输:使用问题、提示和小步骤,让学生自己发现答案。
4. 检查和强化:在讲解难点后,确认学生能够重述或应用这些概念。
5. 变化节奏:混合讲解、提问和互动活动,让教学像对话而非讲座。

最重要的是:不要直接给出答案,而是通过合作和基于学生已有知识的引导,帮助学生自己找到答案。
"""
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_chat_history():
    """Create a new conversation history whose only entry is the system prompt."""
    system_message = {"role": "system", "content": get_system_prompt()}
    return [system_message]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Conversation history, seeded with the system prompt.
    chat_history = initialize_chat_history()

    # Welcome message.
    print("教师助手已启动。输入 'exit' 或 '退出' 结束对话。")
    print("你可以开始提问了,例如: '讲解一下勾股定理的证明'")

    # Multi-turn loop.
    while True:
        try:
            # Strip so that " exit" or a trailing space still ends the session.
            user_input = input("\n你: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave quietly instead of with a traceback.
            print("\n对话已结束。")
            sys.exit(0)

        if user_input.lower() in ('exit', '退出'):
            print("对话已结束。")
            sys.exit(0)

        # Nothing typed: ask again rather than sending an empty request.
        if not user_input:
            continue

        chat_history.append({"role": "user", "content": user_input})

        print("\n教师助手:")
        try:
            # NOTE(review): only the current input and the system prompt are
            # sent; chat_history is kept locally and never reaches the model,
            # so each turn is answered without earlier context. Confirm whether
            # LlmUtil.get_llm_response can accept the history.
            response_content = LlmUtil.get_llm_response(
                user_input,
                system_prompt=get_system_prompt()
            )
        except Exception as e:
            print(f"发生错误: {str(e)}")
            # Drop the user message just added so the user can retry.
            chat_history.pop()
        else:
            print(response_content)
            chat_history.append({"role": "assistant", "content": response_content})
|
Reference in New Issue
Block a user