'commit'

2025-08-19 07:45:15 +08:00
parent fa53b79d4e
commit 268f1f4172
5 changed files with 109 additions and 192 deletions
--- a/dsSchoolBuddy/Config/pycache/Config.cpython-310.pyc
+++ b/dsSchoolBuddy/Config/pycache/Config.cpython-310.pyc
--- a/dsSchoolBuddy/ElasticSearch/T1_RebuildMapping.py
+++ b/dsSchoolBuddy/ElasticSearch/T1_RebuildMapping.py
@@ -21,7 +21,7 @@ mapping = {
        "properties": {
            "embedding": {
                "type": "dense_vector",
-                "dims": 200,  # embedding维度为200
+                "dims": 1024,  # embedding维度为1024
                "index": True,
                "similarity": "l2_norm"  # 使用L2距离
            },
--- a/dsSchoolBuddy/ElasticSearch/T2_BgeM3.py
+++ b/dsSchoolBuddy/ElasticSearch/T2_BgeM3.py
@@ -0,0 +1,39 @@
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_openai import OpenAIEmbeddings
 from langchain_core.vectorstores import InMemoryVectorStore
 import os
 from Config.Config import EMBED_MODEL_NAME, EMBED_BASE_URL, EMBED_API_KEY
 # Export the embedding endpoint and key as OPENAI_* environment variables
 # before the embedding client is created.
 # NOTE(review): presumably OpenAIEmbeddings picks these up from the
 # environment - confirm against the installed langchain_openai version.
 os.environ["OPENAI_BASE_URL"] = EMBED_BASE_URL
 os.environ["OPENAI_API_KEY"] = EMBED_API_KEY
 # Load the document (path is relative to the current working directory).
 file_path = "../langchain/data/0001.pdf"
 loader = PyPDFLoader(file_path)
 docs = loader.load()
 print(f"文档页数：{len(docs)} 页")
 # Split the document into overlapping chunks.
 text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100, add_start_index=True
 )
 all_splits = text_splitter.split_documents(docs)
 # The original evaluated len(all_splits) and discarded the value (a notebook
 # leftover); report it so the script actually shows the chunk count.
 print(f"文档切片数：{len(all_splits)} 个")
 # Embedding model.
 embeddings = OpenAIEmbeddings(model=EMBED_MODEL_NAME)
 # In-memory vector store populated with the chunks.
 vector_store = InMemoryVectorStore(embeddings)
 ids = vector_store.add_documents(documents=all_splits)
 # Similarity query.
 results = vector_store.similarity_search(
    "混凝土"
 )
 # Guard against an empty result list (e.g. an empty PDF) instead of letting
 # results[0] raise IndexError.
 if results:
    print(results[0])
 else:
    print("未检索到相关内容")
--- a/dsSchoolBuddy/ElasticSearch/T2_SplitTxt.py
+++ b/dsSchoolBuddy/ElasticSearch/T2_SplitTxt.py
@@ -1,191 +0,0 @@
 import os
 import re
 import shutil
 import warnings
 import zipfile
 from docx import Document
 from docx.oxml.ns import nsmap
 from Util import DocxUtil
 # 抑制HTTPS相关警告
 warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure')
 warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host')
 def _load_relationship_targets(rels_path):
    """Return a dict mapping relationship Id -> Target read from a .rels XML file."""
    # Local import: only this helper needs the XML parser.
    import xml.etree.ElementTree as ElementTree

    # Parsing the XML (instead of substring-searching the raw text) is
    # independent of attribute order and of the platform default encoding.
    root = ElementTree.parse(rels_path).getroot()
    return {
        rel.get('Id'): rel.get('Target')
        for rel in root
        if rel.get('Target') is not None
    }


 def extract_images_from_docx(docx_path, output_folder):
    """
    Copy the pictures referenced by drawings in a docx into ``output_folder``.

    The docx is unzipped into ``<output_folder>/temp_docx``, the runs of
    ``doc.paragraphs`` are walked in document order, and every drawing whose
    relationship target exists on disk is copied out as
    ``<prefix>_<n><ext>`` where ``n`` starts at 1.

    :param docx_path: path of the Word document; the file name is expected to
        look like ``<title>_<SUBJECT>_<N>.docx`` (e.g. ``..._MATH_1.docx``)
    :param output_folder: folder that receives the extracted images
    :return: list of the generated image file names (names only, not full
        paths), in the order the drawings were encountered
    """
    # The prefix is the last two "_"-separated parts of the file name without
    # directory and extension, e.g. "MATH_1". Working on the base name keeps
    # underscores in directories or in the title from shifting the parts.
    stem = os.path.splitext(os.path.basename(docx_path))[0]
    prefix = "_".join(stem.split("_")[-2:])
    image_data = []
    # Temporary directory that holds the unzipped docx.
    temp_dir = os.path.join(output_folder, "temp_docx")
    os.makedirs(temp_dir, exist_ok=True)
    try:
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)
        # Relationship Id -> Target for the main document part.
        targets = _load_relationship_targets(
            os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels')
        )
        doc = Document(docx_path)
        embed_attr = '{%s}embed' % nsmap['r']
        img_counter = 1
        for paragraph in doc.paragraphs:
            for run in paragraph.runs:
                for element in run._element:
                    if not element.tag.endswith('drawing'):
                        continue
                    blip = element.find('.//a:blip', namespaces=nsmap)
                    if blip is None:
                        continue
                    target = targets.get(blip.get(embed_attr))
                    if target is None:
                        continue
                    # Resolve the target below the unzipped "word" directory.
                    src_path = os.path.join(
                        temp_dir, 'word', target.replace('..', '').lstrip('/')
                    )
                    # Targets that do not exist on disk are skipped.
                    if not os.path.exists(src_path):
                        continue
                    ext = os.path.splitext(src_path)[1]
                    img_name = f"{prefix}_{img_counter}{ext}"
                    shutil.copy(src_path, os.path.join(output_folder, img_name))
                    image_data.append(img_name)
                    img_counter += 1
    finally:
        # Always remove the unzipped copy, even when extraction fails midway.
        shutil.rmtree(temp_dir)
    return image_data
 def split_into_blocks(text):
    """
    Split text into numbered blocks, one per "问题X" / "话题X" header line.

    A header is a line that starts with "问题" or "话题" and has a digit among
    its first five characters. Lines before the first header are dropped, and
    empty lines are dropped everywhere. On a header line the leading
    "问题"/"话题" markers and digits are removed (the rest of the line is kept)
    and that remainder becomes the first line of the new block.

    :param text: full document text
    :return: list of ``(index, block_text)`` tuples, ``index`` starting at 1
    """
    prefixes = ('问题', '话题')
    blocks = []
    current_block = []
    in_block = False
    for line in text.splitlines():
        if line.startswith(prefixes) and any(c.isdigit() for c in line[:5]):
            if in_block:
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            # Peel off leading markers and digits. Every pass shortens the
            # line, so the loop always terminates - the original kept a bare
            # two-character "问题"/"话题" unchanged and spun forever.
            while line.startswith(prefixes) or line[:1].isdigit():
                line = line[2:] if line.startswith(prefixes) else line[1:]
                line = line.strip()
        if in_block and line:  # only keep non-empty lines
            current_block.append(line)
    if current_block:
        blocks.append('\n'.join(current_block))
    return [(i + 1, block) for i, block in enumerate(blocks)]
 def save_to_txt(content, file_path, mode='w'):
    """
    Write ``content`` to ``file_path`` as UTF-8 text.

    :param content: text to write
    :param file_path: destination file
    :param mode: mode passed to ``open`` (default ``'w'``)
    :return: True when the write succeeded; False when any exception was
        raised (the error is printed rather than propagated)
    """
    succeeded = False
    try:
        with open(file_path, mode, encoding='utf-8') as handle:
            handle.write(content)
    except Exception as err:
        print(f"保存文件{file_path}时出错: {str(err)}")
    else:
        succeeded = True
    return succeeded
 class ImageReplacer:
    """
    Stateful ``re.sub`` callback that swaps matches for Markdown image links.

    Each call to ``replace`` consumes the next name from ``image_list``; once
    the list is used up, further matches are returned unchanged.
    """

    def __init__(self, image_list):
        self.image_list = image_list
        # Index of the next image name to hand out.
        self.current_idx = 0

    def replace(self, match):
        """Return the link for the next image, or the matched text if none remain."""
        if self.current_idx >= len(self.image_list):
            return match.group()
        image_name = self.image_list[self.current_idx]
        self.current_idx += 1
        return f"![](./Images/{image_name})"
 def process_document(docx_file, txt_output_dir, img_output_dir):
    """
    Convert one docx into per-block text files plus extracted images.

    Steps: extract the images, read the document text through
    ``DocxUtil.get_docx_content_by_pandoc``, split it with
    ``split_into_blocks``, replace each ``【图片N】`` placeholder in a block body
    with a Markdown image link, and save every block as
    ``<SUBJECT>_<N>_<block index>.txt`` in ``txt_output_dir``.

    :param docx_file: path of the source docx
    :param txt_output_dir: directory that receives the text files
    :param img_output_dir: directory that receives the extracted images
    """
    # Extract the images.
    image_names = extract_images_from_docx(docx_file, img_output_dir)
    print(f"图片数量为：{len(image_names)}")
    # Read the document text and split it into blocks.
    res = DocxUtil.get_docx_content_by_pandoc(docx_file)
    chunks = split_into_blocks(res)
    pattern = re.compile(r'【图片\d+】')
    # One replacer is shared by all blocks, so placeholders are paired with
    # the extracted images purely by order of appearance.
    replacer = ImageReplacer(image_names)
    # Subject and number come from the file name, e.g. "CHINESE_1"; this is
    # the same for every block, so compute it once outside the loop.
    docx_name = os.path.basename(docx_file).split('.')[0]
    subject_part = '_'.join(docx_name.split('_')[-2:])
    saved_count = 0
    for block_index, block_text in chunks:
        # Split title and body at the first newline. The original sliced by
        # the length of the stripped title, which cut at the wrong position
        # whenever the first line carried leading whitespace.
        first_line, _, remainder = block_text.partition("\n")
        first_line = first_line.strip()
        content = pattern.sub(replacer.replace, remainder.strip())
        output_file = os.path.join(txt_output_dir, f"{subject_part}_{block_index}.txt")
        full_content = f"{first_line}\n{content}"
        if save_to_txt(full_content, output_file, mode='w'):
            saved_count += 1
    print(f"处理完成，共保存{saved_count}个文件到目录: {txt_output_dir}")
 if __name__ == "__main__":
    txt_output_dir = "../Txt/"
    img_output_dir = "../static/Images/"
    source_dir = "../static/Txt/"
    # Start from empty output directories: remove old results, then recreate.
    for out_dir in (txt_output_dir, img_output_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir, exist_ok=True)
    # Process every docx below the source directory, in a stable order.
    for filename in sorted(os.listdir(source_dir)):
        # Only real .docx files can be unzipped; skip everything else,
        # including Word's "~$" lock files, instead of crashing on them.
        if filename.startswith("~$") or not filename.lower().endswith(".docx"):
            print("跳过非docx文件：" + filename)
            continue
        print("正在处理文件：" + filename)
        # process_document needs the path including the directory.
        docx_path = os.path.join(source_dir, filename)
        process_document(docx_path, txt_output_dir, img_output_dir)
--- a/dsSchoolBuddy/Test/G2_TeachingStudent.py
+++ b/dsSchoolBuddy/Test/G2_TeachingStudent.py
@@ -0,0 +1,69 @@
 import sys
 from Util import LlmUtil
 def get_system_prompt():
    """Return the fixed system prompt that sets up the guiding-teacher persona."""
    system_prompt = """
    你是一位平易近人且教学方法灵活的教师，通过引导学生自主学习来帮助他们掌握知识。
    严格遵循以下教学规则：
    1. 首先了解学生情况：在开始讲解前，询问学生的年级水平和对勾股定理的了解程度。
    2. 基于现有知识构建：将新思想与学生已有的知识联系起来。
    3. 引导而非灌输：使用问题、提示和小步骤，让学生自己发现答案。
    4. 检查和强化：在讲解难点后，确认学生能够重述或应用这些概念。
    5. 变化节奏：混合讲解、提问和互动活动，让教学像对话而非讲座。
    最重要的是：不要直接给出答案，而是通过合作和基于学生已有知识的引导，帮助学生自己找到答案。
    """
    return system_prompt
 def initialize_chat_history():
    """Return a new chat history whose only entry is the system prompt message."""
    system_message = {
        "role": "system",
        "content": get_system_prompt()
    }
    return [system_message]
 if __name__ == "__main__":
    # Conversation history, seeded with the system prompt.
    # NOTE(review): the history is only recorded locally; each request below
    # sends just the latest user input plus the system prompt, so the model
    # sees no earlier turns. Confirm whether LlmUtil can accept the history.
    chat_history = initialize_chat_history()
    # Welcome messages.
    print("教师助手已启动。输入 'exit' 或 '退出' 结束对话。")
    print("你可以开始提问了，例如: '讲解一下勾股定理的证明'")
    # Multi-turn loop.
    while True:
        try:
            user_input = input("\n你: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt ends the session cleanly
            # instead of with a traceback.
            print("\n对话已结束。")
            sys.exit(0)
        # Nothing typed: ask again rather than sending an empty request.
        if not user_input:
            continue
        # Exit command (surrounding whitespace was already stripped).
        if user_input.lower() in ('exit', '退出'):
            print("对话已结束。")
            sys.exit(0)
        chat_history.append({"role": "user", "content": user_input})
        print("\n教师助手:")
        try:
            response_content = LlmUtil.get_llm_response(
                user_input,
                system_prompt=get_system_prompt()
            )
        except Exception as e:
            print(f"发生错误: {str(e)}")
            # Drop the user message that was just recorded so the user can retry.
            chat_history.pop()
        else:
            print(response_content)
            chat_history.append({"role": "assistant", "content": response_content})