# QingLong/AI/Neo4j/N3_InputShiTi.py

# -*- coding: utf-8 -*-
import re
import hashlib
from py2neo import Graph
from openai import OpenAI
from Config import *


class KnowledgeGraph:
    """Associate one question with existing KnowledgePoint nodes in Neo4j.

    The question text is sent to a chat model, which is asked to reply with
    Cypher that MERGEs a ``Question`` node and links it to ``KnowledgePoint``
    nodes through ``TESTS_KNOWLEDGE`` relationships. The reply is sanitised by
    :meth:`_clean_cypher` before :meth:`run` hands it back to the caller.

    NOTE(review): the cleaned Cypher still originates from model output and is
    executed by the caller; only lines containing ``CREATE`` are filtered out.
    """

    # Question type -> keywords whose presence in the question text signals it.
    _CONTENT_KEYWORDS = {
        '行程问题': ['相遇', '相向而行', '追及', '速度', '路程'],
        '几何问题': ['面积', '体积', '周长', '三角形', '长方体'],
        '分数运算': ['分数', '百分比', '%', '分之'],
    }
    # First fenced code block (optionally tagged "cypher") in the model reply.
    _CODE_BLOCK_RE = re.compile(r"```(?:cypher)?\n(.*?)```", re.DOTALL)
    # The quoted value of an ``id: "..."`` property inside a MATCH line.
    _KP_ID_RE = re.compile(r"id: ['\"](.*?)['\"]")

    def __init__(self, content: str):
        """Connect to Neo4j, load the knowledge points and build the model client.

        Args:
            content: Full text of the question. The first 8 hex characters of
                its MD5 digest are used as the question ID.
        """
        self.content = content
        self.question_id = hashlib.md5(content.encode()).hexdigest()[:8]
        self.graph = Graph(NEO4J_URI, auth=NEO4J_AUTH)
        self.knowledge_points = self._get_knowledge_points()
        self.client = OpenAI(api_key=MODEL_API_KEY, base_url=MODEL_API_URL)

        # Debug output: how many knowledge points were loaded.
        print("加载知识点数量：", len(self.knowledge_points))

    def _get_knowledge_points(self) -> dict:
        """Load every KnowledgePoint as ``{id: name}``, keeping the ID's case.

        Returns:
            Mapping of knowledge point ID to name, or an empty dict when the
            query fails (the error is printed, not raised).
        """
        try:
            return {row['n.id']: row['n.name']
                    for row in self.graph.run("MATCH (n:KnowledgePoint) RETURN n.id, n.name")}
        except Exception as e:
            print("获取知识点失败：", str(e))
            return {}

    def _make_prompt(self) -> str:
        """Build the system prompt for knowledge point recognition.

        NOTE(review): only the first five knowledge points are listed as
        examples, although the prompt tells the model to use only listed
        ones. Confirm whether the full list should be sent.
        """
        examples = ", ".join(
            f"{kp_id}:{name}"
            for kp_id, name in list(self.knowledge_points.items())[:5]
        )

        return f"""你是一个数学专家，请分析题目考查的知识点，严格：
1. 只使用以下存在的知识点（格式：ID:名称）：
{examples}...
共{len(self.knowledge_points)}个可用知识点
2. 题目可能包含多个知识点，让仔细检查。
3. 按此格式生成Cypher：
MERGE (q:Question {{id: "{self.question_id}"}})
SET q.content = "题目内容"
WITH q
MATCH (kp:KnowledgePoint {{id: "知识点ID"}})
MERGE (q)-[:TESTS_KNOWLEDGE]->(kp)"""

    def _clean_cypher(self, code: str) -> str:
        """Sanitise the Cypher found in a model reply.

        Steps, in order:
          1. take the first fenced code block, strip ``//`` comments and blank
             lines, drop ``CREATE`` lines, move the Question MERGE to the front;
          2. detect question types from keywords in the question text;
          3. validate every ``MATCH (kp:KnowledgePoint ...)`` ID against the
             loaded knowledge points (case-insensitively) and rewrite it to
             the stored ID; invalid MATCH lines are dropped together with the
             relationship lines that depended on them;
          4. for each detected type not covered by a linked knowledge point,
             append a link to the shortest-named matching knowledge point;
          5. make sure ``WITH`` follows the Question MERGE, collapse repeated
             ``WITH`` lines and remove a trailing ``WITH``.

        Args:
            code: Raw text returned by the model.

        Returns:
            The cleaned Cypher, or ``""`` when no code block is found or any
            error occurs (the error is printed, not raised).
        """
        try:
            block = self._CODE_BLOCK_RE.search(code or "")
            if not block:
                print("未检测到Cypher代码块")
                return ""

            # Upper-cased ID -> ID exactly as stored, so the model may use any
            # case while the emitted MATCH still hits the real node.
            canonical_ids = {}
            for kp_id in self.knowledge_points:
                if kp_id is not None:
                    canonical_ids.setdefault(str(kp_id).upper(), kp_id)

            safe, has_question = self._collect_safe_lines(block.group(1))
            detected_types = self._detect_types()
            safe, linked_ids = self._resolve_knowledge_ids(safe, canonical_ids)
            safe.extend(self._supplement_lines(detected_types, linked_ids))
            return '\n'.join(self._normalize_with(safe, has_question))

        except Exception as e:
            print(f"清洗Cypher时发生错误: {str(e)}")
            return ""

    @staticmethod
    def _collect_safe_lines(block: str):
        """Strip comments/blanks, drop CREATE lines, put the Question MERGE first."""
        safe = []
        has_question = False
        for raw_line in block.split('\n'):
            clean_line = raw_line.split('//')[0].strip()
            if not clean_line:
                continue

            if 'CREATE' in clean_line.upper():
                print(f"阻止CREATE操作: {clean_line}")
                continue

            if 'MERGE (q:Question' in clean_line:
                has_question = True
                safe.insert(0, clean_line)
                continue

            safe.append(clean_line)
        return safe, has_question

    def _detect_types(self) -> list:
        """Return the question types whose keywords occur in the question text."""
        detected = []
        for pattern, keys in self._CONTENT_KEYWORDS.items():
            if any(k in self.content for k in keys):
                detected.append(pattern)
                print(f"检测到题目类型: {pattern}")
        return detected

    def _resolve_knowledge_ids(self, lines, canonical_ids):
        """Validate KnowledgePoint MATCH lines and rewrite their IDs to stored IDs."""
        kept = []
        linked_ids = []
        # True right after an invalid MATCH was dropped: lines that still use
        # ``(kp)`` would otherwise reference an unbound variable.
        drop_kp_links = False
        for line in lines:
            if 'MATCH (kp:KnowledgePoint' in line:
                match = self._KP_ID_RE.search(line)
                if not match:
                    print(f"无效的MATCH语句: {line}")
                    drop_kp_links = True
                    continue

                original_id = match.group(1)
                canonical = canonical_ids.get(original_id.upper())
                if canonical is None:
                    print(f"忽略无效知识点ID: {original_id}")
                    drop_kp_links = True
                    continue

                drop_kp_links = False
                # Replace only the captured ID, not other occurrences of the
                # same text elsewhere in the line.
                kept.append(line[:match.start(1)] + str(canonical) + line[match.end(1):])
                linked_ids.append(canonical)
                continue

            if drop_kp_links and '(kp)' in line:
                print(f"移除悬空的关联语句: {line}")
                continue

            kept.append(line)
        return kept, linked_ids

    def _supplement_lines(self, detected_types, linked_ids) -> list:
        """Build link statements for detected types no linked knowledge point covers."""
        extra = []
        for dtype in detected_types:
            type_exists = any(
                dtype in str(self.knowledge_points.get(kp_id, ''))
                for kp_id in linked_ids
            )
            if type_exists:
                continue

            # Skip knowledge points without an ID or a name.
            candidates = [
                (k, str(v)) for k, v in self.knowledge_points.items()
                if k is not None and v and dtype in str(v)
            ]
            if not candidates:
                print(f"未找到匹配的{dtype}知识点")
                continue

            # Shortest name wins; ties keep the order of the loaded mapping.
            target_id, target_name = min(candidates, key=lambda kv: len(kv[1]))
            print(f"补充知识点: {target_id} - {target_name}")
            extra.extend([
                "WITH q",
                f"MATCH (kp:KnowledgePoint {{id: \"{target_id}\"}})",
                "MERGE (q)-[:TESTS_KNOWLEDGE]->(kp)"
            ])
        return extra

    @staticmethod
    def _normalize_with(lines, has_question) -> list:
        """Fix WITH placement: after the Question MERGE, never repeated, never last."""
        lines = list(lines)
        if has_question:
            for i, line in enumerate(lines):
                if 'MERGE (q:Question' in line:
                    if i + 1 >= len(lines) or not lines[i + 1].startswith('WITH'):
                        lines.insert(i + 1, "WITH q")
                    break

        result = []
        for line in lines:
            if line.startswith('WITH') and result and result[-1].startswith('WITH'):
                continue
            result.append(line)

        # A query must not end with a WITH clause.
        while result and result[-1].startswith('WITH'):
            result.pop()
        return result

    def run(self) -> str:
        """Ask the model for Cypher linking this question and return it cleaned.

        Returns:
            The cleaned Cypher, or ``""`` when the model call fails or nothing
            usable is left after cleaning. Nothing is written to Neo4j here.
        """
        try:
            response = self.client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {
                        "role": "system",
                        "content": self._make_prompt()
                    },
                    {
                        "role": "user",
                        "content": f"题目内容：{self.content}\n请分析考查的知识点，只返回Cypher代码"
                    }
                ]
            )

            raw_cypher = response.choices[0].message.content
            cleaned_cypher = self._clean_cypher(raw_cypher)

            if cleaned_cypher:
                print("验证通过的Cypher：\n", cleaned_cypher)
                return cleaned_cypher
            return ""

        except Exception as e:
            print("知识点分析失败：", str(e))
            return ""

    def query_related_knowledge(self):
        """Print and return the knowledge points linked to this question.

        Returns:
            A list of rows with ``knowledge_id`` and ``knowledge_name``; an
            empty list when the query fails (the error is printed).
        """
        cypher = f"""
        MATCH (q:Question {{id: "{self.question_id}"}})-[:TESTS_KNOWLEDGE]->(kp)
        RETURN kp.id AS knowledge_id, kp.name AS knowledge_name
        """
        try:
            result = self.graph.run(cypher).data()
            if result:
                print(f"题目关联的知识点（{self.question_id}）：")
                for row in result:
                    print(f"- {row['knowledge_name']} (ID: {row['knowledge_id']})")
            else:
                print("该题目尚未关联知识点")
            return result
        except Exception as e:
            print("查询失败：", str(e))
            return []


# Split an exam paper into individual question blocks
def split_questions(file_path, limit=10):
    """Read a UTF-8 text file and split it into numbered question blocks.

    A block starts at ``<digits>.`` followed by whitespace and a ``【...】``
    tag, and runs until the next line that begins with ``<digits>.`` or the
    end of the file, so any text following the question (such as its answer)
    stays in the same block.

    Args:
        file_path: Path of the file to read.
        limit: Maximum number of blocks to return, counted from the start of
            the file (non-negative). ``None`` returns every block. Defaults
            to 10.

    Returns:
        A list of question blocks with surrounding whitespace stripped.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    pattern = r'(\d+\.\s+【.*?】.*?(?=\n\d+\.|\Z))'
    questions = [q.strip() for q in re.findall(pattern, content, re.DOTALL)]

    return questions if limit is None else questions[:limit]


# Manual test run: link the questions of the sample paper to knowledge points.
if __name__ == '__main__':
    # Read the sample paper and split it into question blocks.
    question_blocks = split_questions('Backup/ShiTi.md')

    for i, block in enumerate(question_blocks, 1):
        print(f"第{i}题块：")
        print("-" * 50)
        kg = KnowledgeGraph(block)
        cypher = kg.run()
        if not cypher:
            print("未生成有效Cypher")
            continue

        # The statement comes from model output, so it may still be rejected
        # by Neo4j; report the failure and carry on with the next question
        # instead of aborting the whole batch.
        try:
            kg.graph.run(cypher)
        except Exception as e:
            print("执行失败：", str(e))
            continue

        print("执行成功！关联知识点：")
        kg.query_related_knowledge()

    # Queries for inspecting a result in the Neo4j browser:
    #
    #   Basic visualisation:
    #     MATCH path=(q:Question {id: "07ece550"})-[:TESTS_KNOWLEDGE]->(kp)
    #     RETURN path
    #
    #   Styled visualisation:
    #     MATCH (q:Question {id: "07ece550"})-[:TESTS_KNOWLEDGE]->(kp)
    #     RETURN q, kp
    #   then click the style icon on the left of the browser and set:
    #     - Question node colour: orange
    #     - KnowledgePoint node colour: blue
    #     - relationship line width: 3px