From f0cefdfdff14ab539ebccc167e3c36d9fa61d94c Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Tue, 19 Aug 2025 09:13:07 +0800
Subject: [PATCH] 'commit'

---
 dsSchoolBuddy/ElasticSearch/T3_InsertData.py | 130 +----------------
 .../ElasticSearch/Utils/EsSearchUtil.py      | 131 +++++++++++++++++-
 .../__pycache__/EsSearchUtil.cpython-310.pyc | Bin 0 -> 5366 bytes
 3 files changed, 131 insertions(+), 130 deletions(-)
 create mode 100644 dsSchoolBuddy/ElasticSearch/Utils/__pycache__/EsSearchUtil.cpython-310.pyc

diff --git a/dsSchoolBuddy/ElasticSearch/T3_InsertData.py b/dsSchoolBuddy/ElasticSearch/T3_InsertData.py
index 7bf50549..ecd207bd 100644
--- a/dsSchoolBuddy/ElasticSearch/T3_InsertData.py
+++ b/dsSchoolBuddy/ElasticSearch/T3_InsertData.py
@@ -1,140 +1,12 @@
-import hashlib  # hashing library, used for content-based deduplication
-import time
 import warnings

-from elasticsearch import Elasticsearch
-from langchain_core.documents import Document
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_openai import OpenAIEmbeddings
-from pydantic import SecretStr
-
-from Config import Config
+from ElasticSearch.Utils.EsSearchUtil import insert_long_text_to_es

 # Suppress HTTPS-related warnings
 warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure')
 warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host')


-def split_text_into_chunks(text: str, chunk_size: int = 200, chunk_overlap: int = 0) -> list:
-    """
-    Split a text into chunks.
-
-    Args:
-        text: the text to split
-        chunk_size: size of each chunk
-        chunk_overlap: overlap between adjacent chunks
-
-    Returns:
-        list: the list of text chunks
-    """
-    # Wrap the text in a Document object
-    docs = [Document(page_content=text, metadata={"source": "simulated_document"})]
-
-    # Split the document
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
-    )
-    all_splits = text_splitter.split_documents(docs)
-    print(f"Number of chunks after splitting: {len(all_splits)}")
-
-    return [split.page_content for split in all_splits]
-
-
-def insert_long_text_to_es(long_text: str, tags: list = None) -> bool:
-    """
-    Split a long text, embed each chunk, and insert the chunks into
-    Elasticsearch, deduplicating by a hash of each chunk's content.
-
-    Args:
-        long_text: the long text to insert
-        tags: optional list of tags
-
-    Returns:
-        bool: whether the insert succeeded
-    """
-    try:
-        # 1. Initialize the Elasticsearch connection
-        es = Elasticsearch(
-            hosts=Config.ES_CONFIG['hosts'],
-            basic_auth=Config.ES_CONFIG['basic_auth'],
-            verify_certs=False
-        )
-
-        # 2. Create the index if it does not exist yet
-        index_name = Config.ES_CONFIG['index_name']
-        if not es.indices.exists(index=index_name):
-            # Define the mapping
-            mapping = {
-                "mappings": {
-                    "properties": {
-                        "embedding": {
-                            "type": "dense_vector",
-                            "dims": 1024,  # adjust to match the actual embedding dimension
-                            "index": True,
-                            "similarity": "l2_norm"
-                        },
-                        "user_input": {"type": "text"},
-                        "tags": {
-                            "type": "object",
-                            "properties": {
-                                "tags": {"type": "keyword"},
-                                "full_content": {"type": "text"}
-                            }
-                        },
-                        "timestamp": {"type": "date"}
-                    }
-                }
-            }
-            es.indices.create(index=index_name, body=mapping)
-            print(f"Index '{index_name}' created successfully")
-
-        # 3. Split the text
-        text_chunks = split_text_into_chunks(long_text)
-
-        # 4. Prepare the tags
-        if tags is None:
-            tags = ["general_text"]
-
-        # 5. Get the current timestamp
-        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-
-        # 6. Create the embedding model
-        embeddings = OpenAIEmbeddings(
-            model=Config.EMBED_MODEL_NAME,
-            base_url=Config.EMBED_BASE_URL,
-            api_key=SecretStr(Config.EMBED_API_KEY)
-        )
-
-        # 7. Generate an embedding for each chunk and insert it
-        for i, chunk in enumerate(text_chunks):
-            # Use the MD5 hash of the chunk as the document ID
-            doc_id = hashlib.md5(chunk.encode('utf-8')).hexdigest()
-
-            # Skip this chunk if the document already exists
-            if es.exists(index=index_name, id=doc_id):
-                print(f"Chunk {i+1} already exists, skipping insert: {doc_id}")
-                continue
-
-            # Generate the embedding vector for the chunk
-            embedding = embeddings.embed_documents([chunk])[0]
-
-            # Prepare the document payload
-            doc = {
-                'tags': {"tags": tags, "full_content": long_text},
-                'user_input': chunk,
-                'timestamp': timestamp,
-                'embedding': embedding
-            }
-
-            # Insert the document into Elasticsearch
-            es.index(index=index_name, id=doc_id, document=doc)
-            print(f"Chunk {i+1} inserted successfully: {doc_id}")
-
-        return True
-    except Exception as e:
-        print(f"Failed to insert data: {e}")
-        return False
-
-
 def main():
     # Example 1: insert a single long text
     long_text = """Concrete is a widely used building material made by mixing cement, sand, aggregate, and water. It has high strength, durability, and good workability, and is widely used in civil engineering fields such as buildings, bridges, and roads.
diff --git a/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py b/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py
index 4e885056..033858d9 100644
--- a/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py
+++ b/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py
@@ -1,9 +1,16 @@
 import logging
 import warnings
-
+import hashlib  # hashing library, used for content-based deduplication
+import time
 from Config.Config import ES_CONFIG
 from ElasticSearch.Utils.ElasticsearchConnectionPool import ElasticsearchConnectionPool
+from elasticsearch import Elasticsearch
+from langchain_core.documents import Document
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from pydantic import SecretStr
+from Config import Config

 # Suppress HTTPS-related warnings
 warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure')
 warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host')
@@ -50,6 +57,128 @@ class EsSearchUtil:
             self.es_pool.release_connection(conn)


+
+def split_text_into_chunks(text: str, chunk_size: int = 200, chunk_overlap: int = 0) -> list:
+    """
+    Split a text into chunks.
+
+    Args:
+        text: the text to split
+        chunk_size: size of each chunk
+        chunk_overlap: overlap between adjacent chunks
+
+    Returns:
+        list: the list of text chunks
+    """
+    # Wrap the text in a Document object
+    docs = [Document(page_content=text, metadata={"source": "simulated_document"})]
+
+    # Split the document
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
+    )
+    all_splits = text_splitter.split_documents(docs)
+    print(f"Number of chunks after splitting: {len(all_splits)}")
+
+    return [split.page_content for split in all_splits]
+
+
+def insert_long_text_to_es(long_text: str, tags: list = None) -> bool:
+    """
+    Split a long text, embed each chunk, and insert the chunks into
+    Elasticsearch, deduplicating by a hash of each chunk's content.
+
+    Args:
+        long_text: the long text to insert
+        tags: optional list of tags
+
+    Returns:
+        bool: whether the insert succeeded
+    """
+    try:
+        # 1. Initialize the Elasticsearch connection
+        es = Elasticsearch(
+            hosts=Config.ES_CONFIG['hosts'],
+            basic_auth=Config.ES_CONFIG['basic_auth'],
+            verify_certs=False
+        )
+
+        # 2. Create the index if it does not exist yet
+        index_name = Config.ES_CONFIG['index_name']
+        if not es.indices.exists(index=index_name):
+            # Define the mapping
+            mapping = {
+                "mappings": {
+                    "properties": {
+                        "embedding": {
+                            "type": "dense_vector",
+                            "dims": 1024,  # adjust to match the actual embedding dimension
+                            "index": True,
+                            "similarity": "l2_norm"
+                        },
+                        "user_input": {"type": "text"},
+                        "tags": {
+                            "type": "object",
+                            "properties": {
+                                "tags": {"type": "keyword"},
+                                "full_content": {"type": "text"}
+                            }
+                        },
+                        "timestamp": {"type": "date"}
+                    }
+                }
+            }
+            es.indices.create(index=index_name, body=mapping)
+            print(f"Index '{index_name}' created successfully")
+
+        # 3. Split the text
+        text_chunks = split_text_into_chunks(long_text)
+
+        # 4. Prepare the tags
+        if tags is None:
+            tags = ["general_text"]
+
+        # 5. Get the current timestamp
+        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+
+        # 6. Create the embedding model
+        embeddings = OpenAIEmbeddings(
+            model=Config.EMBED_MODEL_NAME,
+            base_url=Config.EMBED_BASE_URL,
+            api_key=SecretStr(Config.EMBED_API_KEY)
+        )
+
+        # 7. Generate an embedding for each chunk and insert it
+        for i, chunk in enumerate(text_chunks):
+            # Use the MD5 hash of the chunk as the document ID
+            doc_id = hashlib.md5(chunk.encode('utf-8')).hexdigest()
+
+            # Skip this chunk if the document already exists
+            if es.exists(index=index_name, id=doc_id):
+                print(f"Chunk {i+1} already exists, skipping insert: {doc_id}")
+                continue
+
+            # Generate the embedding vector for the chunk
+            embedding = embeddings.embed_documents([chunk])[0]
+
+            # Prepare the document payload
+            doc = {
+                'tags': {"tags": tags, "full_content": long_text},
+                'user_input': chunk,
+                'timestamp': timestamp,
+                'embedding': embedding
+            }
+
+            # Insert the document into Elasticsearch
+            es.index(index=index_name, id=doc_id, document=doc)
+            print(f"Chunk {i+1} inserted successfully: {doc_id}")
+
+        return True
+    except Exception as e:
+        print(f"Failed to insert data: {e}")
+        return False
+
+
 # Add a main guard for testing
 if __name__ == "__main__":
     try:
diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/EsSearchUtil.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/EsSearchUtil.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f081a8f832016bf144f6e65ab75173a36646cd9
GIT binary patch
(binary literal of 5366 bytes omitted: compiled CPython 3.10 bytecode for EsSearchUtil)
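
After this patch, T3_InsertData.py no longer defines the pipeline itself; it only imports insert_long_text_to_es from EsSearchUtil. A minimal usage sketch of the refactored call path, assuming Config supplies the ES and embedding settings referenced in the patch (the sample text and tag value here are illustrative, not part of the commit):

    from ElasticSearch.Utils.EsSearchUtil import insert_long_text_to_es

    # Re-running is idempotent: each chunk's MD5 hash serves as its document
    # ID, so chunks that are already indexed are skipped, not duplicated.
    sample_text = (
        "Concrete is a widely used building material made by mixing "
        "cement, sand, aggregate, and water."
    )
    if insert_long_text_to_es(sample_text, tags=["construction_material"]):
        print("insert succeeded")
    else:
        print("insert failed")

Note one consequence of hashing only the chunk text: an identical chunk appearing in two different long texts is stored once, and its full_content field keeps whichever source text was inserted first.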