This commit is contained in:
2025-08-19 08:16:58 +08:00
parent 21638bd322
commit 52369e2b21

View File

@@ -1,13 +1,14 @@
import warnings
import os import os
import time import time
from Config import Config import warnings
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from langchain_core.documents import Document from langchain_core.documents import Document
from Util.VectorUtil import text_to_vector_db # 导入向量化工具函数
from langchain_openai import OpenAIEmbeddings # 直接导入嵌入模型 from langchain_openai import OpenAIEmbeddings # 直接导入嵌入模型
from pydantic import SecretStr # 用于包装API密钥 from pydantic import SecretStr # 用于包装API密钥
from Config import Config
# 抑制HTTPS相关警告 # 抑制HTTPS相关警告
warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure')
warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host')
@@ -96,36 +97,6 @@ def insert_long_text_to_es(long_text: str, tags: list = None) -> bool:
return False return False
def process_text_directory(txt_dir: str) -> None:
    """
    Process every text file in a directory, vectorizing it and inserting it
    into Elasticsearch.

    Only directory entries whose name ends with ``.txt`` are read (as UTF-8).
    Files with empty content are reported and skipped. The single tag passed
    along with each file is the first two underscore-separated pieces of the
    file name joined by ``_``; a name containing no underscore is tagged
    ``uncategorized``.

    Args:
        txt_dir: Path of the directory that contains the text files.
    """
    for filename in os.listdir(txt_dir):
        # Guard clause: anything that is not a .txt entry is ignored.
        if not filename.endswith('.txt'):
            continue

        source_path = os.path.join(txt_dir, filename)
        with open(source_path, 'r', encoding='utf-8') as handle:
            text = handle.read()

        if not text:
            print(f"跳过空文件: {filename}")
            continue

        print(f"正在处理文件: {filename}")

        # Derive the tag from the file name. The split is applied to the full
        # name, so the second piece still carries the extension when the name
        # has exactly one underscore.
        pieces = filename.split("_")
        tags = [pieces[0] + "_" + pieces[1]] if len(pieces) >= 2 else ["uncategorized"]

        # Insert the text together with its tag.
        insert_long_text_to_es(text, tags)
def main(): def main():
# 示例1插入单个长文本 # 示例1插入单个长文本
long_text = """混凝土是一种广泛使用的建筑材料,由水泥、砂、石子和水混合而成。它具有高强度、耐久性和良好的可塑性,被广泛应用于建筑、桥梁、道路等土木工程领域。 long_text = """混凝土是一种广泛使用的建筑材料,由水泥、砂、石子和水混合而成。它具有高强度、耐久性和良好的可塑性,被广泛应用于建筑、桥梁、道路等土木工程领域。