'commit'
This commit is contained in:
@@ -1,13 +1,14 @@
|
||||
import warnings
|
||||
import os
|
||||
import time
|
||||
from Config import Config
|
||||
import warnings
|
||||
|
||||
from elasticsearch import Elasticsearch
|
||||
from langchain_core.documents import Document
|
||||
from Util.VectorUtil import text_to_vector_db # 导入向量化工具函数
|
||||
from langchain_openai import OpenAIEmbeddings # 直接导入嵌入模型
|
||||
from pydantic import SecretStr # 用于包装API密钥
|
||||
|
||||
from Config import Config
|
||||
|
||||
# 抑制HTTPS相关警告
|
||||
warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure')
|
||||
warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host')
|
||||
@@ -96,36 +97,6 @@ def insert_long_text_to_es(long_text: str, tags: list = None) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def process_text_directory(txt_dir: str) -> None:
    """
    Insert every ``.txt`` file found directly in a directory into Elasticsearch.

    Each non-empty file is read as UTF-8 and passed to
    ``insert_long_text_to_es`` together with a single tag. The tag is built
    from the first two underscore-separated parts of the file name with the
    extension removed (``a_b_c.txt`` -> ``a_b``); names without an underscore
    are tagged ``uncategorized``.

    Args:
        txt_dir: Path of the directory that contains the text files.
    """
    # os.listdir returns entries in arbitrary order; sort so that runs are
    # reproducible and the progress output is easy to follow.
    for filename in sorted(os.listdir(txt_dir)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(txt_dir, filename)
        # A directory whose name happens to end in ".txt" would make open()
        # raise and abort the whole run, so only regular files are processed.
        if not os.path.isfile(filepath):
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            full_content = f.read()

        # Whitespace-only files carry nothing to index; treat them as empty.
        if not full_content.strip():
            print(f"跳过空文件: {filename}")
            continue

        print(f"正在处理文件: {filename}")

        # Derive the tag from the name without its extension; otherwise a
        # two-part name such as "a_b.txt" would yield the tag "a_b.txt".
        stem = os.path.splitext(filename)[0]
        name_parts = stem.split("_")
        if len(name_parts) >= 2:
            selected_tags = [name_parts[0] + "_" + name_parts[1]]
        else:
            selected_tags = ["uncategorized"]

        # Vectorize the text and insert it with the derived tag.
        insert_long_text_to_es(full_content, selected_tags)
|
||||
|
||||
|
||||
def main():
|
||||
# 示例1:插入单个长文本
|
||||
long_text = """混凝土是一种广泛使用的建筑材料,由水泥、砂、石子和水混合而成。它具有高强度、耐久性和良好的可塑性,被广泛应用于建筑、桥梁、道路等土木工程领域。
|
||||
|
Reference in New Issue
Block a user