# dsProject/dsRag/Tools/T2_ImportTxt.py

import datetime
import logging
import re

from elasticsearch import Elasticsearch
from tqdm import tqdm

from Config.Config import ES_CONFIG
from Util.EmbeddingUtil import text_to_embedding

# Module-level logger for this script; records at INFO and above are emitted.
logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Console handler that renders every record with the formatter above.
handler = logging.StreamHandler()
handler.setFormatter(formatter)

logger.addHandler(handler)
logger.setLevel(logging.INFO)

# A paragraph break is a newline, optional whitespace, then another newline.
# Allowing whitespace in between covers "blank" lines that still contain
# spaces, tabs, full-width spaces or the '\r' of a CRLF line ending.
_PARAGRAPH_BREAK = re.compile(r'\n\s*\n')


def split_paragraphs(text):
    """Split text into paragraphs separated by one or more blank lines.

    Args:
        text: The full text to split. ``None`` or an empty string yields an
            empty list.

    Returns:
        A list of paragraphs with surrounding whitespace stripped. Chunks
        that are empty after stripping are dropped.
    """
    if not text:
        return []

    stripped_chunks = (chunk.strip() for chunk in _PARAGRAPH_BREAK.split(text))
    return [chunk for chunk in stripped_chunks if chunk]

def save_to_es(text):
    """Embed a paragraph and index it into Elasticsearch.

    The text is vectorized with ``text_to_embedding``. The text, its vector,
    the current local time in ISO format and an ``'analyzer'`` field are
    written to the ``knowledge_base`` index; the text alone is written to
    the ``raw_texts`` index.

    Uses the module-level ``es`` client, which this file only creates when it
    is run as a script.

    Args:
        text: The paragraph to store.

    Returns:
        True when both documents were indexed. False when no embedding was
        produced or when indexing raised; failures are logged, not raised.
    """
    vector = text_to_embedding(text)
    if vector is None:
        logger.warning("跳过无法生成向量的文本: %s", text)
        return False

    doc = {
        'text': text,
        'vector': vector,
        'timestamp': datetime.datetime.now().isoformat(),
        'analyzer': 'ik_smart',
    }

    try:
        # `document=` is the current keyword for the payload; `body=` is
        # deprecated in the 8.x client.
        es.index(index='knowledge_base', document=doc)
        es.index(index='raw_texts', document={'raw_text': text})
    except Exception as e:
        # Best-effort import: one failing paragraph must not abort the run,
        # but keep the traceback so the failure can be diagnosed.
        logger.exception("保存文本到ES失败: %s", e)
        return False

    return True


def process_file(file_path):
    """Read a UTF-8 text file and store each of its paragraphs.

    Args:
        file_path: Path of the text file to import.

    Returns:
        The number of paragraphs that were actually saved.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    paragraphs = split_paragraphs(content)

    # Count real successes: save_to_es skips paragraphs without an embedding
    # and swallows indexing errors, so len(paragraphs) would overstate it.
    saved = 0
    for paragraph in tqdm(paragraphs, desc='处理进度', unit='段'):
        if save_to_es(paragraph):
            saved += 1

    skipped = len(paragraphs) - saved
    if skipped:
        logger.warning("有%d个段落未能保存（共%d个）", skipped, len(paragraphs))

    print(f"\n处理完成，共保存{saved}个段落")
    return saved

if __name__ == '__main__':
    # Connection settings are taken from the project configuration.
    connection_settings = {
        'hosts': [ES_CONFIG['hosts']],
        'basic_auth': ES_CONFIG['basic_auth'],
        'verify_certs': ES_CONFIG['verify_certs'],
        'ssl_show_warn': ES_CONFIG['ssl_show_warn'],
    }

    # `es` is bound at module level on purpose: save_to_es looks it up as a
    # global when indexing documents.
    es = Elasticsearch(**connection_settings)

    # Relative path: resolved against the current working directory.
    file_path = '../Txt/人口变化趋势对云南教育的影响.txt'
    process_file(file_path)
'commit' 1 month ago			`import datetime`
			`import logging`
'commit' 1 month ago
			`from elasticsearch import Elasticsearch`
			`from tqdm import tqdm`
'commit' 1 month ago
			`from Config.Config import ES_CONFIG`
			`from Util.EmbeddingUtil import text_to_embedding`
'commit' 1 month ago
			`# 在文件开头添加logger配置`
			`logger = logging.getLogger(__name__)`
			`logger.setLevel(logging.INFO)`

			`# 创建控制台handler并设置格式`
			`handler = logging.StreamHandler()`
			`formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')`
			`handler.setFormatter(formatter)`
			`logger.addHandler(handler)`
'commit' 1 month ago
'commit' 1 month ago			`def split_paragraphs(text):`
			`"""按段落分割文本"""`
			`# 按两个换行符分割段落`
'commit' 1 month ago			`paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]`
'commit' 1 month ago			`return paragraphs`
'commit' 1 month ago
			`def save_to_es(text):`
			`"""保存向量化文本和原始文本到ES"""`
'commit' 1 month ago			`vector = text_to_embedding(text)`
'commit' 1 month ago
'commit' 1 month ago			`if vector is None:`
			`logger.warning(f"跳过无法生成向量的文本: {text}")`
			`return`
'commit' 1 month ago
			`doc = {`
			`'text': text,`
'commit' 1 month ago			`'vector': vector,`
			`'timestamp': datetime.datetime.now().isoformat(),`
			`'analyzer': 'ik_smart'`
'commit' 1 month ago			`}`
'commit' 1 month ago
			`try:`
			`es.index(index='knowledge_base', body=doc)`
			`es.index(index='raw_texts', body={'raw_text': text})`
			`except Exception as e:`
			`logger.error(f"保存文本到ES失败: {e}")`
'commit' 1 month ago
			`def process_file(file_path):`
			`"""处理文本文件"""`
			`with open(file_path, 'r', encoding='utf-8') as f:`
			`content = f.read()`

'commit' 1 month ago			`paragraphs = split_paragraphs(content)`
'commit' 1 month ago
'commit' 1 month ago			`for paragraph in tqdm(paragraphs, desc='处理进度', unit='段'):`
			`save_to_es(paragraph)`
'commit' 1 month ago
'commit' 1 month ago			`print(f"\n处理完成，共保存{len(paragraphs)}个段落")`
'commit' 1 month ago
			`if __name__ == '__main__':`
			`es = Elasticsearch(`
			`hosts=[ES_CONFIG['hosts']],`
			`basic_auth=ES_CONFIG['basic_auth'],`
			`verify_certs=ES_CONFIG['verify_certs'],`
			`ssl_show_warn=ES_CONFIG['ssl_show_warn']`
			`)`

'commit' 1 month ago			`file_path = '../Txt/人口变化趋势对云南教育的影响.txt'`
'commit' 1 month ago			`process_file(file_path)`