|
|
import asyncio
|
|
|
import logging
|
|
|
import time
|
|
|
|
|
|
from utils.Database import *
|
|
|
from utils.DocxUtil import get_docx_content_by_pandoc
|
|
|
from utils.LightRagUtil import initialize_pg_rag
|
|
|
|
|
|
# Unused once the PG database backend is in use, but the current project code
# still requires a working directory to be passed, so one is defined here.
WORKING_DIR = "./output"
|
|
|
|
|
|
async def train_document_task():
    """Background task: poll for documents that have not been trained yet.

    After a 5-second start-up delay, the task queries
    ``t_ai_teaching_model_document`` once every 60 seconds for rows with
    ``is_deleted = 0`` and ``train_flag = 0`` and logs how many were found.
    The training step itself is currently disabled (it is kept below as
    commented-out code), so at present this task only reports the backlog.

    The loop never returns on its own. A failing query is logged and retried
    on the next cycle instead of terminating the task; cancellation is always
    propagated.
    """
    print("线程5秒后开始运行【监控是否有新的未训练的文档进行训练】")

    # asyncio.sleep (not time.sleep) so the event loop is not blocked.
    await asyncio.sleep(5)

    # Loop-invariant query, built once instead of on every iteration.
    no_train_document_sql: str = (
        " SELECT * FROM t_ai_teaching_model_document"
        " WHERE is_deleted = 0 and train_flag = 0"
        " ORDER BY create_time DESC"
    )

    num = 1  # Running count of polling rounds, used only in the log line.
    while True:
        logging.info("开始查询是否有未训练的文档:%s", num)
        num += 1

        try:
            no_train_document_result = await find_by_sql(no_train_document_sql, ())
        except asyncio.CancelledError:
            # Never swallow cancellation (it is an Exception subclass on
            # Python < 3.8), otherwise the task could not be stopped.
            raise
        except Exception:
            # A transient database error must not kill the monitor for good:
            # record the traceback and try again on the next cycle.
            logging.exception("查询未训练的文档失败,将在下一轮重试")
        else:
            if not no_train_document_result:
                logging.info("没有未训练的文档")
            else:
                logging.info("存在未训练的文档%s个", len(no_train_document_result))

                # --- Training step (currently disabled) ----------------------
                # NOTE(review): before re-enabling, check that
                #   * `rag` is bound before the `finally` clause reads it
                #     (it is not if initialize_pg_rag raises);
                #   * `execute_sql` does not need to be awaited - cannot tell
                #     from this file;
                #   * the UPDATE statements are parameterized rather than
                #     built by string concatenation.
                #
                # document = no_train_document_result[0]
                # print("开始训练文档:" + document["document_name"])
                # theme = await find_by_id("t_ai_teaching_model_theme", "id", document["theme_id"])
                # # Before training starts, update the training status
                # update_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = 1 WHERE id = " + str(document["id"])
                # execute_sql(update_sql)
                # document_name = document["document_name"] + "." + document["document_suffix"]
                # logging.info("开始训练文档:" + document_name)
                # workspace = theme["short_name"]
                # docx_name = document_name
                # docx_path = document["document_path"]
                # logging.info(f"开始处理文档:{docx_name}, 还有%s个文档需要处理!", len(no_train_document_result) - 1)
                # # Training code starts
                # try:
                #     rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace)
                #     # Get the content of the docx file
                #     content = get_docx_content_by_pandoc(docx_path)
                #     await rag.insert(input=content, file_paths=[docx_name])
                # finally:
                #     if rag:
                #         await rag.finalize_storages()
                # # Training finished, update the training status
                # update_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = 2 WHERE id = " + str(document["id"])
                # execute_sql(update_sql)

        # Wait between polls to avoid querying too often (once per minute).
        await asyncio.sleep(60)
|