You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

55 lines
2.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import asyncio
import logging
import time
from utils.Database import *
from utils.DocxUtil import get_docx_content_by_pandoc
from utils.LightRagUtil import initialize_pg_rag
# Not actually used once the PostgreSQL-backed storage is in place, but the
# current project code still requires a value to be passed, so provide one.
WORKING_DIR = "./output"
# Background task: watch for new, not-yet-trained documents.
async def train_document_task():
    """Poll the database once a minute for documents awaiting training.

    Waits 5 seconds after being started, then loops forever. Each pass
    queries ``t_ai_teaching_model_document`` for rows with ``is_deleted = 0``
    and ``train_flag = 0`` and logs whether any were found. The training
    step itself is currently disabled and kept below as commented-out code.

    A failing query is logged and retried on the next pass instead of ending
    the task for good. Cancellation still propagates: on Python 3.8+
    ``asyncio.CancelledError`` derives from ``BaseException``, so the
    ``except Exception`` clause does not intercept it.
    """
    print("线程5秒后开始运行【监控是否有新的未训练的文档进行训练】")
    # Use asyncio.sleep rather than time.sleep so the event loop is not blocked.
    await asyncio.sleep(5)

    # Loop-invariant query text, built once.
    no_train_document_sql: str = " SELECT * FROM t_ai_teaching_model_document WHERE is_deleted = 0 and train_flag = 0 ORDER BY create_time DESC"
    num = 1  # running count of polling passes, only used in the log line
    while True:
        logging.info("开始查询是否有未训练的文档:%s", num)
        num += 1
        try:
            no_train_document_result = await find_by_sql(no_train_document_sql, ())
        except Exception:
            # Keep the poller alive across query failures; the next pass
            # retries after the regular wait below.
            logging.exception("查询未训练的文档失败")
        else:
            if not no_train_document_result:
                logging.info("没有未训练的文档")
            else:
                logging.info("存在未训练的文档%s", len(no_train_document_result))
                # --- Training step, currently disabled -----------------------
                # NOTE(review): before re-enabling, (1) `rag` is unbound in the
                # `finally` clause if initialize_pg_rag() raises, (2) the
                # UPDATE statements are built by string concatenation, and
                # (3) confirm whether execute_sql needs to be awaited.
                # document = no_train_document_result[0]
                # print("开始训练文档:" + document["document_name"])
                # theme = await find_by_id("t_ai_teaching_model_theme", "id", document["theme_id"])
                # # Before training starts, update the training status
                # update_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = 1 WHERE id = " + str(document["id"])
                # execute_sql(update_sql)
                # document_name = document["document_name"] + "." + document["document_suffix"]
                # logging.info("开始训练文档:" + document_name)
                # workspace = theme["short_name"]
                # docx_name = document_name
                # docx_path = document["document_path"]
                # logging.info(f"开始处理文档:{docx_name}, 还有%s个文档需要处理", len(no_train_document_result) - 1)
                # # Training code starts here
                # try:
                #     rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace)
                #     # Read the content of the docx file
                #     content = get_docx_content_by_pandoc(docx_path)
                #     await rag.insert(input=content, file_paths=[docx_name])
                # finally:
                #     if rag:
                #         await rag.finalize_storages()
                # # Training finished, update the training status
                # update_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = 2 WHERE id = " + str(document["id"])
                # execute_sql(update_sql)
        # Wait between passes to avoid querying too often (once per minute).
        await asyncio.sleep(60)