commit by Kalman.CHENG ☆

This commit is contained in:
chengminglong
2025-08-20 11:07:09 +08:00
parent a8917f879b
commit b3c22eccea
2 changed files with 74 additions and 58 deletions

View File

@@ -47,19 +47,24 @@ async def train_document_task():
logging.info(f"开始处理文档:{document_name}, 还有{len(no_train_document_result) - 1}个文档需要处理!")
# 训练代码开始
# content = get_docx_content_by_pandoc(document_path)
train_result = True
try:
# 注意默认设置使用NetworkX
rag = await initialize_rag(working_dir)
# 获取docx文件的内容
content = get_docx_content_by_pandoc(document_path)
await rag.ainsert(content, ids=[document_name], file_paths=[document_name])
logger.info(f"Inserted content from {document_name}")
if content is not None:
await rag.ainsert(content, ids=[document_name], file_paths=[document_name])
logger.info(f"Inserted content from {document_name}")
else:
train_result = False
except Exception as e:
logger.error(f"An error occurred: {e}")
finally:
await rag.finalize_storages()
# 训练结束,更新训练状态
update_document_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = 2 WHERE id = " + str(document["id"])
train_flag = "2" if train_result else "7"
update_document_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = " + train_flag + " WHERE id = " + str(document["id"])
await execute_sql(update_document_sql, ())
elif document["train_flag"] == 3:
update_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = 4 WHERE id = " + str(document["id"])
@@ -80,22 +85,30 @@ async def train_document_task():
await execute_sql(update_document_sql, ())
# 整体更新主题状态
select_document_sql: str = f"select train_flag, count(1) as train_count from t_ai_teaching_model_document where theme_id = {theme['id']} and is_deleted = 0 and train_flag in (0,1,2) group by train_flag"
select_document_sql: str = f"select train_flag, count(1) as train_count from t_ai_teaching_model_document where theme_id = {theme['id']} and is_deleted = 0 and train_flag in (0,1,2,7) group by train_flag"
select_document_result = await find_by_sql(select_document_sql, ())
train_document_count_map = {}
for item in select_document_result:
train_document_count_map[str(item["train_flag"])] = int(item["train_count"])
train_document_count_0 = train_document_count_map.get("0", 0)
train_document_count_1 = train_document_count_map.get("1", 0)
train_document_count_2 = train_document_count_map.get("2", 0)
train_document_count_7 = train_document_count_map.get("7", 0)
search_flag = 0
if train_document_count_2 > 0:
update_theme_sql: str = f"UPDATE t_ai_teaching_model_theme SET search_flag = 1, train_flag = 2 WHERE id = {theme['id']}"
search_flag = 1
# 训练未开始:初始化| 或者没有is_deleted=0的文档
if train_document_count_0 == 0 and train_document_count_1 == 0 and train_document_count_2 == 0 and train_document_count_7 == 0:
update_theme_sql: str = f"UPDATE t_ai_teaching_model_theme SET search_flag = {search_flag}, train_flag = 0 WHERE id = {theme['id']}"
await execute_sql(update_theme_sql, ())
else:
if train_document_count_1 > 0:
update_theme_sql: str = f"UPDATE t_ai_teaching_model_theme SET search_flag = 0, train_flag = 1 WHERE id = {theme['id']}"
await execute_sql(update_theme_sql, ())
else:
update_theme_sql: str = f"UPDATE t_ai_teaching_model_theme SET search_flag = 0, train_flag = 0 WHERE id = {theme['id']}"
await execute_sql(update_theme_sql, ())
# 训练进行中:单个文档训练中|同时存在训练完成+未训练的文档
if train_document_count_2 > 0 and (train_document_count_1 > 0 or train_document_count_7 > 0):
update_theme_sql: str = f"UPDATE t_ai_teaching_model_theme SET search_flag = {search_flag}, train_flag = 1 WHERE id = {theme['id']}"
await execute_sql(update_theme_sql, ())
# 训练已完成:所有is_deleted=0的文档都训练完成
if train_document_count_2 > 0 and train_document_count_0 == 0 and train_document_count_1 == 0 and train_document_count_7 == 0:
update_theme_sql: str = f"UPDATE t_ai_teaching_model_theme SET search_flag = {search_flag}, train_flag = 2 WHERE id = {theme['id']}"
await execute_sql(update_theme_sql, ())
# 添加适当的等待时间,避免频繁查询
await asyncio.sleep(120) # 每二分钟查询一次