# Post-patch state of the hunk at line 47 of
# dsLightRag/WxGzh/T2_GetArticleList.py, restored to normal source layout.


async def _url_exists(conn, article_url):
    """Return True when t_wechat_articles already holds a row with this URL.

    Runs on an already-acquired connection so callers can combine the lookup
    with further statements without borrowing a second connection.
    """
    row = await conn.fetchrow('''
        SELECT 1 FROM t_wechat_articles
        WHERE url = $1 LIMIT 1
    ''', article_url)
    return row is not None


async def is_article_exist(pool, article_url):
    """Check whether an article URL is already stored in the database.

    Args:
        pool: Connection pool; ``pool.acquire()`` must be usable with
            ``async with`` and yield a connection exposing ``fetchrow``.
        article_url: URL compared against ``t_wechat_articles.url``.

    Returns:
        True when a matching row exists. False when no row matches, and also
        when the lookup raises: the error is logged and treated as
        "not found" so a failing check does not interrupt the normal flow.
    """
    try:
        async with pool.acquire() as conn:
            return await _url_exists(conn, article_url)
    except Exception as e:
        logging.error(f"检查文章存在性失败: {e}")
        return False


async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id):
    """Insert one article into t_wechat_articles unless its URL is already stored.

    The existence lookup and the INSERT share a single pooled connection, so
    each call borrows from the pool once instead of twice.

    Args:
        pool: Connection pool; ``pool.acquire()`` must be usable with
            ``async with`` and yield a connection exposing ``fetchrow`` and
            ``execute``.
        article_title: Value for the ``title`` column.
        account_name: Value for the ``source`` column.
        article_url: Value for the ``url`` column; also the de-duplication key.
        publish_time: Value for the ``publish_time`` column.
        content: Value for the ``content`` column.
        id: Value for the ``source_id`` column. The name shadows the builtin
            but is kept so existing keyword callers keep working.

    Returns:
        None. Database errors are logged and swallowed (best effort).

    NOTE(review): the lookup and the INSERT are still two statements, so two
    concurrent writers could both pass the check. Ruling that out would need a
    UNIQUE constraint on ``url`` -- confirm against the table schema.
    """
    try:
        async with pool.acquire() as conn:
            try:
                already_stored = await _url_exists(conn, article_url)
            except Exception as e:
                # Fail open: a broken lookup must not prevent the save attempt.
                logging.error(f"检查文章存在性失败: {e}")
                already_stored = False

            if already_stored:
                logging.info(f"文章已存在,跳过保存: {article_url}")
                return

            await conn.execute('''
                INSERT INTO t_wechat_articles
                (title, source, url, publish_time, content, source_id)
                VALUES ($1, $2, $3, $4, $5, $6)
                ''', article_title, account_name, article_url,
                publish_time, content, id)
    except Exception as e:
        logging.error(f"保存文章失败: {e}")