|
|
|
@ -4,21 +4,34 @@
|
|
|
|
|
# WeChat crawler (微信爬爬猫) --- code walkthrough for scraping official-account articles
|
|
|
|
|
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
# Check the installed selenium version
|
|
|
|
|
pip show selenium
|
|
|
|
|
4.34.2
|
|
|
|
|
|
|
|
|
|
# Check the Chrome browser version
|
|
|
|
|
chrome://version/
|
|
|
|
|
138.0.7204.101 (正式版本) (64 位)
|
|
|
|
|
|
|
|
|
|
# Download the chromedriver package
|
|
|
|
|
https://googlechromelabs.github.io/chrome-for-testing/
|
|
|
|
|
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
|
|
|
|
|
"""
|
|
|
|
|
import asyncio
|
|
|
|
|
import datetime
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
import random
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
from Util.PostgreSQLUtil import init_postgres_pool
|
|
|
|
|
from Util.WxGzhUtil import init_wechat_browser, get_article_content
|
|
|
|
|
|
|
|
|
|
# Logging configuration: duplicate setup removed, only the following is kept
|
|
|
|
|
logger = logging.getLogger('WeiXinGongZhongHao')
|
|
|
|
|
logger = logging.getLogger('WxGzh')
|
|
|
|
|
logger.setLevel(logging.INFO)
|
|
|
|
|
|
|
|
|
|
# Make sure only one handler is added
|
|
|
|
@ -27,7 +40,6 @@ if not logger.handlers:
|
|
|
|
|
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
|
|
|
|
logger.addHandler(handler)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def get_wechat_sources():
|
|
|
|
|
"""从t_wechat_source表获取微信公众号列表"""
|
|
|
|
|
try:
|
|
|
|
@ -39,32 +51,13 @@ async def get_wechat_sources():
|
|
|
|
|
await pool.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
# 查看selenium版本
|
|
|
|
|
pip show selenium
|
|
|
|
|
4.34.2
|
|
|
|
|
|
|
|
|
|
# 查看Chrome浏览器版本
|
|
|
|
|
chrome://version/
|
|
|
|
|
138.0.7204.101 (正式版本) (64 位)
|
|
|
|
|
|
|
|
|
|
# 下载驱动包
|
|
|
|
|
https://googlechromelabs.github.io/chrome-for-testing/
|
|
|
|
|
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
|
|
|
|
|
"""
|
|
|
|
|
import time
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def is_article_exist(pool, article_url):
|
|
|
|
|
"""检查文章URL是否已存在数据库中"""
|
|
|
|
|
try:
|
|
|
|
|
async with pool.acquire() as conn:
|
|
|
|
|
row = await conn.fetchrow('''
|
|
|
|
|
SELECT 1
|
|
|
|
|
FROM t_wechat_articles
|
|
|
|
|
WHERE url = $1 LIMIT 1
|
|
|
|
|
SELECT 1 FROM t_wechat_articles WHERE url = $1 LIMIT 1
|
|
|
|
|
''', article_url)
|
|
|
|
|
return row is not None
|
|
|
|
|
except Exception as e:
|
|
|
|
|