main
HuangHai 1 week ago
parent 9ecaf90164
commit 76d0c09bdd

@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4"> <module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager"> <component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" /> <content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.10 (4)" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="D:\anaconda3\envs\py310" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
<component name="PyDocumentationSettings"> <component name="PyDocumentationSettings">

@ -3,5 +3,5 @@
<component name="Black"> <component name="Black">
<option name="sdkName" value="D:\anaconda3\envs\lightrag" /> <option name="sdkName" value="D:\anaconda3\envs\lightrag" />
</component> </component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (4)" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="D:\anaconda3\envs\py310" project-jdk-type="Python SDK" />
</project> </project>

File diff suppressed because one or more lines are too long

@ -4,21 +4,34 @@
# 微信爬爬猫---公众号文章抓取代码分析 # 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268 # https://blog.csdn.net/yajuanpi4899/article/details/121584268
"""
# 查看selenium版本
pip show selenium
4.34.2
# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) 64
# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
import asyncio import asyncio
import datetime import datetime
import json import json
import logging import logging
import random import random
import re import re
import time
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
import requests import requests
from Util.PostgreSQLUtil import init_postgres_pool from Util.PostgreSQLUtil import init_postgres_pool
from Util.WxGzhUtil import init_wechat_browser, get_article_content from Util.WxGzhUtil import init_wechat_browser, get_article_content
# 删除重复的日志配置,只保留以下内容 # 删除重复的日志配置,只保留以下内容
logger = logging.getLogger('WeiXinGongZhongHao') logger = logging.getLogger('WxGzh')
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
# 确保只添加一个handler # 确保只添加一个handler
@ -27,7 +40,6 @@ if not logger.handlers:
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler) logger.addHandler(handler)
async def get_wechat_sources(): async def get_wechat_sources():
"""从t_wechat_source表获取微信公众号列表""" """从t_wechat_source表获取微信公众号列表"""
try: try:
@ -39,32 +51,13 @@ async def get_wechat_sources():
await pool.close() await pool.close()
"""
# 查看selenium版本
pip show selenium
4.34.2
# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) 64
# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
import time
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
async def is_article_exist(pool, article_url): async def is_article_exist(pool, article_url):
"""检查文章URL是否已存在数据库中""" """检查文章URL是否已存在数据库中"""
try: try:
async with pool.acquire() as conn: async with pool.acquire() as conn:
row = await conn.fetchrow(''' row = await conn.fetchrow('''
SELECT 1 SELECT 1 FROM t_wechat_articles WHERE url = $1 LIMIT 1
FROM t_wechat_articles
WHERE url = $1 LIMIT 1
''', article_url) ''', article_url)
return row is not None return row is not None
except Exception as e: except Exception as e:

Loading…
Cancel
Save