'commit'

1 week ago · be105894af
parent 351caa4b5b
commit be105894af
1 changed files with 82 additions and 12 deletions
--- a/dsLightRag/Test/TestCrawl.py
+++ b/dsLightRag/Test/TestCrawl.py
@ -1,6 +1,11 @@
 # 详解（一）Python + Selenium 批量采集微信公众号，搭建自己的微信公众号每日AI简报，告别信息焦虑
 # https://blog.csdn.net/k352733625/article/details/149222945
+
+# 微信爬爬猫---公众号文章抓取代码分析
+# https://blog.csdn.net/yajuanpi4899/article/details/121584268
+import datetime
 import logging
+import random
 import re

 import requests
@ -12,6 +17,10 @@ import requests
 # https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html
 # https://github.com/mozilla/geckodriver/releases

+# 3、Python爬虫实战系列：微信公众号文章爬取的5种技术方案总结及代码示例！
+# 方案5：微信公众号后台引用链接方式爬取
+# https://blog.csdn.net/Python_trys/article/details/146506009
+
 """
 # 查看selenium版本
 pip show selenium
@ -31,12 +40,14 @@ from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service as ChromeService
 import json

-
-
-
 if __name__ == '__main__':
    # 定义一个空的字典，存放cookies内容
-    post = {}
+    cookies = {}
+    # 设置headers
+    header = {
+        "HOST": "mp.weixin.qq.com",
+        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"
+    }
    # 用webdriver启动谷歌浏览器
    logging.info("启动浏览器，打开微信公众号登录界面")
    options = Options()
@ -50,22 +61,23 @@ if __name__ == '__main__':
    # # 拿手机扫二维码！
    logging.info("请拿手机扫码二维码登录公众号")
    time.sleep(20)
+
    # 重新载入公众号登录页，登录之后会显示公众号后台首页，从这个返回内容中获取cookies信息
    driver.get('https://mp.weixin.qq.com/')
    # 获取cookies
    cookie_items = driver.get_cookies()
    # 获取到的cookies是列表形式，将cookies转成json形式并存入本地名为cookie的文本中
    for cookie_item in cookie_items:
-        post[cookie_item['name']] = cookie_item['value']
+        cookies[cookie_item['name']] = cookie_item['value']

-    if "slave_sid" not in post:
+    if "slave_sid" not in cookies:
        logging.info("登录公众号失败，获取cookie失败")
        exit()
-    cookies = json.dumps(post)
+    # cookies stays a plain dict (no json.dumps): it is passed as-is to the cookies= argument of requests.get below

    # 方法3：使用requests库发送请求获取重定向URL
    url = 'https://mp.weixin.qq.com'
-    response = requests.get(url=url, allow_redirects=False, cookies=post)
+    response = requests.get(url=url, allow_redirects=False, cookies=cookies)
    if 'Location' in response.headers:
        redirect_url = response.headers.get("Location")
        print("重定向URL:", redirect_url)
@ -75,7 +87,65 @@ if __name__ == '__main__':
            print("获取到的token:", token)
            logging.info("微信token:" + token)

-    #url = 'https://mp.weixin.qq.com'
-    #response = requests.get(url=url, allow_redirects=False, cookies=cookies)
-    #token = re.findall(r'token=(\d+)', str(response.headers.get("Location")))[0]
-    #logging.info("微信token:" + token)
+    article_urls = []
+    gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
+    for account in gzlist:
+        account_name = account["account_name"]
+        account_id = account["account_id"]
+        # Step 1: search for the account by name to obtain its fakeid (needed by the article-list call).
+        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
+        query_id = {
+            'action': 'search_biz',
+            'token': token,
+            'lang': 'zh_CN',
+            'f': 'json',
+            'ajax': '1',
+            'random': random.random(),
+            'query': account_name,
+            'begin': '0',
+            'count': '5'
+        }
+        search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id, timeout=10)
+        # Skip the account instead of crashing when the reply carries no 'list' or an empty one.
+        matches = search_response.json().get('list')
+        if not matches:
+            logging.warning("searchbiz returned no result for %s", account_name)
+            continue
+        fakeid = matches[0].get('fakeid')  # first search hit
+        logging.info("fakeid:%s", fakeid)
+        # Step 2: request the account's article list using the token and fakeid.
+        appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
+        query_id_data = {
+            'token': token,
+            'lang': 'zh_CN',
+            'f': 'json',
+            'ajax': '1',
+            'random': random.random(),
+            'action': 'list_ex',
+            'begin': '0',  # paging offset; grows by 5 per page
+            'count': '5',
+            'query': '',
+            'fakeid': fakeid,
+            'type': '9'
+        }
+        query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data, timeout=10)
+        articles = query_fakeid_response.json().get('app_msg_list')
+        if not articles:
+            logging.warning("appmsg returned no articles for %s", account_name)
+            continue
+        article = articles[0]  # only the first entry is collected, as a sample
+        new_article = {
+            'title': article.get('title'),
+            'article_url': article.get('link'),
+            'account_id': account_id,
+            'account_name': account_name,
+            'publish_time': datetime.datetime.fromtimestamp(int(article.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
+            'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        }
+        logging.info("new_article:%s", new_article)  # %s placeholder so the dict is actually rendered
+        article_urls.append(article.get('link'))
+        time.sleep(2)
+
+    for article_url in article_urls:
+        print("正在爬取文章：" + article_url)
+