You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

90 lines
3.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from bs4 import BeautifulSoup
def search_sina_news(keyword='新闻', time='d', page=1, timeout=10):
    """Search Sina News and return the results found on one result page.

    Each collected result is also printed to stdout, together with a
    per-page count and a final total.

    :param keyword: search keyword, defaults to '新闻'
    :param time: time range filter: 'h' (within one hour), 'd' (within one
        day), 'w' (within one week), 'm' (within one month), or a year
        number such as 2023 to restrict results to that year; defaults to 'd'
    :param page: page number to fetch, 1 is the first page; defaults to 1
    :param timeout: seconds to wait for the HTTP response before
        ``requests`` raises a timeout error; defaults to 10
    :return: list of dicts with the keys 'order', 'title', 'link',
        'content', 'source' and 'time'; an empty list when the HTTP status
        code is not 200
    """
    # Hand the query to requests as a mapping so every value (in particular
    # the keyword) is percent-encoded; a raw keyword containing '&', '#' or
    # '=' would otherwise corrupt the query string.
    params = {
        'c': 'news',    # news search
        'adv': 1,       # 1 = advanced search mode, 0 = normal search mode
        'q': keyword,   # search keyword
        'time': time,   # time range (see the docstring)
        'size': 20,     # results per page, allowed range 10-20
        'page': page,   # page number, 1 = first page
    }
    # Without a timeout requests may block forever on an unresponsive server.
    response = requests.get(
        'https://search.sina.com.cn/news', params=params, timeout=timeout
    )
    response.encoding = 'utf-8'

    if response.status_code != 200:
        print('status_code!=200, 不能解析内容')
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    if page == 1:
        # The overall result summary is only printed for the first page.
        # find() returns None when the summary element is absent; report
        # that and carry on instead of failing the whole search.
        try:
            print(soup.find('div', 'l_v2').text)
        except AttributeError as e:
            print(e)

    result_blocks = soup.find_all('div', class_='box-result clearfix')
    print(f'{page}页抓取到的搜索结果数量为{len(result_blocks)}')

    results = []
    for block in result_blocks:
        # 'order' numbers the results actually collected, starting at 1.
        item = _parse_result_block(block, len(results) + 1)
        if item is None:
            continue
        results.append(item)
        # Echo the extracted record to the screen.
        print(item['order'], ". ", item['title'])
        print(item['link'])
        print(item['content'])
        print(item['source'], ' ', item['time'])
        print(" ")

    print(f'本次共抓取到的搜索结果共{len(results)}')
    return results


def _parse_result_block(block, order):
    """Turn one result block into a result dict, or None if it has no link."""
    anchor = block.find('a')
    if anchor is None:
        # Nothing usable without a title/link; skip instead of crashing.
        return None

    # The r-info div holds the summary text plus the source/time span.
    infos = block.find('div', class_='r-info')
    summary = infos.find('p', class_='content') if infos is not None else None
    source_span = infos.find('span') if infos is not None else None
    source, published = _split_source_time(
        source_span.text if source_span is not None else ''
    )
    return {
        'order': order,
        'title': anchor.text,
        'link': anchor.get('href', ''),
        'content': summary.text if summary is not None else '',
        'source': source,
        'time': published,
    }


def _split_source_time(text):
    """Split a 'source time' string into a (source, time) pair of strings."""
    parts = text.split()
    source = parts[0] if parts else ''
    # Three or more tokens: the time is two tokens ("yyyy-mm-dd hh:mm:ss"
    # style). Two tokens: a single relative time such as "XX小时前".
    # Fewer than two: no time is available, so return an empty string.
    published = ' '.join(parts[1:3])
    return source, published
# Example invocation
if __name__ == '__main__':
    # Fetch the first page of results from the last day for "NBA".
    results = search_sina_news(
        keyword='NBA',
        time='d',
        page=1,
    )
    # To fetch a different page, pass another page number, e.g.:
    # results = search_sina_news(keyword='NBA', time='d', page=2)