Basic information:
Site URL: https://news.sina.com.cn/china/
We want to crawl the first five pages of news from this channel, collecting each article's title, publication time, and body text, and save the results in JSON format.
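The script does not parse the channel page itself; it pulls the article links from Sina's rolling-news interface (https://feed.sina.com.cn/api/roll/get) and then downloads every article URL it returns. A minimal sketch of that first step, using the same pageid/lid values as the full script below (the shortened user-agent string is just for illustration):

import requests

api = "https://feed.sina.com.cn/api/roll/get"
params = {'pageid': '121', 'lid': '1356', 'num': 20, 'page': 1}
headers = {'user-agent': 'Mozilla/5.0'}  # assumption: any ordinary browser UA is accepted

items = requests.get(api, params=params, headers=headers).json()['result']['data']
for item in items[:3]:
    print(item['title'], item['url'])  # each item carries the link the full script will fetch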
Code implementation:
import requests
from bs4 import BeautifulSoup
import json
all_news_data = []
for a in range(1, 6):  # first five pages of the list
    main_url = "https://feed.sina.com.cn/api/roll/get"
    # pageid/lid select the channel, num is the page size, page is the page number
    params = {'pageid': '121',
              'lid': '1356',
              'num': 20,
              'page': a}
    headers = {'user-agent':
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0'}
    resp = requests.get(url=main_url, params=params, headers=headers)
    json_data = resp.json()
    news_data = json_data['result']['data']  # list of news items on this page
    for data in news_data:
        url = data['url']  # article URL
        news_resp = requests.get(url=url, headers=headers)
        news_resp.encoding = 'utf-8'
        page = BeautifulSoup(news_resp.text, features="html.parser")
        title_tag = page.select_one('h1.main-title')
        body_tag = page.find("div", attrs={"class": "article"})
        date_tag = page.find("span", attrs={"class": "date"})
        if not (title_tag and body_tag and date_tag):
            continue  # skip pages that do not use the standard article layout
        news_title = title_tag.text.strip()
        news_body = body_tag.text.strip().replace('\n', '')  # drop line breaks in the body
        news_date = date_tag.text.strip()
        # store this article as a dictionary
        news_info = {
            "title": news_title,
            "body": news_body,
            "date": news_date
        }
        # add it to the list of all news items
        all_news_data.append(news_info)
with open("homework_2.json",'w',encoding="utf-8") as f:
json.dump(all_news_data,f,ensure_ascii=False, indent=4)
print("over!")