Scrapy Full-Site Data Crawling

Full-site crawling here means covering every page of a paginated listing rather than only the start URL: the spider parses the first listing page from start_urls, then builds the URL of each subsequent page itself and yields a new scrapy.Request that reuses the same parse method as its callback until the last page is reached. The example below collects the title and publication time of every press release in the McDonald's China news center.
import scrapy
from Mcdonalds.items import McdonaldsItem


class McdonaldsSpider(scrapy.Spider):
    name = 'mcdonalds'
    allowed_domains = ['www.mcdonalds.com.cn']
    start_urls = ['https://www.mcdonalds.com.cn/index/McD/media-center/press-release']
    # URL template for the paginated news listing; page 1 is covered by start_urls
    base_url = 'https://www.mcdonalds.com.cn/news/corporate?page='
    page_num = 2

    def parse(self, response):
        # Each <li> under the news-center-list container is one press release
        news_list = response.xpath("//div[@class='news-center-list']/ul/li")
        for li in news_list:
            title = li.xpath('./h4/a/text()').extract_first()
            time = li.xpath('./time/text()').extract_first()
            item = McdonaldsItem()
            item['title'] = title
            item['time'] = time
            yield item
        # Full-site crawl: request the next listing page (up to page 40),
        # reusing parse() as the callback for every generated request
        if self.page_num <= 40:
            new_url = self.base_url + str(self.page_num)
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
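The spider imports McdonaldsItem from Mcdonalds/items.py, which is not shown in the original. Since parse() only fills the title and time fields, a minimal item definition, assuming no additional fields are needed, could look like this:

import scrapy


class McdonaldsItem(scrapy.Item):
    # Fields populated in McdonaldsSpider.parse()
    title = scrapy.Field()
    time = scrapy.Field()

The crawl can then be started from the project root with, for example, scrapy crawl mcdonalds -o news.csv to export the collected items to a CSV file.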