import requests
from lxml import etree
# Target URL and request headers
url = 'https://www.qiushibaike.com/text/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'
}
# Fetch one joke's title, content, and URL from its article page
def get_page_content(url_page):
    response = requests.get(url_page, headers=headers)
    response.encoding = 'utf-8'
    result = etree.HTML(response.text)
    content = {}
    content['title'] = result.xpath('//*[@id="content"]/div/div[2]/h1/text()')[0].strip()
    content['url'] = url_page
    content['content'] = result.xpath('//*[@id="single-next-link"]/div/text()')[0].strip()
    return content
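# The [0] indexing above raises IndexError whenever an XPath query finds no
# match (e.g. a deleted article). A minimal guard, sketched here as an option;
# the helper name first_text is an assumption, not part of the original script:
def first_text(nodes, default=''):
    # Return the first stripped text node, or the default when nothing matched
    return nodes[0].strip() if nodes else default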
# Collect the links to the individual jokes on one listing page
def get_url_list(url):
    response = requests.get(url, headers=headers)
    result = etree.HTML(response.text)
    # The href values are relative (e.g. /article/123448116), so prepend the domain
    urls = result.xpath('//div[@id="content"]/div/div[2]/div/a/@href')
    url_lists = []
    for u in urls:
        url_lists.append('https://www.qiushibaike.com' + u)
    return url_lists
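# A gentler crawl loop, sketched as an optional alternative to the plain loop
# below: it pauses between requests so the crawler does not hammer the site.
# crawl_politely and the 1-second default delay are assumptions, not part of
# the original script.
import time

def crawl_politely(page_urls, delay=1.0):
    # Fetch every article page, sleeping `delay` seconds between requests
    results = []
    for page_url in page_urls:
        results.append(get_page_content(page_url))
        time.sleep(delay)
    return results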
if __name__ == '__main__':
    # The text section has 13 pages; build each page's URL in a loop
    lists = []
    for i in range(1, 14):
        url = 'https://www.qiushibaike.com/text/page/{}/'.format(i)
        lists += get_url_list(url)
    # Fetch and print every joke, passing each article URL through
    for url in lists:
        print(get_page_content(url))
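# Persisting the results, sketched as an optional follow-up: dump the scraped
# dicts to a JSON file instead of printing them. The save_results name and the
# qiushi.json path are assumptions, not part of the original script.
import json

def save_results(items, path='qiushi.json'):
    # ensure_ascii=False keeps the Chinese text readable in the output file
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(items, f, ensure_ascii=False, indent=2)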