import re
import ssl
import urllib # 制定url,获取网页数据
import urllib.request
# xwlt是一个帮助我们写入一个excel表的库
import xlwt
from bs4 import BeautifulSoup
# 忽略https证书
# Bypass HTTPS certificate verification for all urllib requests in this module.
ssl._create_default_https_context = ssl._create_unverified_context
# Movie title
findTitle = re.compile(r'<span class="title">(.*?)</span>', re.S)
# (early draft of the link pattern, kept for reference)
# findLink = r'<a href="(.*?)">'
# Detail-page link
findLink = re.compile(r'<a href="(.*?)">')
# Poster image URL
findImg = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S lets '.' match newlines
# Other content (the info paragraph of each entry)
findOther = re.compile(r'<p class="">(.*)</p>', re.S)  # re.S lets '.' match newlines
# Average rating
findRate = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of ratings ("...人评价")
findCom = re.compile(r'<span>(\d*?)人评价</span>')
# One-line description / quote
findDes = re.compile(r'<span class="inq">(.*?)</span>')
# Intended output filename.  NOTE(review): currently unused — saveExcel()
# hard-codes 'movie.xls'; confirm which name is wanted.
savePath = "豆瓣电影Top250.xls"
# Accumulator for all scraped rows; mutated by resolving().
dataList = []
def getData(baseUrl, pages=10, page_size=25):
    """Fetch and parse *pages* listing pages starting from *baseUrl*.

    Args:
        baseUrl: URL prefix ending in 'start=' — the numeric page offset
            is appended to it.
        pages: number of listing pages to fetch (default 10, the original
            hard-coded value; with page_size 25 that covers the top 250).
        page_size: entries per listing page (default 25).

    Returns:
        The module-level ``dataList``, extended in place by resolving()
        with one row per movie.
    """
    for page in range(pages):
        url = baseUrl + str(page * page_size)
        html = askUrl(url)
        # Parse this page's HTML and accumulate rows into dataList.
        resolving(html)
    return dataList
# Request one page of data
def askUrl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    On a urllib error — HTTP status error or a plain network failure —
    the error's code/reason (when present) is printed and an empty
    string is returned, so the caller can continue with the next page.
    """
    html = ""
    headers = {
        # Pretend to be a desktop browser; sites often reject urllib's default UA.
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        # Context manager guarantees the connection is closed
        # (the original leaked the response object).
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("utf-8")
    # URLError is the base class of HTTPError, so this also covers
    # network-level failures the original HTTPError-only clause missed.
    except urllib.error.URLError as e:
        if hasattr(e, "code"):   # HTTP status code, when it is an HTTPError
            print(e.code)
        if hasattr(e, "reason"): # human-readable failure reason
            print(e.reason)
    return html
# Parse one listing page
def resolving(html):
    """Parse one listing page's HTML and append one row per movie to dataList.

    Each row is: [title, other title, link, image, rating, rating count,
    description] — the non-title fields are extracted via findRule() with
    the module-level compiled patterns.

    Returns the module-level ``dataList`` (mutated in place).
    """
    bs = BeautifulSoup(html, "html.parser")
    for item in bs.find_all("div", class_="item"):
        data = []
        item = str(item)
        titles = re.findall(findTitle, item)
        # Movies usually carry a Chinese and a foreign title; always emit
        # exactly two title columns, padding with a space when missing.
        if len(titles) >= 2:
            data.append(titles[0])
            data.append(titles[1])
        else:
            # Guard: an entry with no title span at all no longer raises
            # IndexError (the original indexed titles[0] unconditionally).
            data.append(titles[0] if titles else ' ')
            data.append(' ')
        data.append(findRule(findLink, item))
        data.append(findRule(findImg, item))
        data.append(findRule(findRate, item))
        data.append(findRule(findCom, item))
        data.append(findRule(findDes, item))
        dataList.append(data)
    return dataList
# Save the scraped data to an Excel workbook
def saveExcel(dataList, path='movie.xls'):
    """Write the scraped rows to an .xls workbook.

    Args:
        dataList: list of rows, each a list of cell values in the order
            of the header columns below.
        path: output filename (default 'movie.xls', the original
            hard-coded value; pass ``savePath`` to use the module constant).
    """
    # Create the workbook with UTF-8 encoding so Chinese text survives.
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet('豆瓣电影')
    # Header row.
    col = ("标题", "其他标题", "链接", "图片", "评分", "评价人数", "描述")
    for i, heading in enumerate(col):
        worksheet.write(0, i, heading)
    # Data rows start at row 1, just below the header.
    for row, item in enumerate(dataList, start=1):
        for j, value in enumerate(item):
            worksheet.write(row, j, value)
    workbook.save(path)
def findRule(rule, msg):
    """Return the first match of compiled pattern *rule* in *msg*, or ' '.

    Bug fix: the original returned the whole re.findall() list, so every
    cell handed to xlwt was a Python list it cannot serialize; the
    commented-out guard showed the intended single-value behavior, which
    is restored here.  (Also avoids shadowing the builtin ``list``.)
    """
    matches = re.findall(rule, msg)
    return matches[0] if matches else " "
def start():
    """Entry point: scrape all listing pages, then dump the rows to Excel."""
    getData("https://movie.douban.com/top250?start=")
    saveExcel(dataList)


if __name__ == "__main__":
    # Run the scrape only when executed as a script, not on import
    # (the original called start() unconditionally at module level).
    start()
# Blog-post footer accidentally captured with the code (commented out so the
# file remains valid Python):
# python之爬取豆瓣Top250数据
# 最新推荐文章于 2023-09-17 22:32:58 发布