Scraping Wallpaper Abyss wallpapers with Python

This post shows how to scrape wallpapers from the AlphaCoders site (Wallpaper Abyss) with Python's requests and BeautifulSoup libraries (selenium is imported in the script but never actually used). Request headers are set to defeat the site's anti-hotlinking check, each category listing is walked page by page, and the images are saved to a local directory. A ThreadPoolExecutor thread pool crawls the categories concurrently to speed things up.

import os
import re
import time
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

# imported in the original post but not actually used anywhere below
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

rootrurl = 'https://wall.alphacoders.com/'
save_dir = 'D:/estimages/'

headers = {
    'Referer': rootrurl,
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive'
}  # request headers that make the crawler look like an ordinary browser

def saveOneImg(dir, img_url):
    # rebuild the headers for every image, using the image URL itself as the
    # Referer; a fresh Referer avoids the HTTP 403 responses triggered by the
    # site's anti-hotlinking check
    new_headers = {
        'Referer': img_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive'
    }

    try:
        img = requests.get(img_url, headers=new_headers)  # fetch the actual image
        if img.status_code == 200:
            # name the file after the last URL path segment, minus any query string
            with open('{}/{}.jpg'.format(dir, img_url.split('/')[-1].split('?')[0]), 'wb') as jpg:
                jpg.write(img.content)
            print(img_url)
            return True
        else:
            return False
    except Exception as e:
        print('exception occurs: ' + img_url)
        print(e)
        return False
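
One caveat worth noting: requests.get() has no timeout by default, so a single stalled download can hang its worker thread forever. A minimal hardened variant might look like this (saveOneImgSafe and the 20-second timeout are my additions, not part of the original script):

def saveOneImgSafe(dir, img_url, timeout=20):
    # same idea as saveOneImg, but a stalled connection now raises
    # requests.Timeout instead of blocking the thread indefinitely
    new_headers = dict(headers, Referer=img_url)
    try:
        img = requests.get(img_url, headers=new_headers, timeout=timeout)
        if img.status_code == 200:
            with open('{}/{}.jpg'.format(dir, img_url.split('/')[-1].split('?')[0]), 'wb') as jpg:
                jpg.write(img.content)
            return True
        return False
    except requests.RequestException as e:
        print('exception occurs: {} ({})'.format(img_url, e))
        return False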

def getAllTags():
    tags = {}
    html = BeautifulSoup(requests.get(rootrurl + 'finding_wallpapers.php', headers=headers).text,
                         features="html.parser")
    # the category links sit in the second <div> of the first 'row' container
    div = html.find('div', {'class': 'row'}).find_all('div')[1]
    a_s = div.find_all('a')[1:]  # skip the first anchor, which is not a category
    for a in a_s:
        # key: first word of the link title; value: absolute listing URL
        tags[a.get('title').split(' ')[0]] = rootrurl + a.get('href')
    return tags
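
For orientation, getAllTags() returns a dict that maps a short category name to its listing URL. The Anime entry below is taken from the test URL further down in the script; the other entries depend on whatever the live page serves:

# e.g. {'Anime': 'https://wall.alphacoders.com/by_category.php?id=3&name=Anime+Wallpapers', ...}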

def getSubTitleName(s):
    # replace every character that is not a Chinese character, a letter,
    # or a digit with an underscore, yielding a filesystem-safe directory name
    cop = re.compile(r"[^\u4e00-\u9fa5a-zA-Z0-9]")
    return cop.sub('_', s)
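
A quick check of the sanitizer on a made-up title:

getSubTitleName('Sword Art Online: Alicization')  # -> 'Sword_Art_Online__Alicization'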

def getSubDir(p):
    # the last <a> inside the thumbnail's info span carries the wallpaper title
    return getSubTitleName(p.find_all('a')[-1].get_text())

def getImgUrl(p):
    # the listing page only embeds thumbnail URLs; recombine the pieces around
    # the '-' separators to recover the full-resolution image URL
    parts = p.find('img').get('src').split('-')
    return parts[0][:-5] + parts[-1]  # drop the 'thumb' marker and the size segment
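
To make that splice concrete, here is how it plays out on a thumbnail URL of the shape the code evidently expects (the sample URL is illustrative, not taken from the site):

src = 'https://images4.alphacoders.com/123/thumb-350-1234567.jpg'
parts = src.split('-')        # ['https://.../123/thumb', '350', '1234567.jpg']
parts[0][:-5] + parts[-1]     # 'https://images4.alphacoders.com/123/1234567.jpg'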

def processOnePage(tag, a_s, span):
    # a_s[i] is the i-th thumbnail box on the page, span[i] its matching title info
    for i in range(0, len(a_s)):
        subdir = getSubDir(span[i])
        img = getImgUrl(a_s[i])
        tmpDir = '{}{}/{}'.format(save_dir, tag, subdir)
        if not os.path.exists(tmpDir):
            os.makedirs(tmpDir)
        saveOneImg(tmpDir, img)

def oneSpider(tag, url):
    # read the total page count from the second-to-last pagination link
    html = BeautifulSoup(requests.get(url, headers=headers).text, features="html.parser")
    total = int(html.find('ul', {'class': 'pagination'}).find_all('a')[-2].string)

    a_s = html.find_all('div', {'class': 'boxgrid'})
    span = html.find_all('span', {'class': 'thumb-info-big'})
    print('----- current page is 1. ------')
    processOnePage(tag, a_s, span)

    # the category URL already carries a query string, so each further page
    # is addressed by appending '&page=N'
    for i in range(2, total + 1):
        html = BeautifulSoup(requests.get('{}&page={}'.format(url, i), headers=headers).text,
                             features="html.parser")
        a_s = html.find_all('div', {'class': 'boxgrid'})
        span = html.find_all('span', {'class': 'thumb-info-big'})
        print('----- current page is %d. ------' % i)
        processOnePage(tag, a_s, span)

if __name__ == '__main__':
    taglist = getAllTags()
    # print(taglist)

    # give every tag its own worker thread
    with ThreadPoolExecutor(max_workers=31) as t:  # thread pool of at most 31 workers
        for tag, url in taglist.items():
            t.submit(oneSpider, tag, url)

    # just for test:
    # oneSpider('Anime', 'https://wall.alphacoders.com/by_category.php?id=3&name=Anime+Wallpapers')
    # for tag, url in taglist.items():
    #     oneSpider(tag, url)

    # the with-block only exits after every submitted task has finished,
    # so all downloads are done once control reaches this line
    print('all tags done.')
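
If per-category progress is wanted rather than a silent wait, one option is to keep the submitted futures and report each as it completes. This is a sketch of an alternative main loop, not something from the original post:

from concurrent.futures import as_completed

with ThreadPoolExecutor(max_workers=31) as t:
    futures = {t.submit(oneSpider, tag, url): tag for tag, url in taglist.items()}
    for f in as_completed(futures):
        print('finished tag: ' + futures[f])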
