# -*- coding:utf-8 -*-
"""
@Author: LZL
@File: Spider_sjzg.py
@Purpose: Crawl Baidu image-search results for a user-supplied keyword and save the images to a local folder
@CreateTime: 2021/9/8 18:15
@Software: PyCharm
@Thought: 对象思维
"""
import requests
import time
import os
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import random
# Example Baidu image-search URL: https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1631096170270_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=cat
from pyinstrument import Profiler
# Module-level pyinstrument profiler; it is started here, before the class
# definition and the script body below are executed.
profiler=Profiler()
profiler.start()
class Get_Baidu:
    """Selenium-driven crawler that saves Baidu image-search results to disk.

    The search keyword is read from standard input when the object is
    created.  Images are written to a directory named after the keyword
    (relative to the current working directory) as ``<keyword><n>.jpg``.
    """

    def __init__(self):
        """Prompt for the search keyword and start a maximized Chrome session."""
        self.search = input("请输入想要爬取的图片类型:")
        self.url = "https://image.baidu.com/search/index?tn=baiduimage"
        self.browser = webdriver.Chrome()
        # Explicit-wait helper with a 20 second timeout.
        self.wait = WebDriverWait(self.browser, 20)
        self.browser.maximize_window()
        # Number of images saved so far; also used to build the file names.
        self.count = 0
        # File name of the most recently saved image.
        self.img_name = ""
        # Emoticons appended to the periodic progress message.
        self.expression = [
            "ヾ(^▽^*)))",
            "(´v`)",
            "(≧∇≦)ノ",
            "o(^▽^)o",
            "(o>ε(o>u(≧∩≦)",
            "( ̄︶ ̄)↗",
            "o(* ̄▽ ̄*)o",
            "(p≧w≦q)",
            "!*★,°*:.☆( ̄▽ ̄)/$:*.°★* 。",
            "ㄟ(≧◇≦)ㄏ",
            "(/≧▽≦)/",
            "( ゚∀゚) ノ♡",
            "o(* ̄︶ ̄*)o",
            "(๑¯∀¯๑)",
            "(≧∀≦)ゞ",
            "φ(≧ω≦*)♪",
            "╰(*°▽°*)╯",
            "^O^",
            "(*^▽^*)",
            "ヽ(✿゚▽゚)ノ",
            "(´▽`ʃ♡ƪ)",
            "φ(゜▽゜*)♪",
            "o(* ̄▽ ̄*)o",
            "(u‿ฺu✿ฺ)",
        ]

    def main(self):
        """Create the output directory, run the search and download the images.

        The browser is always shut down on the way out (including when
        ``mkdir`` terminates the program) so no Chrome/chromedriver process
        is left behind.
        """
        try:
            self.mkdir()
            self.browser.get(self.url)
            time.sleep(1)
            # ``until`` returns the located element, so no second lookup is needed.
            search_box = self.wait.until(
                EC.presence_of_element_located((By.ID, "kw"))
            )
            search_box.send_keys(self.search)
            time.sleep(1)
            self.browser.find_element(By.CLASS_NAME, "s_newBtn").click()
            self.get_img()
        finally:
            self.browser.quit()

    def mkdir(self):
        """Create the output directory named after the search keyword.

        Raises:
            SystemExit: with code 0, after a 3 second pause, when a file or
                directory with that name already exists.
        """
        try:
            os.mkdir(self.search)
        except FileExistsError:
            print("目录文件已存在,请删除或重命名... 3秒后结束程序")
            time.sleep(3)
            # ``exit()`` is injected by the ``site`` module and is not always
            # available; raising SystemExit directly is equivalent and safe.
            raise SystemExit(0) from None

    def get_img(self):
        """Open each thumbnail on the result page and save its full-size image.

        Thumbnails whose position is below ``self.count`` are skipped.  After
        the loop the page is scrolled far down and the method pauses for
        5 seconds.
        """
        thumbnails = self.browser.find_elements(By.CLASS_NAME, "imgbox-border")
        for position, thumbnail in enumerate(thumbnails):
            if position < self.count:
                continue
            try:
                thumbnail.click()
            except Exception:
                print("****** 广告?(。ò ∀ ó。)反正吧他就是点击不了 ┭┮﹏┭┮ ****** ")
                continue
            handles = self.browser.window_handles
            if len(handles) < 2:
                # The click did not open a detail tab; nothing to switch to.
                continue
            self.browser.switch_to.window(handles[1])
            try:
                img_url = self._current_image_url()
                if img_url is not None:
                    self._save_image(img_url)
            finally:
                # Always return to the result list, even if saving failed.
                self._close_detail_tab()
        self.browser.execute_script(
            "var q=document.documentElement.scrollTop=100000"
        )
        time.sleep(5)

    def _current_image_url(self):
        """Return the http(s) ``src`` of the detail-page image, or None."""
        try:
            img_url = self.browser.find_element(
                By.ID, "currentImg"
            ).get_attribute("src")
        except Exception:
            return None
        # ``src`` may be missing or a non-http value that requests cannot fetch.
        if not img_url or "http" not in img_url:
            return None
        return img_url

    def _save_image(self, img_url):
        """Download ``img_url`` into the output directory; return True on success."""
        try:
            response = requests.get(img_url, timeout=30)
            # Do not store an HTTP error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as err:
            print("图片下载失败,已跳过: %s" % err)
            return False
        img_name = self.search + str(self.count) + ".jpg"
        if self.count % 6 == 0:
            print(
                "累计爬取图片:%d 张 %s"
                % (self.count, random.choice(self.expression))
            )
        with open(os.path.join(self.search, img_name), "wb", buffering=0) as out:
            out.write(response.content)
        self.count += 1
        self.img_name = img_name
        # Poll until the file just written is visible on disk.
        while self.getfile(self.img_name):
            time.sleep(2)
            print("文件不存在!读取中.....")
        return True

    def _close_detail_tab(self):
        """Close the current tab and switch back to the first window."""
        self.browser.close()
        self.browser.switch_to.window(self.browser.window_handles[0])

    def getfile(self, path):
        """Report whether ``path`` is still missing from the output directory.

        Args:
            path: file name relative to the ``self.search`` directory.

        Returns:
            True when no regular file exists at that location, False when
            it does.  The joined path is printed on every call.
        """
        print(os.path.join(self.search, path))
        return not (Path(self.search) / path).is_file()
# Script body: show the start-up banner, run the crawler, then stop the
# profiler and print its report.
print(
    "****************************************",
    "*** 百度图片Spider 跟据输入内容进行爬取***",
    "*** 如需修改请联系 QQ:389131999 ***",
    "****************************************",
    sep="\n",
)
Get_Baidu().main()
profiler.stop()
profiler.print()

温酒往事·
- 粉丝: 343
最新资源
- 公益慈善电子商务平台项目建设方案.doc
- 网络应用基础在线考核.doc
- 三菱PLC与MCGS组态触摸屏在广场喷泉控制系统的集成应用解析
- 基于51单片机的GPS定位系统的设计.doc
- 网络公司电话销售话术.doc
- 系统集成项目管理工程师9大知识体系汇总.doc
- 综合布线标识设计方案.pptx
- 国家开放大学电大《思想道德修养与法律基础》网络核心课终结性考试三套试题及答案.docx
- 商业银行大数据建设规划.docx
- 数字电路后端设计逻辑综合.ppt
- 虚拟化方案-供参考.doc
- 2023年计算机二级语言笔试试卷.doc
- 秦皇岛二中校园网络视频直播方案成功案例.docx
- 公司项目管理手册实施细则.doc
- 网络营销概要.pptx
- 六自由度系统集成设计(一)PPT课件.ppt
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈


