Switching between pages
This applies to sites where clicking a link opens a new page (tab): the browser/driver object still points to the previous page, so you have to switch window handles explicitly.
window_handles = driver.window_handles
driver.switch_to.window(window_handles[-1])
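If you later want to go back, a minimal sketch of closing the new tab and returning to the original one (assuming the original tab is the first handle):
driver.close()  # closes only the currently focused tab
driver.switch_to.window(driver.window_handles[0])  # switch back to the original tab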
Saving a screenshot of the page
driver.save_screenshot('screen.png')
Executing JavaScript
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
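This scroll-to-bottom call is repeated several times with a pause in the case studies below so lazily loaded content can render; a small helper (the function name and defaults are my own) captures that pattern:
import time

def scroll_to_bottom(driver, times=3, pause=2):
    # scroll to the bottom repeatedly, pausing so lazily loaded content can appear
    for _ in range(times):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)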
Going back and forward
driver.back()
driver.forward()
Cookie handling
Getting, adding, and deleting cookies
driver.get_cookies()
driver.add_cookie({'name': 'name', 'domain': 'www.zhihu.com', 'value': 'germey'})
driver.delete_all_cookies()
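Since the later examples mix Selenium with requests, a common follow-up is to copy the driver's cookies into a requests.Session so plain HTTP requests reuse the logged-in state; a minimal sketch (the session object and the URL are only illustrative):
import requests

session = requests.Session()
for cookie in driver.get_cookies():
    # each Selenium cookie is a dict containing at least 'name' and 'value'
    session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
resp = session.get('https://www.zhihu.com/')  # sent with the browser's cookies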
Headless Chrome
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
Evading detection
Some sites detect requests driven by Selenium: the backend can check the value of window.navigator.webdriver. If it is undefined, the request was not sent by Selenium; if it is true, it was.
One way to evade detection:
from selenium.webdriver import ChromeOptions
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path='chromedriver.exe',options=option)
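On newer Chrome versions the excludeSwitches option alone may no longer reset window.navigator.webdriver; a commonly used complement (a sketch, not guaranteed for every Chrome/driver combination) injects a script through the DevTools protocol before each page loads:
# overwrite navigator.webdriver before any page script runs
bro.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
    'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})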
Switching into child frames
This is mainly used for switching in and out of iframe sub-frames.
iframe = driver.find_element_by_tag_name('iframe')  # locate the iframe node first
driver.switch_to.frame(iframe)                      # then switch into it before locating elements inside
Disabling image loading
With the following settings the browser will not request images, which speeds things up:
chrome_opt = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_opt.add_experimental_option("prefs", prefs)
browser = webdriver.Chrome(chrome_options=chrome_opt)  # pass the options when creating the driver
Case study: fetching more movie detail data from Douban Movies (headless Chrome)
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.options import Options
Step 1: the following three lines are fixed boilerplate
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
Step 2: pass the chrome_options object as an argument
bro = webdriver.Chrome(chrome_options=chrome_options)
bro.get(url)
sleep(3)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(3)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(3)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
page_text = bro.page_source
with open('./douban.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(page_text)
sleep(1)
bro.quit()
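As a follow-up, the saved page can be parsed with lxml in the same way as the later examples; the XPath below assumes the movie titles sit in span elements with class "movie-name-text", which you should verify in the browser's dev tools:
from lxml import etree

tree = etree.HTML(page_text)
# the class name is an assumption about Douban's current markup; adjust if it differs
names = tree.xpath('//span[@class="movie-name-text"]/a/text()')
print(names)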
Case study: logging in to QQ Zone
On the web you often run into pages that nest frames. WebDriver can only identify elements in one page context at a time, so elements inside a nested frame cannot be located directly. In that case you must use switch_to.frame() to move the current context into the frame: first locate the iframe, then locate the target elements inside it; otherwise the elements you want will not be found.
import requests
from selenium import webdriver
from lxml import etree
import time
driver = webdriver.Chrome(executable_path=r'C:\Users\Administrator\chromedriver.exe')
driver.get('https://qzone.qq.com/')
# switch into the login frame with switch_to; only then can the login form be operated on
driver.switch_to.frame('login_frame')
# click "log in with account and password"; the click event has to be triggered
driver.find_element_by_id('switcher_plogin').click()
# driver.find_element_by_id('u').clear()
driver.find_element_by_id('u').send_keys('QQ')  # placeholder: your QQ number
# driver.find_element_by_id('p').clear()
driver.find_element_by_id('p').send_keys('密码')  # placeholder: your password
# click the login button; again the click event has to be triggered
driver.find_element_by_id('login_button').click()
time.sleep(2)
driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
time.sleep(2)
driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
time.sleep(2)
driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
time.sleep(2)
page_text = driver.page_source  # grab the page source; note that page_source is a property, not a method
tree = etree.HTML(page_text)  # parse the page
li_list = tree.xpath('//ul[@id="feed_friend_list"]/li')
for li in li_list:
    text_list = li.xpath('.//div[@class="f-info"]//text() | .//div[@class="f-info qz_info_cut"]//text()')
    text = ''.join(text_list)
    print(text + '\n\n\n')
driver.quit()
The small frame is nested inside the big one: the current HTML source embeds a child HTML page inside an iframe tag. So whenever the element to be located lives inside an iframe, the browser's point of reference must first be switched into that iframe with switch_to; here the iframe carries the id attribute login_frame, which can be used to locate it.
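In general, once the work inside the iframe is finished, the context has to be moved back before elements in the outer page can be located again; a minimal sketch:
# return from the iframe to the top-level document
driver.switch_to.default_content()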
Case study: grabbing a 12306 ticket (Wuhan -> Shanghai); the hard-coded stations could later be replaced with user input, as sketched after the code below.
from selenium import webdriver
from user import username, password  # credentials kept in a local user.py
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains  # ActionChains simulates mouse actions: click, double click, right click, drag and drop, etc.
import time
from selenium.webdriver.common.keys import Keys
# for running without a visible browser window
from selenium.webdriver.chrome.options import Options
# evade Selenium detection
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
print('Start grabbing tickets...')
url = 'https://kyfw.12306.cn/otn/resources/login.html'
# open the browser
driver = webdriver.Chrome(executable_path=r'F:\PyCharm\chromedriver.exe', chrome_options=chrome_options)
# visit the login page
driver.get(url)
# enter the 12306 username
driver.find_element(By.ID,'J-userName').send_keys(username)
# enter the 12306 password
driver.find_element(By.ID,'J-password').send_keys(password)
# driver.find_element_by_css_selector('#J-password').send_keys(password)
# click log in
driver.find_element(By.ID,'J-login').click()
time.sleep(1)
# btn = driver.find_element(By.ID,'nc_1_n1z')
# ActionChains(driver).drag_and_drop_by_offset(btn,300,0).perform()
# slider captcha verification
while True:
    try:
        span = driver.find_element(By.ID, "nc_1_n1z")
        actions = ActionChains(driver)  # instantiate the action chain
        # time.sleep(1)  # wait one second
        # measured by hand: the slider needs to travel about 300 pixels
        # drag the slider 300 px to the right, then release (release() is chained before perform() so it actually executes)
        actions.click_and_hold(span).move_by_offset(300, 0).release().perform()
        time.sleep(1)
        # look for the refresh button; if it is gone the verification succeeded and the except branch breaks out of the loop
        a = driver.find_element(By.ID, "nc_1_refresh1")
        a.click()  # if the slide just failed, click refresh and slide again
    except Exception as e:
        print(e)
        break
time.sleep(1)
# click OK on the pop-up
driver.find_element(By.XPATH,'//a[contains(text(),"确定")]').click()
# click "book tickets"
driver.find_element(By.ID,'link_for_ticket').click()
time.sleep(1)
# confirm the warning pop-up
driver.find_element(By.ID,'qd_closeDefaultWarningWindowDialog_id').click()
# click the departure-station box and clear it
driver.find_element(By.ID,'fromStationText').click()
driver.find_element(By.ID,'fromStationText').clear()
# type the departure station
# driver.execute_script("arguments[0].value='%s'" % '武汉', driver.find_element(By.ID,'fromStationText'))
driver.find_element(By.ID,'fromStationText').send_keys('武汉')
driver.find_element(By.ID,'fromStationText').send_keys(Keys.ENTER)
# click the destination box and clear it
driver.find_element(By.ID,'toStationText').click()
driver.find_element(By.ID,'toStationText').clear()
# type the destination (toStationText)
# driver.execute_script("arguments[0].value='%s'" % '北京', driver.find_element(By.ID,'toStationText'))
driver.find_element(By.ID,'toStationText').send_keys('上海')
driver.find_element(By.ID,'toStationText').send_keys(Keys.ENTER)
# choose the departure date
driver.find_element(By.ID,'train_date').clear()
driver.find_element(By.ID,'train_date').send_keys('2022-10-22')
# click query
driver.find_element(By.ID,'query_ticket').click()
# locate all the ticket rows
# trs = driver.find_element(By.XPATH,'//tbody[@id="queryLeftTable"]/tr')
# # # check whether each row can be booked
# # for tr in trs:
# # tr_id = tr.get('id')
# # tds = driver.find_element(By.XPATH,f'//tr[@id={tr_id}]/td'.format()
time.sleep(1)
# click "book"
driver.find_element(By.XPATH,'//tbody[@id="queryLeftTable"]/tr[1]/td[13]/a').click()
time.sleep(1)
# choose the passenger
driver.find_element(By.XPATH,'//ul[@id="normal_passenger_id"]/li[1]/input').click()
# submit the order
driver.find_element(By.ID,'submitOrder_id').click()
time.sleep(1)
# choose a seat
driver.find_element(By.XPATH,'//div[@id="erdeng1"]/ul[2]/li[2]/a').click()
time.sleep(2)
# click confirm
driver.find_element(By.ID,'qr_submit_id').click()
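As the heading notes, the hard-coded stations can be replaced with user input; a minimal sketch (the variable names are my own) that would slot in where the stations are typed above:
# ask the user for the stations instead of hard-coding them
from_station = input('Departure station: ')  # e.g. 武汉
to_station = input('Destination station: ')  # e.g. 上海
driver.find_element(By.ID, 'fromStationText').clear()
driver.find_element(By.ID, 'fromStationText').send_keys(from_station)
driver.find_element(By.ID, 'fromStationText').send_keys(Keys.ENTER)
driver.find_element(By.ID, 'toStationText').clear()
driver.find_element(By.ID, 'toStationText').send_keys(to_station)
driver.find_element(By.ID, 'toStationText').send_keys(Keys.ENTER)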
Case study: scraping a WeChat official account through the Sogou search interface (headless, detection evasion, waits, page switching)
# add startup arguments (add_argument)
# add experimental options (add_experimental_option)
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
from lxml import etree
option = webdriver.ChromeOptions()
option.add_argument('headless')
# set chromedriver startup options to evade Selenium detection
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(chrome_options=option)
url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=python_shequ'
driver.get(url)
print(driver.title)
timeout = 5
link = WebDriverWait(driver, timeout).until(
    lambda d: d.find_element_by_link_text('Python爱好者社区'))
link.click()
time.sleep(1)
# switch to the newly opened page
window_handles = driver.window_handles
driver.switch_to.window(window_handles[-1])
print(driver.title)
article_links = WebDriverWait(driver, timeout).until(
    # EC.presence_of_element_located((By.XPATH, '//h4[@class="weui_media_title"]'))
    lambda d: d.find_elements_by_xpath('//h4[@class="weui_media_title"]'))
article_link_list = []
for item in article_links:
    # the article path is stored in a custom 'hrefs' attribute on the h4 node
    article_link = 'https://mp.weixin.qq.com' + item.get_attribute('hrefs')
    # print(article_link)
    article_link_list.append(article_link)
print(article_link_list)
first_article_link = article_link_list[0]
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
}
response = requests.get(first_article_link,
                        headers=header,
                        timeout=5)
tree = etree.HTML(response.text)
title = tree.xpath('//h2[@id="activity-name"]/text()')[0].strip()
content = tree.xpath('//div[@id="js_content"]//text()')
content = ''.join(content).strip()
print(title)
print(content)
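The commented-out line inside the second wait hints at the expected_conditions style; the same wait written that way would look roughly like this (the extra imports are required):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

article_links = WebDriverWait(driver, timeout).until(
    EC.presence_of_all_elements_located((By.XPATH, '//h4[@class="weui_media_title"]')))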