1.关于selenium选择器xpath的使用:
from selenium import webdriver

driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    # Implicit wait: configured before the get() request.
    driver.implicitly_wait(5)

    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

    # Explicit wait: would be written after the get() request.
    # wait.until(...)

    # Markup of the sample page, for reference:
    #
    # <html>
    #  <head>
    #   <base href='http://example.com/' />
    #   <title>Example website</title>
    #  </head>
    #  <body>
    #   <div id='images'>
    #    <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
    #    <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
    #    <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
    #    <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
    #    <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
    #   </div>
    #  </body>
    # </html>

    # Locate elements with XPath syntax.
    # "/" selects starting from the root node.
    # Note: '/head' would raise here, because <head> is not the root node.
    html = driver.find_element_by_xpath('/html')
    print(html.tag_name)

    # "//" matches the node anywhere in the document; find_element returns
    # the first match.
    div = driver.find_element_by_xpath('//div')
    print(div.tag_name)

    # "@" selects by attribute: the div whose id is "images".
    div = driver.find_element_by_xpath('//div[@id="images"]')
    print(div.tag_name)
    print(div.text)

    # First <a> node.
    a = driver.find_element_by_xpath('//a')
    print(a.tag_name)

    # All <a> nodes.
    a_s = driver.find_elements_by_xpath('//a')
    print(a_s)

    # href attribute of the first <a> node.
    # get_attribute: read one attribute of the node.
    a = driver.find_element_by_xpath('//a').get_attribute('href')
    print(a)

finally:
    # quit() ends the whole session (all windows plus the chromedriver
    # process); close() would only close the current window.
    driver.quit()
2.selenium剩余更多操作:
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys

# ---------------------------------------------------------------------------
# Typing into a search box and submitting (jd.com)
# ---------------------------------------------------------------------------
driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    driver.implicitly_wait(10)
    # 1. Send a request to jd.com.
    driver.get('https://www.jd.com/')
    # Find the search box and type the first query.
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('围城')
    # Press Enter on the keyboard.
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)
    # Look the search box up again on the result page and type a new query.
    input_tag = driver.find_element_by_id('key')
    input_tag.clear()
    input_tag.send_keys('墨菲定律')
    # Find the search button and click it.
    button = driver.find_element_by_class_name('button')
    button.click()
    time.sleep(10)

finally:
    # quit() ends the session and the driver process; close() would only
    # close the current window.
    driver.quit()


# ---------------------------------------------------------------------------
# Getting cookies (for awareness only)
# ---------------------------------------------------------------------------
driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    driver.implicitly_wait(10)
    driver.get('https://www.zhihu.com/explore')
    print(driver.get_cookies())

    time.sleep(10)
finally:
    driver.quit()


# ---------------------------------------------------------------------------
# Tabs
# ---------------------------------------------------------------------------
# Tabs can be opened via JavaScript (window.open) or via keyboard shortcuts
# such as ctrl+t; the JavaScript approach is the most portable.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    # execute_script runs JavaScript in the page. It could also open a
    # dialog, e.g. execute_script('alert("tank")').
    # Open a new browser tab.
    browser.execute_script(
        '''
        window.open();
        '''
    )
    time.sleep(1)
    print(browser.window_handles)  # handles of all open tabs

    # Switch to the second tab. switch_to.window is the current spelling;
    # switch_to_window is the deprecated one.
    browser.switch_to.window(browser.window_handles[1])

    # Load taobao in the second tab.
    browser.get('https://www.taobao.com')
    time.sleep(5)

    # Switch back to the first tab.
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')

    time.sleep(10)
finally:
    # Two tabs are open here, so close() would leave one behind.
    browser.quit()


# ---------------------------------------------------------------------------
# ActionChains
# ---------------------------------------------------------------------------
driver = webdriver.Chrome()

try:
    # Kept inside the try so a failed navigation still releases the browser.
    driver.implicitly_wait(10)
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Switch into the frame whose id is iframeResult
    # (switch_to_frame is the deprecated spelling).
    driver.switch_to.frame('iframeResult')

    # Drag source.
    draggable = driver.find_element_by_id('draggable')

    # Drop target.
    droppable = driver.find_element_by_id('droppable')

    # An ActionChains object must be given the driver object.
    #
    # Approach 1 (robot-like): jump from source to target in one step:
    #     ActionChains(driver).drag_and_drop(draggable, droppable).perform()
    #
    # Approach 2 (human-like), used below: press, move in small steps,
    # release.
    source = draggable.location['x']
    target = droppable.location['x']
    print(source, target)

    distance = target - source
    print(distance)

    # Every queued action only runs once perform() is called.

    # Click and hold the source element.
    ActionChains(driver).click_and_hold(draggable).perform()

    s = 0
    while s < distance:
        # Move 2 pixels to the right per step.
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        s += 2

    # Release the mouse button.
    ActionChains(driver).release().perform()

    time.sleep(10)

finally:
    driver.quit()


# ---------------------------------------------------------------------------
# Back / forward
# ---------------------------------------------------------------------------
driver = webdriver.Chrome()

try:
    driver.implicitly_wait(10)
    driver.get('https://www.jd.com/')
    driver.get('https://www.baidu.com/')
    driver.get('https://www.cnblogs.com/')

    time.sleep(2)

    # Go back one page.
    driver.back()
    time.sleep(1)
    # Go forward one page.
    driver.forward()
    time.sleep(1)
    driver.back()
    time.sleep(10)

finally:
    driver.quit()
3.破解登录:
import time

from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Steps:
#   1. In the file explorer, enable "show hidden files".
#   2. Go to C:\Users\administortra\AppData\Local\Google\Chrome\User Data
#      and delete the Default folder.
#   3. Reopen the browser and log in to the Baidu account
#      - this creates a fresh Default cache folder.
#   4. Add the cookies (below).
#   5. Close Chrome, then run this program.

# Options object that carries the browser start-up arguments.
options = ChromeOptions()

# Directory in which Chrome stores the profile (and therefore the cookies).
profile_directory = r'--user-data-dir=C:\Users\administortra\AppData\Local\Google\Chrome\User Data'

# Register the user-data directory as a start-up argument.
options.add_argument(profile_directory)

# Hand the options to the driver through the chrome_options keyword.
# NOTE(review): newer Selenium releases spell this keyword `options=` --
# confirm against the installed version.
driver = webdriver.Chrome(chrome_options=options)

try:
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')

    # Cookie of interest: BDUSS:*****
    # Add the user's cookie; the dict keys "name" and "value" must be
    # lowercase.
    driver.add_cookie({"name": "BDUSS", "value": "用户session字符串"})

    # Reload the page so the cookie takes effect.
    driver.refresh()

    time.sleep(10)

finally:
    # quit() ends the session and the driver process; close() would only
    # close the current window.
    driver.quit()
4.爬取京东商品信息:
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()

try:
    driver.implicitly_wait(10)
    # 1. Send a request to the JD home page.
    driver.get('https://www.jd.com/')

    # 2. Type the product name and press Enter to search.
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('macbook')
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)

    # Scroll down via JavaScript so the remaining products get loaded.
    js_code = '''
        window.scrollTo(0,5000);
    '''
    driver.execute_script(js_code)

    # Wait for the data to load.
    time.sleep(2)

    # 3. Find every product node on the page.
    good_list = driver.find_elements_by_class_name('gl-item')

    # Open the output file once and append one record per product.
    with open('jd.txt', 'a', encoding='utf-8') as f:
        for good in good_list:
            # Product link.
            good_url = good.find_element_by_css_selector(
                '.p-img a').get_attribute('href')

            # Product name.
            good_name = good.find_element_by_css_selector(
                '.p-name em').text.replace("\n", "--")

            # Product price.
            good_price = good.find_element_by_class_name(
                'p-price').text.replace("\n", ":")

            # Number of reviews.
            good_commit = good.find_element_by_class_name(
                'p-commit').text.replace("\n", " ")

            # Seller.
            good_from = good.find_element_by_class_name(
                'J_im_icon').text.replace("\n", " ")

            good_content = f'''
            商品链接: {good_url}
            商品名称: {good_name}
            商品价格: {good_price}
            评价人数: {good_commit}
            商品商家: {good_from}
            \n
            '''
            print(good_content)
            f.write(good_content)

    # Move on to the next result page (it is only opened, not scraped).
    next_tag = driver.find_element_by_link_text('下一页')
    next_tag.click()

    time.sleep(10)

finally:
    # quit() ends the session and the driver process; close() would only
    # close the current window.
    driver.quit()
5.作业
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def get_good(driver, max_pages=None):
    """Print the products of every JD search-result page, then quit the driver.

    Starting from the result page that ``driver`` currently shows, each page
    is scrolled, every ``gl-item`` node is printed, and the ``pn-next`` link
    is clicked to reach the following page.

    Args:
        driver: a Selenium WebDriver already showing a JD result page. This
            function takes ownership of it and always quits it on exit.
        max_pages: optional upper bound on the number of pages to process;
            ``None`` (the default) keeps going until no next-page link is
            found.

    Raises:
        Any Selenium exception raised while reading a product or clicking
        the link propagates to the caller after the driver has been quit.
        Only a missing next-page link is treated as a normal end.
    """
    # Imported locally so this block does not depend on other file edits.
    from selenium.common.exceptions import NoSuchElementException

    js_code = '''
        window.scrollTo(0,5000);
    '''
    pages_done = 0

    # Pages are walked in a loop, not by recursion, so the driver is
    # released exactly once however many pages were visited.
    try:
        while True:
            # Scroll down so the remaining products get loaded.
            driver.execute_script(js_code)
            time.sleep(2)

            for good in driver.find_elements_by_class_name('gl-item'):
                good_url = good.find_element_by_css_selector(
                    '.p-img a').get_attribute('href')
                good_name = good.find_element_by_css_selector(
                    '.p-name em').text.replace("\n", "--")
                good_price = good.find_element_by_class_name(
                    'p-price').text.replace("\n", ":")
                good_commit = good.find_element_by_class_name(
                    'p-commit').text.replace("\n", " ")
                good_from = good.find_element_by_class_name(
                    'J_im_icon').text.replace("\n", " ")

                good_content = f'''
                商品链接:{good_url}
                商品名称:{good_name}
                商品价格:{good_price}
                评价人数:{good_commit}
                商品商家:{good_from}
                \n
                '''
                # To persist the results, append good_content to a file here.
                print(good_content)

            pages_done += 1
            time.sleep(10)

            if max_pages is not None and pages_done >= max_pages:
                break

            try:
                next_tag = driver.find_element_by_class_name('pn-next')
            except NoSuchElementException:
                # No next-page link: the last page has been processed.
                break
            next_tag.click()
            time.sleep(2)

    finally:
        # quit() ends the session and the driver process; close() would only
        # close the current window.
        driver.quit()


if __name__ == '__main__':
    good_name = input('请输入商品名:').strip()

    driver = webdriver.Chrome()
    driver.implicitly_wait(10)
    driver.get("https://www.jd.com/")
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys(good_name)
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)
    get_good(driver)