selenium的使用
-
selenium的基础
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# 1. Create the browser object.
# NOTE: a browser stored in a local variable is closed automatically when the
# function returns; a global browser must be closed manually.
b = webdriver.Chrome()

# 2. Open a page.
b.get('https://www.jd.com')

# 3. Page content.
# NOTE: page_source only contains what the browser has loaded so far.
# print(b.page_source)

# 4. Locate and operate on elements.
# Use the By-based locator API: the find_element_by_* helpers are deprecated
# and were removed in Selenium 4.  Also avoid naming a variable `input`,
# which shadows the builtin.
search_input = b.find_element(By.CSS_SELECTOR, '#key')
# Type a query into the search box, then press Enter to submit.
search_input.send_keys('电脑')
search_input.send_keys(Keys.ENTER)
time.sleep(1)

# The page navigated, so the old element is stale — re-locate the input
# before reusing it.
search_input2 = b.find_element(By.CSS_SELECTOR, '#key')
# Clear whatever the box was pre-filled with, then search again.
search_input2.clear()
search_input2.send_keys('手机')
search_input2.send_keys(Keys.ENTER)

# 5. Go back in browser history (browser.back()).
time.sleep(1)
b.back()
time.sleep(1)
b.back()

# 6. Go forward in browser history (browser.forward()).
time.sleep(1)
b.forward()
time.sleep(1)
b.forward()

# Locate and click the search button.
search_btn = b.find_element(By.CSS_SELECTOR, '.button.cw_icon')
search_btn.click()

# Close the browser when done.
# b.close()
-
selenium选项卡
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

b = webdriver.Chrome()
b.get("https://www.jd.com")

# Click the flash-sale ("秒杀") link; it opens in a new tab.
# Use the By-based locator API: find_element_by_* was removed in Selenium 4.
miaosha = b.find_element(By.CSS_SELECTOR, "#navitems-group1>li>a")
miaosha.click()

# window_handles lists a handle for every open tab, in creation order.
print(b.window_handles)
time.sleep(2)

# Switch the driver's focus between tabs by handle.
b.switch_to.window(b.window_handles[0])
time.sleep(2)
b.switch_to.window(b.window_handles[1])
-
selenium获取网页cookies
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

url = "https://www.taobao.com"

# Create the browser object and open the page.
b = webdriver.Chrome()
b.get(url)

# Find the search box, type a query and press Enter to navigate.
# By-based locator API: find_element_by_* was removed in Selenium 4;
# `search_box` instead of `input`, which would shadow the builtin.
search_box = b.find_element(By.CSS_SELECTOR, "#q")
search_box.send_keys("电脑")
search_box.send_keys(Keys.ENTER)

# Taobao redirects to its login page — wait for a human to log in manually.
time.sleep(40)

# After the manual login, capture the session cookies and persist them
# so a later run can restore the logged-in session.
cookies = b.get_cookies()
# print(cookies)
with open('./files/taobao_cookies.txt', 'w', encoding='utf-8') as f:
    f.write(str(cookies))
-
selenium使用cookies
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import ast
import time

# Create the browser and open the domain the cookies belong to —
# add_cookie only accepts cookies for the current domain.
b = webdriver.Chrome()
b.get('https://www.taobao.com')

# Restore the cookies saved by the companion "save cookies" script.
with open(r'./files/taobao_cookies.txt', 'r', encoding='utf-8') as f:
    # The file holds str(list-of-dicts).  ast.literal_eval parses that
    # repr safely; eval() would execute arbitrary code from the file.
    cookie_list = ast.literal_eval(f.read())

for cookie in cookie_list:
    # Only replay secure cookies, matching the original filter.
    if cookie['secure']:
        b.add_cookie(cookie)
# b.refresh()  # reload so the restored cookies take effect — confirm needed for your flow
-
设置页面滚动,爬取京东数据
def get_net_data():
    """Load ~100 JD search-result pages for '电脑' and return their HTML.

    Each page is scrolled to the bottom in steps so that lazy-loaded
    goods are rendered before ``page_source`` is captured.

    Returns:
        list[str]: one rendered HTML document per result page.
    """
    global b
    b = webdriver.Chrome()
    data = []
    j = 0
    # JD uses odd page numbers.  The 's' offset is irregular for the first
    # two pages (1, then 56) and afterwards grows by 60 per page.
    for i in range(1, 200, 2):
        if i == 1:
            b.get('https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&pvid=2d0c277e2ea34ae8bb95598ad3a04f48&page=1&s=1&click=0')
        elif i == 3:
            b.get('https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&pvid=2d0c277e2ea34ae8bb95598ad3a04f48&page=3&s=56&click=0')
        else:
            j += 1
            b.get(f'https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&pvid=2d0c277e2ea34ae8bb95598ad3a04f48&page={i}&s={60*j+56}&click=0')
        time.sleep(0.5)
        # Scroll down in 500px steps past max_height so lazy-loaded items
        # render before the page source is captured.
        max_height = 8000
        for y in range(500, max_height + 501, 500):
            b.execute_script(f'window.scrollTo(0, {y})')
            time.sleep(0.2)
        data.append(b.page_source)
    return data


def analysis_data(data):
    """Parse captured JD result pages into rows of product fields.

    Args:
        data: list of HTML documents as returned by ``get_net_data``.

    Returns:
        list[list[list[str]]]: per page, a list of
        [brand, picture, link, price, evaluation_count, store_link] rows.
    """
    all_info = []
    for page_html in data:
        soup = BeautifulSoup(page_html, 'lxml')
        li_list = soup.select('#J_goodsList>ul>li')
        page_rows = []
        for li in li_list:
            name_tag = li.select_one('.p-name.p-name-type-2>a>em')
            img_tag = li.select_one('img')
            link_tag = li.select_one('a')
            price_tag = li.select_one('i')
            eval_tag = li.select_one('strong>a')
            shop_tag = li.select_one('.curr-shop.hd-shopname')
            # Ad/placeholder items can lack some of these nodes; skip them
            # instead of crashing the whole crawl with an AttributeError.
            if not all((name_tag, img_tag, link_tag, price_tag, eval_tag, shop_tag)):
                continue
            # Brand is the first whitespace-separated token of the title.
            brand = name_tag.get_text().split(' ')[0]
            # src/href are protocol-relative ("//...") — prepend the scheme.
            picture = 'https:' + str(img_tag.attrs.get('src'))
            link = 'https:' + link_tag.attrs['href']
            price = str(price_tag.string)
            evaluation = eval_tag.get_text()
            store_link = 'https:' + str(shop_tag.attrs['href'])
            page_rows.append([brand, picture, link, price, evaluation, store_link])
        all_info.append(page_rows)
    return all_info


if __name__ == '__main__':
    all_info = analysis_data(get_net_data())
    # newline='' keeps the csv module from emitting blank rows on Windows.
    with open('./files/京东电脑所有信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['电脑品牌', '图片地址', '电脑链接', '电脑价格', '评价人数', '店铺链接'])
        for page_rows in all_info:
            writer.writerows(page_rows)