用google浏览器模拟登录获取cookie并爬取数据
本人新手入门python,希望能共同进步
转载请注明地址http://mp.blog.csdn.net/postedit/79423191
模拟 Google 登录的方法是从 https://www.cnblogs.com/bethansy/p/7683130.html 这个博客上找到的。然而每次都先模拟登录再爬取数据比较慢,所以我想能不能先获取 cookie,之后直接带上 cookie 参数用 Python 爬取
直接上代码了
# coding:utf-8
# /python3.5
__author__ = 'heart_eagle'

# Step 1: drive a real Chrome session through the login form with selenium,
# then harvest the authenticated session cookies for reuse with requests.
url = 'https://www.tianyancha.com/login'

from selenium import webdriver
import os
import requests
import time
from bs4 import BeautifulSoup

# Path to the chromedriver binary (must match the installed Chrome version).
chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application/chromedriver.exe'
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
driver.get(url)

# Simulated login: fill account / password inputs, then click the login button.
# NOTE(review): the absolute XPaths are brittle and will break whenever the
# page layout changes — confirm against the live page before relying on them.
driver.find_element_by_xpath(
    ".//*[@id='web-content']/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[2]/input"). \
    send_keys('***********')  # account
driver.find_element_by_xpath(
    ".//*[@id='web-content']/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[3]/input"). \
    send_keys('***********')  # password
driver.find_element_by_xpath(
    ".//*[@id='web-content']/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[5]").click()
time.sleep(3)  # crude wait for the login round-trip; an explicit wait would be sturdier
driver.refresh()

# Collect the cookies from the now-authenticated browser session.
xx = driver.get_cookies()
print(xx)

# Build the {name: value} dict directly while printing each pair
# (the original built two parallel lists and zipped them afterwards).
cookies = {}
for x in xx:
    print('name:', x['name'])
    print('value', x['value'])
    cookies[x['name']] = x['value']

# Fix: release the browser — the original script leaked the Chrome /
# chromedriver processes by never quitting the driver.
driver.quit()

# Request headers mimicking the browser, reused by the requests call in step 2.
headers = {
    'Host': 'www.tianyancha.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Referer': 'https://www.tianyancha.com/',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
print(cookies)
#第二步 带入 cookie 等参数直接用 requests 抓取
# Step 2: query the search page with plain requests, carrying the cookies
# and headers captured by the selenium login in step 1.
url = 'https://www.tianyancha.com/search'

# Query-string parameters for the search page.
params = {
    'key': '肯德基',
    # Fix: the original key was 'checkFrom:' — the stray trailing colon was
    # part of the key string, so the server received 'checkFrom:' instead of
    # the intended 'checkFrom' parameter.
    'checkFrom': 'searchBox'
}

# GET request carrying params, cookies and headers; 60 s timeout guards
# against the call hanging forever.
response = requests.get(url, params=params, headers=headers, cookies=cookies, timeout=60)
response.encoding = 'utf-8'
res = response.text
soup = BeautifulSoup(res, 'html.parser')

# The parsed page we wanted.
print(soup)
# ...from here, extract the target fields with tag selectors or regexes.
这里是打印出来的cookies(最后一行)
这里是打印出来的soup一部分