参考:https://blog.csdn.net/c406495762/article/details/69817490
1、有些网站需要登录后才能访问某个页面。如果在登录之前抓取该页面,得到的内容与登录之后不同,或者根本不允许访问。
使用Cookie和使用代理IP一样,也需要创建一个自己的opener。Python 3 的 http 包中提供了 cookiejar 模块(http.cookiejar),用于提供对Cookie的支持。
2、将Cookie保存到变量中
def getCookeis():
    """Open http://www.baidu.com and print each cookie the server sets.

    Cookies are collected in an in-memory ``CookieJar`` only; nothing is
    written to disk. For every cookie in the jar, its name and value are
    printed on separate lines.

    Returns:
        None.
    """
    # Imported locally so this snippet is runnable on its own.
    from http import cookiejar
    from urllib import request

    # In-memory jar that receives the cookies from the response.
    cookie = cookiejar.CookieJar()
    # Handler that lets the opener store cookies in (and send them from) the jar.
    handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(handler)
    # Only the side effect of filling the jar is needed; the body is not
    # read, so close the response right away instead of leaking it.
    with opener.open('http://www.baidu.com'):
        pass
    for item in cookie:
        print('Name=%s' % item.name)
        print('Value=%s' % item.value)
3、保存Cookie到文件
def getCookiesToTxt():
    """Open http://www.baidu.com and save the received cookies to a file.

    The cookies are written in Mozilla ``cookies.txt`` format to
    ``..\\file\\cookie.txt`` (a Windows-style relative path).

    Returns:
        None.
    """
    # Imported locally so this snippet is runnable on its own.
    from http import cookiejar
    from urllib import request

    # Both backslashes are escaped: the original '\c' was an invalid escape
    # sequence that only worked by accident. The resulting path is unchanged.
    filename = '..\\file\\cookie.txt'
    # Jar bound to the target file; save() below writes to this path.
    cookie = cookiejar.MozillaCookieJar(filename)
    handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(handler)
    # Only the cookies are needed, not the body; close the response promptly.
    with opener.open('http://www.baidu.com'):
        pass
    # ignore_discard: also save cookies that are marked to be discarded.
    # ignore_expires: also save cookies that have already expired.
    cookie.save(ignore_discard=True, ignore_expires=True)
4、从文件中获取Cookie并访问
def getCookiesFromTxt():
    """Load cookies from a file, request http://www.baidu.com, print the page.

    Cookies are read from ``..\\file\\cookie.txt`` (Mozilla ``cookies.txt``
    format) and attached to the opener used for the request. The response
    body is decoded as UTF-8 and printed.

    Returns:
        None.
    """
    # Imported locally so this snippet is runnable on its own.
    from http import cookiejar
    from urllib import request

    # Both backslashes are escaped: the original '\c' was an invalid escape
    # sequence that only worked by accident. The resulting path is unchanged.
    filename = '..\\file\\cookie.txt'
    cookie = cookiejar.MozillaCookieJar()
    # ignore_discard / ignore_expires: also load cookies that are marked to
    # be discarded or that have already expired.
    cookie.load(filename, ignore_discard=True, ignore_expires=True)
    handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(handler)
    # The context manager closes the connection once the body has been read.
    with opener.open('http://www.baidu.com') as response:
        print(response.read().decode('utf-8'))
5、利用cookies模拟登录。这里选择的是 http://www.biqukan.com :https 网站总是不成功,而这个网站目前是 http 的。
def simulateLogin():
    """Log in to www.biqukan.com with a cookie-aware opener and print a page.

    A POST with the username/password form data is sent to the login URL;
    the same opener (sharing one ``CookieJar``) then requests the bookcase
    page, whose body is decoded as GBK and printed.

    ``urllib.error.URLError`` (including ``HTTPError``) is caught and
    reported with ``print`` instead of being propagated.

    Returns:
        None.
    """
    # Imported locally so this snippet is runnable on its own.
    from http import cookiejar
    from urllib import error, parse, request

    login_url = 'http://www.biqukan.com/user/login.php?action=login&usecookie=1&url=http://www.biqukan.com/'
    user_agent = r'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
    # The key must be spelled 'User-Agent'; the original 'User-Agnet' typo
    # sent the value under a meaningless header name.
    head = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

    # Login form fields (placeholders to be replaced with real credentials).
    Login_Data = {}
    Login_Data['password'] = 'yourpassword'
    Login_Data['username'] = 'yourname'
    # URL-encode the form and convert to bytes, as required for POST data.
    logingpostdata = parse.urlencode(Login_Data).encode('utf-8')

    # One jar shared by both requests so cookies set by the login response
    # are sent with the second request.
    cookie = cookiejar.CookieJar()
    cookie_support = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_support)

    req1 = request.Request(url=login_url, data=logingpostdata, headers=head)
    data_url = 'http://www.biqukan.com/user/bookcase.php'
    req2 = request.Request(url=data_url, headers=head)
    try:
        # The login response body is not used; it is opened only so the jar
        # picks up the cookies, then closed immediately.
        with opener.open(req1):
            pass
        with opener.open(req2) as response2:
            html = response2.read().decode('gbk')
        print(html)
    except error.URLError as e:
        # HTTPError (a URLError subclass) carries a numeric status code;
        # a plain URLError only carries a reason.
        if hasattr(e, 'code'):
            print("HTTPError:%d" % e.code)
        elif hasattr(e, 'reason'):
            print("URLError:%s" % e.reason)