爬虫 Cookie 学习

最新推荐文章于 2023-03-31 18:06:37 发布

sinat_34346715

最新推荐文章于 2023-03-31 18:06:37 发布

阅读量330

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/sinat_34346715/article/details/50959204

版权

python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

个人学习过程中混乱的思路记录，没什么参考价值。。

先是从网上看到的处理 cookie 的方法，即代码中标注的 (1)～(4) 四步：

#!/usr/bin/env python3
#coding=utf-8
import gzip
import re
import urllib.request
import http.cookiejar

def unzip(data):
    """Return *data* gunzipped, or unchanged when it is not valid gzip.

    Args:
        data: Raw response body as ``bytes``.

    Returns:
        The decompressed bytes when *data* is a gzip stream; otherwise
        *data* itself, untouched.
    """
    import zlib  # local import: only needed to name zlib.error below

    # Original author's question: would rebinding ``data = ...`` differ in
    # memory from binding a new name?  Either way gzip.decompress() returns a
    # new bytes object, so the result is simply returned directly here.
    try:
        return gzip.decompress(data)
    except (OSError, EOFError, zlib.error):
        # Not gzip (OSError / gzip.BadGzipFile), truncated (EOFError) or
        # corrupt (zlib.error): fall back to the raw payload.  Catching only
        # these keeps KeyboardInterrupt and programming errors visible.
        return data

def cookieTest():
    """Request http://www.zhihu.com/ once and print the cookies it set.

    Walks through the four numbered steps for capturing cookies with
    urllib; the cookies from the response are collected in ``cj``, which is
    printed at the end.
    """
    url = "http://www.zhihu.com/"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Host": "www.zhihu.com",
    }
    req = urllib.request.Request(url, headers=headers)
    cj = http.cookiejar.CookieJar()  # (1) container that will hold the cookies
    cookieHandler = urllib.request.HTTPCookieProcessor(cj)  # (2) handler bound to the jar
    opener = urllib.request.build_opener(cookieHandler)  # (3) opener that uses the handler
    # (4) send the request.  The ``with`` block closes the response (and its
    # connection) even if reading or decoding raises.
    with opener.open(req) as conn:
        # The body is read and decoded only to confirm the fetch works; the
        # text itself is not used afterwards.
        # NOTE(review): "deflate" is advertised above but unzip() only
        # understands gzip -- a deflate-encoded body would reach decode() raw.
        unzip(conn.read()).decode("utf-8")
    print("ok")
    print(cj)


if __name__ == '__main__':
    cookieTest()

打印结果为：

ok
<CookieJar[<Cookie cap_id="MjA4ODRkMTYxMDVlNDhhN2FhMDcyOTEyNjllMTMzMWU=|1458650717|cd52a12c1949177603c44283171906fbc1859ec8" for .zhihu.com/>, <Cookie n_c=1 for .zhihu.com/>, <Cookie q_c1=9d4b3e03bc4c4040b567cf22343dfc9f|1458650717000|1458650717000 for .zhihu.com/>, <Cookie _xsrf=ca6d5b9fe1ace33857f7549851058300 for www.zhihu.com/>]>
[Finished in 1.5s]

抓下了 response 中所有的 cookie，并且保存在了 CookieJar 对象（cj）中。
不知道其中的原理，先看 HTTPCookieProcessor 的源码：

class HTTPCookieProcessor(BaseHandler):
    """Handler that routes requests and responses through a cookie jar.

    Outgoing requests are passed to the jar's ``add_cookie_header``;
    incoming responses are passed to its ``extract_cookies``.  The same
    two hooks are reused for HTTPS.
    """

    def __init__(self, cookiejar=None):
        """Keep *cookiejar*, or create a new ``CookieJar`` when it is ``None``."""
        import http.cookiejar

        # Compare against None explicitly so that any jar object supplied by
        # the caller is kept as-is.
        self.cookiejar = (
            http.cookiejar.CookieJar() if cookiejar is None else cookiejar
        )

    def http_request(self, request):
        """Let the jar add its cookie header to *request* and return it."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    # HTTPS requests go through exactly the same hook.
    https_request = http_request

    def http_response(self, request, response):
        """Let the jar extract cookies from *response* and return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # HTTPS responses go through exactly the same hook.
    https_response = http_response

从构造函数中可以看到，如果没有传入 cookiejar（即为 None），它会自己新建一个，所以觉得注释掉 cj=http.cookiejar.CookieJar() 这行应该不影响，下面的代码证明确实是这样。

#!/usr/bin/env python3
#coding=utf-8
import gzip
import re
import urllib.request
import http.cookiejar

def unzip(data):
    """Return *data* gunzipped, or unchanged when it is not valid gzip.

    Args:
        data: Raw response body as ``bytes``.

    Returns:
        The decompressed bytes when *data* is a gzip stream; otherwise
        *data* itself, untouched.
    """
    import zlib  # local import: only needed to name zlib.error below

    # Original author's question: would rebinding ``data = ...`` differ in
    # memory from binding a new name?  Either way gzip.decompress() returns a
    # new bytes object, so the result is simply returned directly here.
    try:
        return gzip.decompress(data)
    except (OSError, EOFError, zlib.error):
        # Not gzip (OSError / gzip.BadGzipFile), truncated (EOFError) or
        # corrupt (zlib.error): fall back to the raw payload.  Catching only
        # these keeps KeyboardInterrupt and programming errors visible.
        return data

def cookieTest():
    """Request http://www.zhihu.com/ without creating a CookieJar explicitly.

    ``HTTPCookieProcessor()`` is built with no argument, so it creates its
    own jar.  The handler object itself is printed at the end.
    """
    url = "http://www.zhihu.com/"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Host": "www.zhihu.com",
    }
    req = urllib.request.Request(url, headers=headers)
    # No explicit jar this time: the handler falls back to one it creates
    # itself, reachable afterwards as ``cookieHandler.cookiejar``.
    cookieHandler = urllib.request.HTTPCookieProcessor()
    opener = urllib.request.build_opener(cookieHandler)
    # The ``with`` block closes the response (and its connection) even if
    # reading or decoding raises.
    with opener.open(req) as conn:
        # Read and decode only to confirm the fetch works; the text is unused.
        unzip(conn.read()).decode("utf-8")
    print("ok")
    # Prints the handler's default object repr, not the cookies themselves.
    print(cookieHandler)


if __name__ == '__main__':
    cookieTest()

打印结果为

ok
<urllib.request.HTTPCookieProcessor object at 0x0231B450>
[Finished in 0.4s]

首先程序是能运行的，那 cookie 到底被保存下来了吗？如何检验？可以再次请求同一个 URL 来查看。保存在哪里？从源码看，应该是 HTTPCookieProcessor 自己创建的 CookieJar（即 cookieHandler.cookiejar）中。

#!/usr/bin/env python3
#coding=utf-8
import gzip
import re
import urllib.request
import http.cookiejar

def unzip(data):
    """Return *data* gunzipped, or unchanged when it is not valid gzip.

    Args:
        data: Raw response body as ``bytes``.

    Returns:
        The decompressed bytes when *data* is a gzip stream; otherwise
        *data* itself, untouched.
    """
    import zlib  # local import: only needed to name zlib.error below

    # Original author's question: would rebinding ``data = ...`` differ in
    # memory from binding a new name?  Either way gzip.decompress() returns a
    # new bytes object, so the result is simply returned directly here.
    try:
        return gzip.decompress(data)
    except (OSError, EOFError, zlib.error):
        # Not gzip (OSError / gzip.BadGzipFile), truncated (EOFError) or
        # corrupt (zlib.error): fall back to the raw payload.  Catching only
        # these keeps KeyboardInterrupt and programming errors visible.
        return data

def cookieTest():
    """Request http://www.zhihu.com/ twice through the same opener.

    The second request goes through the same ``HTTPCookieProcessor``, whose
    request hook adds the jar's cookies to outgoing requests, so the traffic
    can be inspected (e.g. with a proxy) to see the cookies being sent back.
    """
    url = "http://www.zhihu.com/"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Host": "www.zhihu.com",
    }
    req = urllib.request.Request(url, headers=headers)
    # No explicit jar: the handler creates its own when none is passed.
    cookieHandler = urllib.request.HTTPCookieProcessor()
    opener = urllib.request.build_opener(cookieHandler)
    # First request.  ``with`` closes the response even if decoding raises.
    with opener.open(req) as conn:
        # Read and decode only to confirm the fetch works; the text is unused.
        unzip(conn.read()).decode("utf-8")
    print("ok")
    print(cookieHandler)
    # Second request for the same URL; the body is not needed, so the
    # response is closed straight away.
    with opener.open(req):
        pass
    print("ok")
    print(cookieHandler)


if __name__ == '__main__':
    cookieTest()

用 Fiddler 观察，第二次打开 URL 时，从第一次的 response 中获得的 cookie 已经被加入到了请求的 header 中。直接使用 urllib.request.urlopen() 是没有这个功能的。
那么不 read()、不解压、不解码，还能获得 cookie 吗？注释掉读取数据的那行，依然可以。感觉是 opener 调用 handler 全自动处理的。

#!/usr/bin/env python3
#coding=utf-8
import gzip
import re
import urllib.request
import http.cookiejar

def unzip(data):
    """Return *data* gunzipped, or unchanged when it is not valid gzip.

    Args:
        data: Raw response body as ``bytes``.

    Returns:
        The decompressed bytes when *data* is a gzip stream; otherwise
        *data* itself, untouched.
    """
    import zlib  # local import: only needed to name zlib.error below

    # Original author's question: would rebinding ``data = ...`` differ in
    # memory from binding a new name?  Either way gzip.decompress() returns a
    # new bytes object, so the result is simply returned directly here.
    try:
        return gzip.decompress(data)
    except (OSError, EOFError, zlib.error):
        # Not gzip (OSError / gzip.BadGzipFile), truncated (EOFError) or
        # corrupt (zlib.error): fall back to the raw payload.  Catching only
        # these keeps KeyboardInterrupt and programming errors visible.
        return data

def cookieTest():
    """Request http://www.zhihu.com/ twice without ever reading the body.

    Demonstrates that the cookie handler does its work when the opener
    handles the request/response, independent of whether the caller reads,
    decompresses or decodes the response body.
    """
    url = "http://www.zhihu.com/"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Host": "www.zhihu.com",
    }
    req = urllib.request.Request(url, headers=headers)
    # A single handler with its own jar.  The original also built an explicit
    # CookieJar and a handler around it, but immediately overwrote that
    # handler, so the dead assignments were removed.
    cookieHandler = urllib.request.HTTPCookieProcessor()
    opener = urllib.request.build_opener(cookieHandler)
    # First request: the body is deliberately NOT read.  ``with`` still
    # closes the response and its connection.
    with opener.open(req):
        pass
    print("ok")
    print(cookieHandler)
    # Second request through the same opener, again without reading the body.
    with opener.open(req):
        pass
    print("ok")
    print(cookieHandler)


if __name__ == '__main__':
    cookieTest()