Python3 urllib 笔记

urllib

1. request

request存在于urllib库中,导入方法:
from urllib import request

1.1 urlopen

urlopen的方法以及参数如下:

def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None)

用于打开一个url

  • url: 打开的url
  • data: POST提交的请求体数据(bytes类型); 不为None时使用POST方式请求
  • timeout: 超时时间

用法如下:

def demo():
    """Open ``_url`` with a 3-second timeout and print its status and headers.

    If the server answers with an HTTP error, the error is printed and the
    function returns early, because there is no response object to inspect.
    """
    try:
        s = request.urlopen(_url, timeout=3)
    except request.HTTPError as e:
        # Without this early return the code below would raise
        # UnboundLocalError, since ``s`` was never assigned.
        print(e)
        return
    # The context manager closes the connection even if reading fails.
    with s:
        msg = s.info()
        data = s.read().decode('utf-8')  # body text; read only to show the API
        # print_list(dir(s))     # print all functions and properties
        print(s.status)
        print(msg)

urlopen返回的response对象的方法:
- info(): 返回一个HTTPMessage对象, 其中包含response headers信息
- read(size)
- readline()
- readlines()
- close()
- getcode(): 获得应答码

1.2 urlretrieve

用于下载一个远程文件

def urlretrieve(url, filename=None, reporthook=None, data=None)
  • url: 地址
  • filename: 保存文件名
  • reporthook: 下载状态报告
  • data: POST提交的请求体数据(bytes类型)
  • 返回(filename, HTTPMessage)

reporthook
def progress(blk, blk_size, total_size)
- blk: 传输的块数
- blk_size: 块大小
- total_size: 数据总大小(服务器未返回大小时可能为-1)

测试代码

def progress(blk, blk_size, total_size):
    """Print download progress; usable as a ``urlretrieve`` reporthook.

    Args:
        blk: number of blocks transferred so far.
        blk_size: size of one block in bytes.
        total_size: total size of the data in bytes; a value <= 0 is
            treated as "size unknown".
    """
    received = blk * blk_size
    if total_size <= 0:
        # No usable total: a percentage cannot be computed (and dividing by
        # zero would raise), so report the byte count only.
        print('%d/unknown' % received)
        return
    # The final block is usually only partly filled, so clamp to the total
    # to avoid reporting more than 100%.
    received = min(received, total_size)
    print('%d/%d - %.02f%%' % (received, total_size, float(received) * 100 / total_size))

def retrieve():
    """Download ``_url`` into ``index.html``, reporting through ``progress``."""
    destination = 'index.html'
    # The (filename, headers) tuple returned by urlretrieve is not used here.
    request.urlretrieve(_url, filename=destination, reporthook=progress)

1.3 Request

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):

可以使用Request添加或修改http头

def build_opener_demo():
    """Build a POST ``Request`` carrying form data and a custom User-Agent.

    The request is only constructed here, not sent.

    Returns:
        The prepared ``request.Request`` object.
    """
    # POST: supplying ``data`` turns the request into a POST.
    # Placeholder credentials -- never commit a real account to source.
    data = {'username': 'your_username', 'password': 'your_password'}
    header = {'User-Agent': 'Mozilla/5.0'}
    # urlencode() returns str but Request needs bytes, hence the encode.
    body = parse.urlencode(data).encode('utf-8')
    req = request.Request('http://www.douban.com', data=body, headers=header)
    # Further headers can be attached with req.add_header(name, value).
    return req

1.4 build_opener

"""
Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used."""
def build_opener(*handlers)
  • 参数为Handler列表
    • BaseHandler为父类
    • HTTPHandler
    • HTTPSHandler
    • HTTPCookieProcessor
  • 返回OpenerDirector
def build_opener_demo():
    """POST form data through a custom opener and print the first 500 bytes.

    The opener is built with an ``HTTPHandler`` at ``debuglevel=1`` so the
    HTTP exchange is echoed as debug output.
    """
    # POST: supplying ``data`` turns the request into a POST.
    # Placeholder credentials -- never commit a real account to source.
    data = {'username': 'your_username', 'password': 'your_password'}
    header = {'User-Agent': 'Mozilla/5.0'}
    # urlencode() returns str but Request needs bytes, hence the encode.
    body = parse.urlencode(data).encode('utf-8')
    req = request.Request('http://www.douban.com', data=body, headers=header)
    opener = request.build_opener(request.HTTPHandler(debuglevel=1))        # output debug info
    # ``with`` closes the response even if reading or decoding fails.
    with opener.open(req) as s:
        # A 500-byte cut can land inside a multi-byte UTF-8 character;
        # errors='replace' avoids a UnicodeDecodeError on the truncated tail.
        print(s.read(500).decode('utf-8', errors='replace'))

output

<!DOCTYPE HTML>
<html lang="zh-cms-Hans" class="">
<head>
<meta charset="UTF-8">
<meta name="description" content="提供图书、电影、音乐唱片的推荐、评论和价格比较,以及城市独特的文化生活。">
<meta name="keywords" content="豆瓣,广播,登陆豆瓣">
<meta property="qc:admins" content="27514711322166375" />
<meta property="wb:webmaster" content="375d4a17a4fa24c2" />
<meta name="mobile-agent" content="format=xhtml; url=http://m.douban.com">
<title>豆瓣</title>

2. parse

2.1 urlencode

  • 把字典数据转换为url编码
  • 用途
    • 对url参数进行编码
    • 对post上去的form数据进行编码
 def urlencode():
     """Round-trip a sample dict through URL encoding and print both forms."""
     fields = {'score': 100, 'name': 'basic spider', 'comment': 'very good'}
     # dict -> query string
     encoded = parse.urlencode(fields)
     print(encoded)
     # query string -> dict (every value comes back as a list of strings)
     decoded = parse.parse_qs(encoded)
     print(decoded)

output:

name=basic+spider&score=100&comment=very+good
{'name': ['basic spider'], 'score': ['100'], 'comment': ['very good']}

2.2 parse_qs

  • 把url编码转换为字典数据
def parse_test():
    """Decode the query string of a sample Baidu search URL and print it.

    ``parse_qs`` expects only the query part of a URL. Given the whole URL
    it treats everything before the first ``=`` as a key, producing a bogus
    entry such as ``'https://www.baidu.com/s?wd'``, so the query is
    extracted with ``urlsplit`` first.
    """
    url = 'https://www.baidu.com/s?wd=python%E5%AD%A6%E4%B9%A0&rsv_spt=1&rsv_iqid=0x9949316d002fc16e&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=python&inputT=1288&rsv_t=332846lzDwGW5S%2FPcPxNwi%2BGLXjhhSsLTTEoq7CDUafIYdFz%2FW7jKJOOhPpuzg1oFlve&rsv_sug3=14&rsv_sug1=14&rsv_sug7=100&rsv_pq=f4c9c20c002dabfe&bs=python'
    qs = parse.urlsplit(url).query
    result = parse.parse_qs(qs)
    print_dict(result)

output

('https://www.baidu.com/s?wd', ['python学习'])
('rsv_sug7', ['100'])
('rsv_iqid', ['0x9949316d002fc16e'])
('oq', ['python'])
('rsv_bp', ['1'])
('issp', ['1'])
('rsv_t', ['332846lzDwGW5S/PcPxNwi+GLXjhhSsLTTEoq7CDUafIYdFz/W7jKJOOhPpuzg1oFlve'])
('rsv_idx', ['2'])
('rsv_sug1', ['14'])
('rsv_spt', ['1'])
('rsv_enter', ['1'])
('ie', ['utf-8'])
('rsv_sug3', ['14'])
('inputT', ['1288'])
('tn', ['baiduhome_pg'])
('bs', ['python'])
('rsv_pq', ['f4c9c20c002dabfe'])
('f', ['8'])

需要使用的类:

  • http.cookiejar
    • from http import cookiejar
  • request.HTTPCookieProcessor

CookieJar:

class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib.request.build_opener(HTTPCookieProcessor).open(url).
    """
    def __init__(self, policy=None):

提供一个解析并且保存cookie的接口


HTTPCookieProcessor:

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):

def cookie_demo():
    """Fetch the same page twice through a cookie-aware opener.

    Prints the first response's headers, then the contents of the cookie
    jar, then the headers of a second request made with the same opener.
    """
    cookiejar_ = cookiejar.CookieJar()
    handler = request.HTTPCookieProcessor(cookiejar_)
    opener = request.build_opener(handler, request.HTTPHandler(debuglevel=1))
    # ``with`` closes each response even if printing raises.
    with opener.open('http://www.douban.com') as s:
        print_list(s.getheaders())

    print('=' * 70)
    # NOTE: ``_cookies`` is a private attribute, printed here only to show
    # the jar's raw storage; iterating over the jar is the public interface.
    print('cookiejar = ', cookiejar_._cookies)
    print('=' * 70)
    with opener.open('http://www.douban.com') as s:
        # info() gives the same headers as getheaders(), as a message object.
        print(s.info())

output

('Server', 'dae')
('Date', 'Sun, 28 Feb 2016 17:55:14 GMT')
('Content-Type', 'text/html; charset=utf-8')
('Content-Length', '94309')
('Connection', 'close')
('X-Douban-Mobileapp', '0')
('Expires', 'Sun, 1 Jan 2006 01:00:00 GMT')
('X-Douban-Newbid', 'Iit6RGmQAE4')
('Pragma', 'no-cache')
('Cache-Control', 'must-revalidate, no-cache, private')
('P3P', 'CP="IDC DSP COR ADM DEVi TAIi PSA PSD IVAi IVDi CONi HIS OUR IND CNT"')
('Set-Cookie', 'bid="Iit6RGmQAE4"; path=/; domain=.douban.com; expires=Mon, 27-Feb-2017 17:55:14 GMT')
('Set-Cookie', 'll="118172"; path=/; domain=.douban.com; expires=Mon, 27-Feb-2017 17:55:14 GMT')
('X-DAE-Node', 'sindar21c')
('X-DAE-App', 'sns')
('Strict-Transport-Security', 'max-age=0;')
======================================================================
cookiejar =  {'.douban.com': {'/': {'ll': Cookie(version=0, name='ll', value='"118172"', port=None, port_specified=False, domain='.douban.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1488218114, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), 'bid': Cookie(version=0, name='bid', value='"Iit6RGmQAE4"', port=None, port_specified=False, domain='.douban.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1488218114, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False)}}}
======================================================================
Server: dae
Date: Sun, 28 Feb 2016 17:55:14 GMT
Content-Type: text/html; charset=utf-8
Content-Length: 94309
Connection: close
X-Douban-Mobileapp: 0
Expires: Sun, 1 Jan 2006 01:00:00 GMT
Pragma: no-cache
Cache-Control: must-revalidate, no-cache, private
X-DAE-Node: sindar17b
X-DAE-App: sns
Strict-Transport-Security: max-age=0;
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值