python3 urllib.request抓资源的方式

最新推荐文章于 2024-04-23 16:42:17 发布

ReyMiX

最新推荐文章于 2024-04-23 16:42:17 发布

阅读量5.5k

点赞数

在 Python 3.x 中，要使用 urllib.request 模块来抓取网络资源（它取代了 Python 2 中的 urllib2）。

最简单的方式：

 
        # coding=utf-8
        # Simplest case: fetch a URL and print its body as text.
        import urllib.request

        # The `with` block closes the response even if read() raises.
        with urllib.request.urlopen('http://python.org/') as response:
            buff = response.read()

        # Display: read() returns raw bytes, so decode them before printing.
        html = buff.decode("utf8")
        print(html)

使用Request的方式：

 
        # coding=utf-8
        # Same fetch, but through an explicit Request object, which can later
        # carry POST data or extra headers.
        import urllib.request

        req = urllib.request.Request('http://www.voidspace.org.uk')

        # The `with` block closes the response even if read() raises.
        with urllib.request.urlopen(req) as response:
            buff = response.read()

        # Display: read() returns raw bytes, so decode them before printing.
        the_page = buff.decode("utf8")
        print(the_page)

这种方式同样可以用来处理其他URL，例如FTP：

 
        # coding=utf-8
        # The Request/urlopen interface is scheme-agnostic; here it is used
        # with an ftp:// URL instead of http://.
        import urllib.request

        req = urllib.request.Request('ftp://ftp.pku.edu.cn/')

        # The `with` block closes the response even if read() raises.
        with urllib.request.urlopen(req) as response:
            buff = response.read()

        # Display: read() returns raw bytes, so decode them before printing.
        the_page = buff.decode("utf8")
        print(the_page)

使用POST请求：

 
        # Send form fields to a CGI script with an HTTP POST.
        import urllib.parse
        import urllib.request

        url = 'http://www.someserver.com/cgi-bin/register.cgi'
        values = {
            'name': 'Michael Foord',
            'location': 'Northampton',
            'language': 'Python',
        }

        # urlencode() returns str, but Request requires the POST body as bytes;
        # passing the str through unchanged raises TypeError on Python 3.
        data = urllib.parse.urlencode(values).encode('ascii')

        # Supplying `data` is what turns the request into a POST.
        req = urllib.request.Request(url, data)
        with urllib.request.urlopen(req) as response:
            the_page = response.read()

使用GET请求：

 
        # Pass parameters with an HTTP GET by appending an encoded query string.
        import urllib.request
        import urllib.parse

        data = {}
        data['name'] = 'Somebody Here'
        data['location'] = 'Northampton'
        data['language'] = 'Python'

        url_values = urllib.parse.urlencode(data)
        print(url_values)
        # Prints: name=Somebody+Here&location=Northampton&language=Python
        # (pairs follow dict insertion order on Python 3.7+; older versions
        # may emit them in a different order).

        url = 'http://www.example.com/example.cgi'
        full_url = url + '?' + url_values

        # The function is urlopen(); urllib.request has no open() attribute.
        # NOTE: `data` is rebound from the parameter dict to the response
        # object here; call data.close() once the body has been read.
        data = urllib.request.urlopen(full_url)
       

添加header：

 
        # POST form fields while sending a custom User-Agent header.
        import urllib.parse
        import urllib.request

        url = 'http://www.someserver.com/cgi-bin/register.cgi'
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        values = {
            'name': 'Michael Foord',
            'location': 'Northampton',
            'language': 'Python',
        }
        # Sent in place of urllib's default User-Agent value.
        headers = {'User-Agent': user_agent}

        # urlencode() returns str, but Request requires the POST body as bytes;
        # passing the str through unchanged raises TypeError on Python 3.
        data = urllib.parse.urlencode(values).encode('ascii')

        # Third positional argument of Request is the headers mapping.
        req = urllib.request.Request(url, data, headers)
        with urllib.request.urlopen(req) as response:
            the_page = response.read()

错误处理：

1

2

3

4

 
        # Report why a request failed instead of letting the exception escape.
        import urllib.error
        import urllib.request

        req = urllib.request.Request('http://www.pretend_server.org')
        try:
            response = urllib.request.urlopen(req)
        except urllib.error.URLError as e:
            # URLError.reason describes the failure.
            print(e.reason)
        else:
            # The request succeeded; release the connection.
            response.close()

返回的 HTTP 状态码（含错误代码）对照表：

 
        # Table mapping response codes to messages; entries have the
        # form {code: (shortmessage, longmessage)}.
        responses = {
            # 1xx
            100: ('Continue', 'Request received, please continue'),
            101: ('Switching Protocols',
                  'Switching to new protocol; obey Upgrade header'),

            # 2xx
            200: ('OK', 'Request fulfilled, document follows'),
            201: ('Created', 'Document created, URL follows'),
            202: ('Accepted',
                  'Request accepted, processing continues off-line'),
            203: ('Non-Authoritative Information', 'Request fulfilled from cache'),
            204: ('No Content', 'Request fulfilled, nothing follows'),
            205: ('Reset Content', 'Clear input form for further input.'),
            206: ('Partial Content', 'Partial content follows.'),

            # 3xx
            300: ('Multiple Choices',
                  'Object has several resources -- see URI list'),
            301: ('Moved Permanently', 'Object moved permanently -- see URI list'),
            302: ('Found', 'Object moved temporarily -- see URI list'),
            303: ('See Other', 'Object moved -- see Method and URL list'),
            304: ('Not Modified',
                  'Document has not changed since given time'),
            305: ('Use Proxy',
                  'You must use proxy specified in Location to access this '
                  'resource.'),
            307: ('Temporary Redirect',
                  'Object moved temporarily -- see URI list'),

            # 4xx
            400: ('Bad Request',
                  'Bad request syntax or unsupported method'),
            401: ('Unauthorized',
                  'No permission -- see authorization schemes'),
            402: ('Payment Required',
                  'No payment -- see charging schemes'),
            403: ('Forbidden',
                  'Request forbidden -- authorization will not help'),
            404: ('Not Found', 'Nothing matches the given URI'),
            405: ('Method Not Allowed',
                  'Specified method is invalid for this server.'),
            406: ('Not Acceptable', 'URI not available in preferred format.'),
            407: ('Proxy Authentication Required', 'You must authenticate with '
                  'this proxy before proceeding.'),
            408: ('Request Timeout', 'Request timed out; try again later.'),
            409: ('Conflict', 'Request conflict.'),
            410: ('Gone',
                  'URI no longer exists and has been permanently removed.'),
            411: ('Length Required', 'Client must specify Content-Length.'),
            412: ('Precondition Failed', 'Precondition in headers is false.'),
            413: ('Request Entity Too Large', 'Entity is too large.'),
            414: ('Request-URI Too Long', 'URI is too long.'),
            415: ('Unsupported Media Type', 'Entity body in unsupported format.'),
            416: ('Requested Range Not Satisfiable',
                  'Cannot satisfy request range.'),
            417: ('Expectation Failed',
                  'Expect condition could not be satisfied.'),

            # 5xx
            500: ('Internal Server Error', 'Server got itself in trouble'),
            501: ('Not Implemented',
                  'Server does not support this operation'),
            502: ('Bad Gateway', 'Invalid responses from another server/proxy.'),
            503: ('Service Unavailable',
                  'The server cannot process the request due to a high load'),
            504: ('Gateway Timeout',
                  'The gateway server did not receive a timely response'),
            505: ('HTTP Version Not Supported', 'Cannot fulfill request.'),
        }

ReyMiX

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
python3 urllib.request抓资源的方式

Python 3.X 要使用 urllib.request 来抓取网络资源。最简单的方式：#coding=utf-8 import urllib.request response = urllib.request.urlopen('http://p…
复制链接

扫一扫