利用 socket 手动爬取豆瓣 Top250 数据

 
整理一下之前的学习内容。以前总是调用 requests 库爬取网页内容，不过我们也可以使用 socket 库自己手动实现一个爬虫。本文只获取原始响应，最终结果没有进行解析，下一次再进行补充。
#coding:utf-8
import socket
import ssl

# Parse the request URL
def parsed_url(url):
    """
    Split a URL into protocol, host, port and path.

    A URL without an ``http://`` or ``https://`` prefix is treated as http.

    :param url: URL string, e.g. ``'https://example.com:8443/a?b=1'``
    :return: tuple ``(protocol, host, port, path)``; ``port`` is always an int
    :raises ValueError: if an explicit port is not a decimal number
    """
    # Detect the protocol; anything without a known prefix defaults to http.
    protocol = 'http'
    if url.startswith('http://'):
        u = url[len('http://'):]
    elif url.startswith('https://'):
        protocol = 'https'
        u = url[len('https://'):]
    else:
        u = url

    # Everything before the first '/' is the host (possibly with a port);
    # the rest, including the '/', is the path.
    i = u.find('/')
    if i != -1:
        host = u[:i]
        path = u[i:]
    else:
        host = u
        path = '/'

    # Default port for each supported protocol.
    port_dict = {
        'http': 80,
        'https': 443,
    }

    if ':' in host:
        host, port_text = host.split(':', 1)
        # socket.connect() needs an integer port, so convert here; an empty
        # port ("host:") falls back to the protocol default.
        port = int(port_text) if port_text else port_dict[protocol]
    else:
        port = port_dict[protocol]

    return protocol, host, port, path

# Pick the socket type that matches the protocol
def socket_by_protocol(protocol, host=None):
    """
    Return an unconnected socket suitable for the given protocol.

    :param protocol: ``'http'`` for a plain TCP socket; any other value
        yields a TLS-wrapped socket
    :param host: optional server host name. When given, it is sent as SNI
        and the server certificate is verified against it. When omitted,
        the TLS socket performs no certificate verification.
    :return: ``socket.socket`` for http, ``ssl.SSLSocket`` otherwise
    """
    sock = socket.socket()
    if protocol == 'http':
        return sock

    # ssl.wrap_socket() was removed in Python 3.12; SSLContext.wrap_socket()
    # is the supported replacement.
    context = ssl.create_default_context()
    if host is None:
        # Without a host name there is nothing to verify the certificate
        # against. check_hostname must be cleared before verify_mode can be
        # set to CERT_NONE.
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
    return context.wrap_socket(sock, server_hostname=host)

# Receive the response data
def response_by_socket(s, buffer_size=1024):
    """
    Read from a connected socket until the peer closes the connection.

    :param s: object with a ``recv(size)`` method that returns bytes
    :param buffer_size: maximum number of bytes requested per ``recv`` call
    :return: all received bytes, concatenated
    """
    chunks = []
    while True:
        r = s.recv(buffer_size)
        # recv() may return fewer bytes than requested while more data is
        # still on the way; only an empty result means end of stream.
        if not r:
            break
        chunks.append(r)
    return b''.join(chunks)


# Parse the response data
def parsed_response(r):
    """
    Split a decoded HTTP response into status code, headers and body.

    The body is also printed (UTF-8 encoded) to standard output.

    :param r: full response text: status line, header lines, a blank line,
        then the body
    :return: tuple ``(status_code, headers, body)`` where ``status_code`` is
        an int, ``headers`` is a dict of header name to value (a repeated
        header keeps its last value) and ``body`` is a str
    :raises ValueError: if the status line has no numeric status code
    """
    # partition() leaves the body empty when the response has no blank line,
    # instead of failing on tuple unpacking.
    head, _, body = r.partition('\r\n\r\n')
    print('body', body.encode('utf-8'))

    lines = head.split('\r\n')
    status_parts = lines[0].split()
    if len(status_parts) < 2 or not status_parts[1].isdigit():
        raise ValueError('malformed HTTP status line: {!r}'.format(lines[0]))
    status_code = int(status_parts[1])

    headers = {}
    for line in lines[1:]:
        # Split on the first colon only: header values may themselves
        # contain ': ', and the space after the colon is optional.
        name, sep, value = line.partition(':')
        if not sep:
            continue  # not a "name: value" line
        headers[name.strip()] = value.strip()

    return status_code, headers, body

# Fetch a URL
def get(url, max_redirects=10):
    """
    Send an HTTP GET request over a raw socket and return the parsed response.

    Redirect responses that carry a Location header are followed, up to
    ``max_redirects`` times. A redirect without a Location header is
    returned to the caller as-is.

    :param url: URL to fetch; parsed with ``parsed_url``
    :param max_redirects: number of redirects that may still be followed
    :return: tuple ``(status_code, headers, body)`` of the final response
    :raises RuntimeError: if the redirect limit is exhausted
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    protocol, host, port, path = parsed_url(url)
    # connect() requires an integer port; coerce defensively.
    port = int(port)

    # The Host header carries the port only when it is not the default.
    default_ports = {'http': 80, 'https': 443}
    if port == default_ports.get(protocol):
        host_header = host
    else:
        host_header = '{}:{}'.format(host, port)

    s = socket_by_protocol(protocol)
    try:
        s.connect((host, port))
        request = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(path, host_header)
        # sendall() keeps writing until the whole request is sent;
        # send() may transmit only part of it.
        s.sendall(request.encode('utf-8'))
        response = response_by_socket(s)
    finally:
        # Release the socket even when connecting or reading fails.
        s.close()

    # Replace undecodable bytes rather than failing on a non-UTF-8 body.
    r = response.decode('utf-8', errors='replace')
    status_code, headers, body = parsed_response(r)

    if status_code in (301, 302, 303, 307, 308):
        # Header names are case-insensitive, so match "location" in any case.
        location = next(
            (v for k, v in headers.items() if k.lower() == 'location'),
            None,
        )
        if location:
            if max_redirects <= 0:
                raise RuntimeError('too many redirects while fetching {}'.format(url))
            # Location may be relative; resolve it against the current URL.
            base = '{}://{}{}'.format(protocol, host_header, path)
            return get(urljoin(base, location), max_redirects - 1)

    return status_code, headers, body

def main():
    """Fetch the Douban Top 250 page and print its status code and headers."""
    target = 'http://movie.douban.com/top250'
    code, response_headers, _ = get(target)
    print(code, response_headers)


# Run the fetch only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值