# coding: utf-8
"""Minimal HTTP/HTTPS GET client built directly on sockets.

The module parses a URL, opens a plain or TLS socket, sends a fixed GET
request, reads the whole response and splits it into status code, headers
and body.  301/302 redirects are followed.  A few self-tests live at the
bottom of the file; ``test_get`` needs network access.
"""
import socket
import ssl
from urllib.parse import urljoin

# Test helpers are deliberately left out so that ``from <module> import *``
# does not pull the network-dependent tests into the importing namespace.
__all__ = [
    'parsed_url',
    'socket_by_protocol',
    'response_by_socket',
    'parsed_response',
    'get',
    'main',
]

# Default port for each supported protocol.
DEFAULT_PORTS = {
    'http': 80,
    'https': 443,
}

# Upper bound on consecutive redirects followed by get().
MAX_REDIRECTS = 10


def parsed_url(url):
    """Split *url* into ``(protocol, host, port, path)``.

    The protocol defaults to ``'http'`` when the URL has no ``http://`` or
    ``https://`` prefix, the path defaults to ``'/'`` and the port defaults
    to 80 (http) or 443 (https) unless ``host:port`` is given.

    Raises:
        ValueError: if an explicit port is not an integer.
    """
    protocol = 'http'
    rest = url
    # Strip only the leading scheme.  Splitting on '://' would truncate
    # URLs that contain another '://' later on (e.g. in a query string).
    if url.startswith('https://'):
        protocol = 'https'
        rest = url[len('https://'):]
    elif url.startswith('http://'):
        rest = url[len('http://'):]

    # Everything from the first '/' onwards is the path.
    slash = rest.find('/')
    if slash == -1:
        host, path = rest, '/'
    else:
        host, path = rest[:slash], rest[slash:]

    port = DEFAULT_PORTS[protocol]
    if ':' in host:
        host, _, port_text = host.partition(':')
        # 'host:' with an empty port keeps the protocol default.
        if port_text:
            port = int(port_text)

    return protocol, host, port, path


def socket_by_protocol(protocol, host=None):
    """Return an unconnected socket suitable for *protocol*.

    ``'http'`` yields a plain TCP socket; any other protocol yields a TLS
    socket.

    Args:
        protocol: ``'http'`` or ``'https'``.
        host: server name used for SNI and certificate verification.  When
            omitted, the TLS socket performs no certificate verification,
            which is what the removed ``ssl.wrap_socket`` call did.
    """
    sock = socket.socket()
    if protocol == 'http':
        return sock

    # ssl.wrap_socket() was removed in Python 3.12; use an SSLContext.
    context = ssl.create_default_context()
    if host is None:
        # Legacy call style without a host name: hostname checking is
        # impossible, so fall back to an unverified connection.
        # check_hostname must be disabled before verify_mode is relaxed.
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
    return context.wrap_socket(sock, server_hostname=host)


def response_by_socket(s):
    """Read from socket *s* until the peer closes and return all bytes."""
    buffer_size = 1024
    chunks = []
    # recv() returns b'' once the peer has closed the connection.
    while True:
        chunk = s.recv(buffer_size)
        if not chunk:
            break
        chunks.append(chunk)
    return b''.join(chunks)


def parsed_response(r):
    """Parse a raw HTTP response string.

    Returns:
        ``(status_code, headers, body)`` where *status_code* is an int,
        *headers* is a dict of header name to value (names keep the case
        sent by the server) and *body* is everything after the first blank
        line (empty when there is none).

    Raises:
        ValueError: if the status line is malformed.
    """
    head, _, body = r.partition('\r\n\r\n')
    lines = head.split('\r\n')

    status_parts = lines[0].split()
    if len(status_parts) < 2:
        raise ValueError('malformed HTTP status line: {!r}'.format(lines[0]))
    status_code = int(status_parts[1])

    headers = {}
    for line in lines[1:]:
        # Split on the first ':' only: the space after it is optional and
        # the value itself may contain further colons.
        name, sep, value = line.partition(':')
        if not sep:
            continue
        headers[name.strip()] = value.strip()
    return status_code, headers, body


def _header_value(headers, name):
    """Return the value of header *name*, ignoring case, or None."""
    wanted = name.lower()
    for key, value in headers.items():
        if key.lower() == wanted:
            return value
    return None


def get(url, max_redirects=MAX_REDIRECTS):
    """Send a GET request to *url* and return ``(status_code, headers, body)``.

    301/302 responses that carry a ``Location`` header are followed, at most
    *max_redirects* times.  The body is returned as received; chunked
    transfer encoding and content encodings are not decoded.

    Raises:
        RuntimeError: if more than *max_redirects* redirects are needed.
    """
    protocol, host, port, path = parsed_url(url)
    print('get {} {} {} {}'.format(protocol, host, port, path))

    # The Host header must carry the port when it is not the default one.
    if port == DEFAULT_PORTS.get(protocol):
        host_header = host
    else:
        host_header = '{}:{}'.format(host, port)
    request = 'GET {} HTTP/1.1\r\nHost: {}\r\nCookie:user=assafasdfsadf\r\nConnection:close\r\n\r\n'.format(path, host_header)
    encoding = 'utf-8'

    s = socket_by_protocol(protocol, host)
    try:
        s.connect((host, port))
        # sendall() keeps sending until the whole request is written.
        s.sendall(request.encode(encoding))
        response = response_by_socket(s)
    finally:
        s.close()
    print('get response, ', response)

    # Bodies are not guaranteed to be UTF-8; never fail on decoding.
    r = response.decode(encoding, errors='replace')
    status_code, headers, body = parsed_response(r)

    if status_code in (301, 302):
        location = _header_value(headers, 'Location')
        if location is not None:
            if max_redirects <= 0:
                raise RuntimeError(
                    'too many redirects, last URL: {}'.format(url))
            # Resolve relative redirect targets against the current URL.
            base = '{}://{}:{}{}'.format(protocol, host, port, path)
            return get(urljoin(base, location), max_redirects - 1)
    return status_code, headers, body


def main():
    """Fetch a sample page and print the parsed response."""
    url = 'http://movie.douban.com/top250'
    status_code, headers, body = get(url)
    print('main status_code', status_code)
    print('main headers ({})'.format(headers))
    print('main body', body)


# Test functions below.
def test_parsed_url():
    """Check parsed_url against a table of URLs and expected tuples."""
    http = 'http'
    https = 'https'
    host = 'g.cn'
    path = '/'
    test_items = [
        ('http://g.cn', (http, host, 80, path)),
        ('http://g.cn/', (http, host, 80, path)),
        ('http://g.cn:90', (http, host, 90, path)),
        ('http://g.cn:90/', (http, host, 90, path)),
        ('https://g.cn', (https, host, 443, path)),
        ('https://g.cn:233/', (https, host, 233, path)),
    ]
    for url, expected in test_items:
        u = parsed_url(url)
        e = "parsed_url ERROR, ({}) ({}) ({})".format(url, u, expected)
        assert u == expected, e


def test_parsed_response():
    """Check that a raw response is split into status, headers and body."""
    response = 'HTTP/1.1 301 Moved Permanently\r\n' \
        'Content-Type: text/html\r\n' \
        'Location: https://movie.douban.com/top250\r\n' \
        'Content-Length: 178\r\n\r\n' \
        'test body'
    status_code, header, body = parsed_response(response)
    assert status_code == 301
    assert len(header) == 3
    assert body == 'test body'


def test_get():
    """Check that both HTTP and HTTPS requests complete (needs network)."""
    urls = [
        'http://movie.douban.com/top250',
        'https://movie.douban.com/top250',
    ]
    # Simply call get(); any failure surfaces as an exception.
    for u in urls:
        get(u)


def test():
    """Run all self-tests."""
    test_parsed_url()
    test_get()
    test_parsed_response()


if __name__ == '__main__':
    test()
    main()
以上程序中所用到的知识点如下:
- https 知识点
# https 请求的默认端口是 443
# https 的 socket 连接需要 import ssl
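# 使用 s = ssl.wrap_socket(socket.socket()) 来初始化(注意:ssl.wrap_socket 自 Python 3.7 起已弃用,并在 Python 3.12 中被移除;新版本应使用 ssl.create_default_context().wrap_socket(sock, server_hostname=host))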
- http 和 https 的区别
# 默认端口的区别
# http 的默认端口为 80
# https 的默认端口为 443
# 创建 socket 实例的区别
# http : s = socket.socket()
# https : s = ssl.wrap_socket(socket.socket()),即在普通 socket 外再包一层 TLS 加密
- assert 语句
用法:assert 布尔表达式, 异常字符串
# 如果布尔表达式为真,则断言成功,程序继续执行
# 如果布尔表达式为假,则抛出 AssertionError(未被捕获时程序中断),并输出异常字符串
- HTTP 重定向状态码
# 301, 302 都为重定向状态码
# 重定向状态会在 HTTP 头的 Location 部分告诉你应该转向的 URL
# 即:若遇到重定向状态码,就请求 Location 指向的新地址,并返回该次请求的响应