A recap of what I've learned so far: I used to call the requests library to fetch page content, but we can also implement a crawler by hand with the socket library. The final result is not parsed yet; I'll add that in a later post.
#coding:utf-8
import socket
import ssl
# Parse the request URL into protocol, host, port and path
def parsed_url(url):
    # Determine the protocol from the scheme prefix
    protocol = 'http'
    if url[:7] == 'http://':
        u = url[7:]
    elif url[:8] == 'https://':
        protocol = 'https'
        u = url[8:]
    else:
        u = url
    # Split host from path
    i = u.find('/')
    if i != -1:
        host = u[:i]
        path = u[i:]
    else:
        host = u
        path = '/'
    # Determine the port: an explicit ':port' wins, otherwise the protocol default
    port_dict = {
        'http': 80,
        'https': 443,
    }
    if ':' in host:
        host, port = host.split(':', 1)
        port = int(port)  # socket.connect() expects an int, not a string
    else:
        port = port_dict[protocol]
    return protocol, host, port, path
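# A quick sanity check of parsed_url (expected values shown as comments;
# the URLs here are just made-up examples):
#   parsed_url('https://example.com:8443/a/b') -> ('https', 'example.com', 8443, '/a/b')
#   parsed_url('movie.douban.com')             -> ('http', 'movie.douban.com', 80, '/')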
# Choose a plain or TLS-wrapped socket depending on the protocol
def socket_by_protocol(protocol, host):
    '''
    Return a socket instance for the given protocol.
    For https, wrap the socket with TLS so we can talk to port 443.
    :param protocol: 'http' or 'https'
    :param host: server hostname, needed for certificate checks and SNI
    :return:
    '''
    if protocol == 'http':
        s = socket.socket()
    else:
        context = ssl.create_default_context()
        s = context.wrap_socket(socket.socket(), server_hostname=host)
    return s
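# Design note: ssl.create_default_context() both verifies the server's
# certificate and sends the hostname via SNI, which many HTTPS servers
# require before they will answer; the older ssl.wrap_socket() helper
# was removed in Python 3.12.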
# Read the full response from the socket
def response_by_socket(s):
    buffer_size = 1024
    response = b''
    while True:
        r = s.recv(buffer_size)
        if not r:
            break
        response += r
    return response
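# Why loop until recv() returns b'' rather than checking len(r) < buffer_size:
# recv() may legally hand back fewer bytes than requested even when more data
# is still in flight, so a short read does not mean the response is complete.
# Because the request below sends 'Connection: close', the server closes the
# connection once the body is done and recv() then returns b'', which is a
# reliable end-of-response signal.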
# Parse the raw response into status code, headers and body
def parsed_response(r):
    headers, body = r.split('\r\n\r\n', 1)
    h = headers.split('\r\n')
    # The status line looks like 'HTTP/1.1 200 OK'; the code is the 2nd field
    status_code = int(h[0].split()[1])
    headers = {}
    for line in h[1:]:
        # Split on the first ': ' only; header values may themselves contain ':'
        k, v = line.split(': ', 1)
        headers[k] = v
    return status_code, headers, body
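# Example (a made-up minimal response, shown as a comment):
#   r = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n<html></html>'
#   parsed_response(r) -> (200, {'Content-Type': 'text/html'}, '<html></html>')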
# Fetch a URL and return (status_code, headers, body), following redirects
def get(url):
    protocol, host, port, path = parsed_url(url)
    s = socket_by_protocol(protocol, host)
    s.connect((host, port))
    # 'Connection: close' tells the server to close the socket after the
    # response, which is what response_by_socket() relies on to stop reading
    request = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(path, host)
    s.sendall(request.encode('utf-8'))
    response = response_by_socket(s)
    r = response.decode('utf-8')
    status_code, headers, body = parsed_response(r)
    # Follow 301/302 redirects by requesting the new Location
    if status_code in [301, 302]:
        url = headers['Location']
        return get(url)
    return status_code, headers, body
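# Note: get() follows redirects by recursing, so a redirect loop would only
# stop at Python's recursion limit; that is fine for this exercise. The
# http:// URL used in main() below will typically be answered with a 301
# pointing at the https:// address, which this handles.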
def main():
    url = 'http://movie.douban.com/top250'
    status_code, headers, body = get(url)
    print(status_code, headers)

if __name__ == '__main__':
    main()