代码实现:
# Fetch a page over a raw TCP socket, print the response body, then print the
# href of every anchor tag found in that body.
import re
import socket

HOST = 'www.baidu.com'
PORT = 80
RECV_SIZE = 65536
# Blank line that terminates the HTTP header section.
HEADER_END = '\r\n\r\n'
# Captures the href value of each <a href="...">...</a> element; DOTALL lets
# an anchor span several lines, IGNORECASE accepts <A HREF=...>.
LINK_PATTERN = re.compile(r'<a href="(.*?)".*?>.*?</a>', re.I | re.M | re.S)


def fetch_raw(host, port=PORT):
    """Send a minimal ``GET /`` request to *host* and return the reply chunks.

    Args:
        host: Server name to connect to; also sent as the ``Host`` header.
        port: TCP port to connect to.

    Returns:
        A list of ``bytes`` objects in the order they were received.

    Raises:
        OSError: If connecting, sending or receiving fails.
    """
    request = 'GET / HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n' % host
    chunks = []
    # The context manager closes the socket even when an error occurs.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect((host, port))
        # sendall() keeps writing until the whole request has been sent;
        # a plain send() may transmit only part of it.
        sock.sendall(request.encode())
        # "Connection: close" asks the server to close the connection when
        # it is done, which recv() reports as an empty bytes object.
        while True:
            data = sock.recv(RECV_SIZE)
            if not data:
                break
            chunks.append(data)
    return chunks


def decode_response(chunks):
    """Join received *chunks* and decode them as UTF-8 text.

    Decoding happens once over the joined bytes so that a multi-byte
    character split across two chunks is still decoded correctly. Bytes
    that are not valid UTF-8 become U+FFFD instead of being dropped.
    """
    return b''.join(chunks).decode('UTF-8', errors='replace')


def response_body(text):
    """Return the part of *text* that follows the HTTP header section.

    The split happens at the first blank line only, so blank lines inside
    the body are preserved. If *text* has no blank line at all, it is
    returned unchanged.
    """
    _head, separator, body = text.partition(HEADER_END)
    return body if separator else text


def extract_links(body):
    """Return the href values of all anchor elements found in *body*."""
    return LINK_PATTERN.findall(body)


def main():
    """Fetch the page at HOST, print its body, then print each link."""
    # NOTE(review): the body is printed as received; if the server answers
    # with chunked transfer encoding the chunk-size lines are not removed.
    body = response_body(decode_response(fetch_raw(HOST)))
    print(body)
    for link in extract_links(body):
        print(link)


if __name__ == '__main__':
    main()
结果:
<html>
...
</html>
//www.baidu.com/gaoji/preferences.html
https://passport.baidu.com/v2/?reg&