import urllib.request
import re

# Step 1: the URL to crawl.
url = "http://www.baidu.com/"

# Steps 2-3: send the request and decode the response body (page is UTF-8).
# `with` guarantees the underlying connection is closed (the original
# never closed the response object).
with urllib.request.urlopen(url) as response:
    html = response.read().decode("utf-8")

# Step 4: extract every double-quoted absolute http:// link from the page.
# With three capture groups, findall returns 3-tuples
# (opening quote, url, closing quote); index 1 is the URL itself.
links = re.findall("""(")(http://[^"]+)(")""", html)
for link in links:
    print(link[1])
# 二、User-Agent值的获取与爬虫解码 (Section 2: obtaining the User-Agent value and decoding the response)
import urllib.request

# URL to crawl. The User-Agent value below can be copied from your own
# browser: e.g. in Chrome press F12, open the Network tab, click any entry
# under Name, and the Headers pane shows the User-Agent string.
url = "http://www.baidu.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}

# 1. Build a Request object carrying the custom headers.
request = urllib.request.Request(url, headers=headers)

# 2-3. Send the request and decode the response body (page is UTF-8).
# `with` closes the connection (the original left the response open).
with urllib.request.urlopen(request) as response:
    html = response.read().decode("utf-8")

# Echo the User-Agent header actually attached to the request.
# Request.get_header() normalizes capitalization, so "User-agent" matches.
print(request.get_header("User-agent"))