urllib基本使用
##########################################################################
简单爬取网页信息
import urllib.request

# Page to fetch.
url = 'http://www.baidu.com'
# Send the request; urlopen() needs the target URL and returns a response object.
response = urllib.request.urlopen(url)
# Read the whole response body. read() returns raw bytes, not str.
content = response.read()
# Print the fetched data.
print(content)
#####################
一种类型,六个方法
# One type: the object returned by urlopen() is an HTTPResponse.
print(type(response))
# Six methods:
#   read()       - read the response body as bytes
#   readline()   - read a single line
#   readlines()  - read all remaining lines into a list of bytes objects
#   getcode(), geturl(), getheaders() - shown below

# HTTP status code; 200 means the request succeeded.
print(response.getcode())
# URL that was actually retrieved. geturl is a method, so it must be called.
print(response.geturl())
# Response headers as a list of (name, value) tuples.
print(response.getheaders())
#############################################################################
下载网页,图片,视频
import urllib.request

# Download a web page (example kept disabled, as in the original notes).
# url_page = 'http://www.baidu.com'
# urllib.request.urlretrieve(url_page, 'baidu.html')

# Download an image (example kept disabled, as in the original notes).
# url_img = 'https://tse1-mm.cn.bing.net/th/id/OIP-C.Zte3ljd4g6kqrWWyg-8fhAHaEo?w=305&h=190&c=7&r=0&o=5&dpr=1.3&pid=1.7'
# urllib.request.urlretrieve(url_img, 'cat.jpg')

# Download a video: urlretrieve(url, filename) saves whatever the URL serves
# to the given local file.
# NOTE(review): this URL looks like the video's web page rather than a direct
# media file, so 'bli.mp4' will presumably contain HTML - confirm and replace
# it with the real media URL if an actual video file is wanted.
url_video = 'https://www.bilibili.com/video/BV11V411G7qc/?spm_id_from=333.1007.tianma.4-2-13.click'
urllib.request.urlretrieve(url_video, 'bli.mp4')
#############################################################################反爬 UA
User-Agent:请求头中用来标识客户端(浏览器)的字段,服务器常用它来识别爬虫
做法:把 User-Agent 放进字典 headers,再用 Request 定制请求对象
import urllib.request

url = 'https://baidu.com'

# Customize the request: send a browser-like User-Agent header.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31',
}

# Pass headers by keyword: Request's second positional parameter is `data`,
# so a positional headers dict would be taken as the request body.
request = urllib.request.Request(url=url, headers=headers)

# The with-block closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    # read() returns a single bytes object, which can be decoded to str.
    # readlines().decode('utf-8') fails because readlines() returns a list of
    # bytes objects and a list has no decode() method; decode each line instead.
    content = response.read().decode('utf-8')

print(content)