# 异常 (exception handling)
# Exception-handling demo: read a file that may be missing.
try:
    # 'with' closes the handle automatically; the original opened the
    # file and never called close(), leaking the file descriptor.
    with open('box.txt', 'r') as fp:
        fp.read()  # the result was discarded in the original too
except FileNotFoundError:  # raised when 'box.txt' does not exist
    print('系统正在升级')  # friendly hint shown instead of a traceback
# urllib
# Fetch the HTML source of baidu.com with urllib.
import urllib.request

# 1. The address we want to request.
target = "http://www.baidu.com"
# 2. Act like a browser: send the request and wait for the response.
response = urllib.request.urlopen(target)
# 3. read() returns the body as raw bytes (the b'...' form);
#    decode('utf-8') converts those bytes into a normal str.
html = response.read().decode('utf-8')
print(html)
# urllib 的一个类型和 6 个方法 (one response type and six methods)
# Inspect the object urlopen() returns: its type and its main methods.
import urllib.request

url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)

# response is an HTTPResponse instance.
print(type(response))

# Ways to consume the body:
#   response.read()      -> the whole body at once (bytes)
#   response.readline()  -> a single line
#   response.readlines() -> every line, as a list of bytes objects
content = response.readlines()

print(response.getcode())     # status code; 200 means the request succeeded
print(response.geturl())      # the URL that was actually requested
print(response.getheaders())  # response headers as a list of (name, value)
# urllib 下载 (downloading with urlretrieve)
# Download resources to disk with urllib.request.urlretrieve(url, filename):
# the first argument is the address to fetch, the second is the local
# file name to write the body into.
import urllib.request

# Web-page example (disabled):
#   urllib.request.urlretrieve('http://www.baidu.com', 'baidu.html')
# Image example (disabled):
#   urllib.request.urlretrieve('https://tse2-mm.cn.bing.net/th/id/OIP-C.E_pSoQlbS7P0vyYzf6sA4wHaNL?w=198&h=329&c=7&r=0&o=5&pid=1.7', 'lisa.jpg')

# Video example: stream the (signed) video URL into video1.mp4.
url_video = "https://apd-ugcvlive.apdcdn.tc.qq.com/om.tc.qq.com/At_jKGHyeSFe6p6KZJBTOvN6rl8uI8bQvUdlIaemzFUU/B_JxNyiJmktHRgresXhfyMeq521iaWBXTyVwX9cKvEudUj65dbs468rzACrPlcS7TG/svp_50001/szg_5892_50001_0bf23uabyaaas4ajcktckvqvdxoddtoqahca.f30.mp4?sdtfrom=v1010&guid=1adac7fbebe2cb99&vkey=C8C41D3C7B03DCFB56653F619FB811CC3E18949AD723B8688CA79454609D497024698EE196154FF0DC036F2039649F00B48B017E4C61D63AE559C4C4D28EBBE0977F97E79EF032238C474BB7453F871041C539AFCE800E62033512293D966D050AFB05909D84BC22244BEB2A19C39428B7F7063EAFC7C9E914680912C37479056FD1F990DFB3C53C559DAA9B7FE8495E93B405CCC00B5951A9F9112B4265F7EDCB485641F634E69A"
urllib.request.urlretrieve(url_video, 'video1.mp4')
# urllib 请求对象的定制 (customizing the Request object)
# Request-object customization: attach a browser-like User-Agent header
# before calling urlopen.
import urllib.request

# URL anatomy: scheme (http -> port 80, https -> port 443), host, port.
base = 'https://www.baidu.com'
# Header that makes the request look like it came from a real browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
# Request's second positional parameter is `data`, not `headers`, so the
# header mapping must be passed by keyword.
req = urllib.request.Request(base, headers=headers)
page = urllib.request.urlopen(req).read().decode('utf-8')
print(page)
# urllib 中 get 请求的 quote 方法 (percent-encoding a GET parameter with quote)
# GET with a non-ASCII query: the value of wd= must be percent-encoded
# before it can go into a URL.
import urllib.request
import urllib.parse

# Browser-like User-Agent so the server returns the real result page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
# quote() percent-encodes the UTF-8 bytes of the Chinese text, turning
# 周杰伦 into a URL-safe %XX sequence.
encoded = urllib.parse.quote("周杰伦")
search_url = 'https://www.baidu.com/s?wd=' + encoded
# Save the page once via urlretrieve (note: this call carries no custom
# headers, unlike the Request below).
urllib.request.urlretrieve(search_url, 'jielun.html')
# Fetch the same page again, this time with the User-Agent attached.
req = urllib.request.Request(search_url, headers=headers)
page_html = urllib.request.urlopen(req).read().decode('utf-8')
print(page_html)
# urllib 爬取豆瓣电影的数据 (crawling Douban movie data)
# Crawl Douban's "hot movies" JSON endpoint and save the payload to disk.
import urllib.request

# Endpoint: the first 50 "hot" movies (tag=热门, already URL-encoded).
url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0"
# Browser-like User-Agent so the server does not reject the crawler.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
# Build the customized request and send it.
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
# The body is JSON text; decode the bytes into a str.
result = response.read().decode('utf-8')
# 'with' guarantees the file is flushed and closed even if write() raises;
# the original's manual open()/close() pair leaked the handle on error.
with open('douban.json', 'w', encoding='utf-8') as fp:
    fp.write(result)
# urllib 请求肯德基前 10 页的数据 (fetching the first pages of KFC store data)
# Crawl N pages of KFC's store-search API (a POST form endpoint).
import urllib.request
import urllib.parse


def getrequest(page):
    """Build the customized POST request for one result page.

    page: page number inserted into the form data (pageIndex).
    Returns a urllib.request.Request ready to pass to urlopen.
    """
    url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
    }
    # Form fields the endpoint expects: city name, province id, paging.
    data = {
        'cname': '六安',
        'pid': '',
        'pageIndex': page,
        'pageSize': 10
    }
    # urlencode -> 'a=1&b=2' str; encode -> bytes. Supplying `data` as
    # bytes makes the Request a POST with that body.
    data = urllib.parse.urlencode(data).encode('utf-8')
    req_url = urllib.request.Request(url=url, data=data, headers=headers)
    return req_url


def getContent(url):
    """Send a prepared request and return the decoded response body."""
    response = urllib.request.urlopen(url)
    content = response.read().decode('utf-8')
    return content


def dowmloadMessage(page, content):
    """Write one page's JSON payload to douban_<page>.json.

    NOTE(review): the 'douban_' prefix looks copy-pasted from the Douban
    example — the data here is KFC's; confirm before renaming the file.
    (The function keeps its original 'dowmload' spelling so existing
    callers are unaffected.)
    """
    # 'with' flushes and closes the file; the original never called
    # close(), so the data could stay unflushed in the buffer.
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as res:
        res.write(content)


if __name__ == '__main__':
    start_page = int(input("请输入起始页码"))  # first page to fetch
    end_page = int(input("请输入结束页码"))    # last page (inclusive)
    for page in range(start_page, end_page + 1):
        request_url = getrequest(page)      # build the request object
        content = getContent(request_url)   # send it
        dowmloadMessage(page, content)      # persist the result
# urllib 爬虫异常 (crawler error handling)
# Demonstrate urllib's exception hierarchy: HTTPError (server answered
# with an error status) vs URLError (the server could not be reached).
import urllib.request
import urllib.error

# A host name that does not resolve, to trigger URLError.
url = 'http://www.baidu.com666'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
try:
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response)
except urllib.error.HTTPError:  # must come first: HTTPError subclasses URLError
    print('HTTPError,哈哈')
except urllib.error.URLError:   # DNS failure, refused connection, etc.
    print('URLError 66 ')