1.urllib.request模块
1.1版本及使用前提
版本:
python2:urllib2、urllib
python3:urllib.request(由python2中的urllib与urllib2合并产生)
ps:本文针对于python3
使用前提:
需先导入import urllib.request
1.2基本使用方法
import urllib.request
# Minimal urllib.request workflow:
#   1. urlopen(url)  -> send the request, get back a response object
#   2. read()        -> read the response body as a bytes stream
#   3. decode()      -> turn those bytes into a str
#   4. getcode()     -> HTTP status code of the response
#   5. geturl()      -> the URL actually fetched (exposes redirects)
target = 'https://blog.csdn.net/weixin_45242171'
resp = urllib.request.urlopen(target)
print(resp.read().decode('utf-8'))
print(resp.getcode())  # 200
print(resp.geturl())   # https://blog.csdn.net/weixin_45242171
print(target)          # https://blog.csdn.net/weixin_45242171
import urllib.request
# Same workflow, but with a custom User-Agent:
#   1. Request(url, headers=...) -> build a request object carrying headers
#   2. urlopen(request_obj)      -> send it, get back a response object
#   3. read().decode()           -> response body as a str
page = 'https://www.baidu.com/'
ua_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
request_obj = urllib.request.Request(page, headers=ua_headers)
resp = urllib.request.urlopen(request_obj)
print(resp.read().decode('utf-8'))
ps:
两种方式的区别:
urllib.request.urlopen()-->不支持重构User-Agent(无法自定义请求头)
urllib.request.Request()-->支持重构User-Agent(可通过headers参数自定义请求头)
2.urllib.parse模块
2.1使用前提
先导入import urllib.parse
2.2基本使用方法
# urllib.parse.urlencode(dict) -> percent-encodes a query dict: each non-ASCII
#   byte becomes one %XX escape, so a UTF-8 Chinese char yields three of them.
#   (There is no inverse "urldecode"; urllib.parse.unquote reverses quoting.)
# urllib.parse.quote(str)      -> percent-encodes a bare string the same way.
import urllib.parse
sample_url = 'https://www.baidu.com/s?wd=%E6%B3%B0%E5%8B%92%E5%85%AC%E5%BC%8F&rsv_spt=1&rsv_iqid=0xdb5579cd0000a95e&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_dl=tb&rsv_sug3=15&rsv_sug1=15&rsv_sug7=101&rsv_sug2=0&rsv_btype=i&inputT=5671&rsv_sug4=10892'
query_dict = {'wd': '泰勒公式'}
print(urllib.parse.urlencode(query_dict))  # wd=%E6%B3%B0%E5%8B%92%E5%85%AC%E5%BC%8F
phrase = '泰勒公式'
print(urllib.parse.quote(phrase))  # %E6%B3%B0%E5%8B%92%E5%85%AC%E5%BC%8F
3.练习
# Search CSDN for "定积分", fetch the first n result pages, save each page locally.
import urllib.request
import urllib.parse
import random

# BUG FIX: the original put BOTH UA strings into one dict (keys 'User-Agent' and
# an invalid 'User-Agent1') inside a one-element list, so random.choice had
# nothing to choose between and the bogus 'User-Agent1' header was sent
# verbatim. Use a list of two proper single-header dicts instead.
headers_choice = [
    {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'},
]
headers = random.choice(headers_choice)
n = int(input('请输入页数n:'))
# The query string never changes, so encode it once instead of once per page.
name = urllib.parse.urlencode({'q': '定积分'})
for i in range(1, n + 1):
    url = 'https://so.csdn.net/so/search/s.do?p=' + str(i) + '&q=' + name + '&t=&viparticle=&domain=&o=&s=&u=&l=&f='
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    file_name = '第' + str(i) + '页'
    with open(file_name, 'w', encoding='utf-8') as f:
        print('正在爬取' + file_name)
        f.write(content)
# 封装成函数
import urllib.request
import urllib.parse
import random
def readweb(n, headers, keyword='定积分'):
    """Fetch pages 1..n of a CSDN search and save each page to a local file.

    Args:
        n: number of search-result pages to fetch.
        headers: dict of HTTP headers (should carry a 'User-Agent').
        keyword: search term; defaults to the originally hard-coded '定积分',
            so existing callers are unaffected.
    """
    # The query string is loop-invariant: encode it once, not once per page.
    query = urllib.parse.urlencode({'q': keyword})
    for i in range(1, n + 1):
        url = ('https://so.csdn.net/so/search/s.do?p=' + str(i) + '&q=' + query
               + '&t=&viparticle=&domain=&o=&s=&u=&l=&f=')
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        content = response.read().decode('utf-8')
        file_name = '第' + str(i) + '页'
        writef(file_name, content)
def writef(file_name, content):
    """Persist one fetched page: write *content* to *file_name* as UTF-8 text."""
    with open(file_name, 'w', encoding='utf-8') as sink:
        print('正在爬取' + file_name)
        sink.write(content)
def main():
    """Entry point: pick a random User-Agent, ask for a page count, crawl."""
    # BUG FIX: the original wrapped both UA strings into a single dict (with an
    # invalid 'User-Agent1' key) inside a one-element list, so random.choice
    # always returned that same dict and the bogus header was sent verbatim.
    # A list of two proper header dicts restores the intended random choice.
    headers_choice = [
        {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'},
    ]
    headers = random.choice(headers_choice)
    n = int(input('请输入页数n:'))
    readweb(n, headers)
if __name__ == '__main__':
    main()
补充
1.encode与decode
encode是将str变成bytes
decode是将bytes变成str