Python如何访问互联网
urllib = URL + lib(Python标准库中用于访问网络资源的包)
URL:网页地址
一般格式:protocol://hostname[:port]/path/[;parameters][?query]#fragment
URL由三部分组成:
第一部分:协议(http,https,ftp,file...)
第二部分:服务器的域名或IP地址,有时候还要包含端口号(http默认端口号为80)
第三部分:资源的具体地址
import urllib.request
# Fetch https://baidu.com and print the page source as text.
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://baidu.com') as response:
    html = response.read()
# read() returns bytes; decode them as UTF-8 to obtain a str.
html = html.decode('utf-8')
print(html)
案例:下载图片
import urllib.request
# Download an image and save it to disk, printing response metadata on the way.
# urlopen() also accepts the URL string directly; an explicit Request object
# is built here instead and then opened.
req = urllib.request.Request('http://placekitten.com/500/600')
with urllib.request.urlopen(req) as response:
    # URL that was actually retrieved.
    print(response.geturl())
    # Header information returned by the remote server.
    print(response.info())
    cat_img = response.read()
    # HTTP status code of the response.
    print(response.getcode())
# Binary mode ('wb'): the image bytes are written unchanged.
with open('cat_500_600.jpg', 'wb') as f:
    f.write(cat_img)
案例:利用有道翻译翻译文本
import urllib.request,urllib.parse
# Send a form to the Youdao translate endpoint and print the raw reply.
url = 'https://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
# Form fields, in the order they are serialized. 'i' is the text to translate.
# NOTE(review): salt/sign/lts/bv look like values captured from one browser
# session; the service may reject them once they are stale - confirm.
data = {
    'i': '我爱python',
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '16612145938029',
    'sign': '75c46de318eacc916892d9fdd9a859dc',
    'lts': '1661214593802',
    'bv': '50b61ff102560ebc7bb0148b22d7715c',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION',
}
# urlopen() needs the body as bytes: URL-encode the form, then encode as UTF-8.
data = urllib.parse.urlencode(data).encode('utf-8')
# Supplying data makes urlopen() issue a POST; the context manager closes
# the response after the body has been read.
with urllib.request.urlopen(url, data) as response:
    print(response.read().decode('utf-8'))
修改headers,把程序的访问伪装成浏览器访问(隐藏爬虫身份)
第一种方法:在Request对象生成之前,通过headers参数传入:
import urllib.request,urllib.parse
import requests
# Same translate request, with a browser User-Agent passed to the Request
# constructor through its headers argument.
url = 'https://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
}
# Form fields, in the order they are serialized. 'i' is the text to translate.
# NOTE(review): salt/sign/lts/bv look like values captured from one browser
# session; the service may reject them once they are stale - confirm.
data = {
    'i': '我爱python',
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '16612145938029',
    'sign': '75c46de318eacc916892d9fdd9a859dc',
    'lts': '1661214593802',
    'bv': '50b61ff102560ebc7bb0148b22d7715c',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION',
}
# The request body must be bytes: URL-encode the form, then encode as UTF-8.
data = urllib.parse.urlencode(data).encode('utf-8')
req = urllib.request.Request(url, data, head)
# The context manager closes the response after the body has been read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))
# Show the headers recorded on the Request object.
print(req.headers)
第二种方法:在Request对象生成之后,通过Request.add_header()方法添加:
import urllib.request,urllib.parse
import requests
# Same translate request, with the browser User-Agent attached after the
# Request object has been created, via Request.add_header().
url = 'https://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
# Form fields, in the order they are serialized. 'i' is the text to translate.
# NOTE(review): salt/sign/lts/bv look like values captured from one browser
# session; the service may reject them once they are stale - confirm.
data = {
    'i': '我爱python',
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '16612145938029',
    'sign': '75c46de318eacc916892d9fdd9a859dc',
    'lts': '1661214593802',
    'bv': '50b61ff102560ebc7bb0148b22d7715c',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION',
}
# The request body must be bytes: URL-encode the form, then encode as UTF-8.
data = urllib.parse.urlencode(data).encode('utf-8')
req = urllib.request.Request(url, data)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36')
# The context manager closes the response after the body has been read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))
# Show the headers recorded on the Request object.
print(req.headers)
代理
首先需要设置一个字典,格式为{'协议类型': '代理IP:端口号'},并传给ProxyHandler
定制、创建opener(私人定制)
安装opener或调用opener
import urllib.request
# Fetch a page through an HTTP proxy and print the returned HTML.
url = 'http://www.whatismyip.com.tw'
# Map the URL scheme to the 'host:port' proxy address it should be sent through.
proxy_support = urllib.request.ProxyHandler({'http': '111.3.118.247:30001'})
# Build a custom opener that includes the proxy handler.
opener = urllib.request.build_opener(proxy_support)
# Headers this opener adds to the requests it sends.
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36')]
# Install the opener as the global default so that plain urlopen() uses it.
urllib.request.install_opener(opener)
# The context manager closes the response after the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(html)