因为网页会存在访问者身份识别(例如禁止同一IP连续访问),所以我们在爬取网页的时候有必要做一些“隐藏”。
访问延时
import urllib.request
import urllib.parse
import json
import time
# Interactive translator: send each line the user types to the Youdao web
# endpoint, print the translation, then pause before the next request so the
# server does not see back-to-back hits from the same client.

# The endpoint never changes, so it is built once outside the loop.
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc'

while True:
    content = input('请输入需要翻译的内容(输入q!退出):')
    if content == 'q!':
        break

    # Form fields for the POST body; only 'i' (the text to translate)
    # varies between requests.
    form = {
        'i': content,
        'type': 'AUTO',
        'doctype': 'json',
        'xmlVersion': '1.8',
        'keyfrom': 'fanyi.web',
        'ue': 'utf-8',
        'action': 'FY_BY_CLICKBUTTON',
        'typoResult': 'true',
    }
    data = urllib.parse.urlencode(form).encode('utf-8')

    # Supplying a data argument makes urlopen issue a POST. The context
    # manager closes the connection once the body has been read, instead of
    # leaking one open response per loop iteration.
    with urllib.request.urlopen(url, data) as response:
        html = response.read().decode('utf-8')

    # The reply is JSON; the translated text sits at
    # translateResult[0][0]['tgt'].
    target = json.loads(html)
    res = target['translateResult'][0][0]['tgt']

    print('需要翻译的内容:'+content )
    print('翻译结果:%s' %res)

    # Access delay: wait 3 seconds before the next request.
    time.sleep(3)
修改Request Header
import urllib.request
# Fetch a page while sending a browser-like User-Agent header instead of
# urllib's default one.
url = 'http://www.baidu.com'
req = urllib.request.Request(url)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')

# Open the Request object, not the bare URL: the custom header lives on
# `req`, so urlopen(url) would silently send the request without it.
# The context manager closes the connection after the body is read.
with urllib.request.urlopen(req) as response:
    html = response.read()
使用的User-Agent可按下图方式查找(在浏览器开发者工具的 Network 面板中,查看任意请求的 Request Headers):
使用代理proxy
import urllib.request
import random
# Request a page through a randomly chosen HTTP proxy and print the response
# body (the target site reports the IP address it sees).
url = 'http://www.whatismyip.com.tw'

# Candidate proxy addresses in "host:port" form; public proxies like these
# can be found by searching online.
iplist = [
    '94.231.116.134:8080',
    '183.222.102.105:80',
    '123.84.13.240:8118',
    '115.231.128.79:8080',
    '58.221.38.70:8080',
]

# Browser-like User-Agent sent with every request made through the opener.
user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'

# Pick one proxy at random and route plain-http traffic through it.
proxy_map = {'http': random.choice(iplist)}
proxy_support = urllib.request.ProxyHandler(proxy_map)

# Build an opener that uses the proxy handler and carries the custom header.
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', user_agent)]

# Install it globally so that the plain urlopen() call below goes through it.
urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')
print(html)