Python 爬取有道翻译 && 防止反爬相关知识 && {"errorCode":50}

反爬

  1. python访问被拒绝,要使用浏览器访问。
    解决方法:修改表头(加入User-Agent,可以是多个随机)
url = '....'
data = {...}  # the last item shown with the request headers (presumably the form data - confirm)
head = {}
data = urllib.parse.urlencode(data).encode('utf-8') 
# Copy the User-Agent of the POST request (browser developer tools -> Network -> headers)
# into the Python request.
# Method 1: pass a headers dict when creating the Request
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58'
res = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(res)
# Method 2: add the header after the Request has been created
req = urllib.request.Request(url, data)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58')
response = urllib.request.urlopen(req)
# Sometimes these headers are needed as well:
# head['Cookie'] (e.g. = 'OUTFOX_SEARCH_USER_ID=-1764369496@10.108.160.18;')
# head['Referer'] (e.g. = 'http://fanyi.youdao.com/')
  2. IP访问太频繁(服务器发送验证码拒绝)
    解决方法:
    方法一:使用time模块,time.sleep(n)休眠一段时间 放在循环中模拟真人(不建议,太慢了)
    方法二:使用代理(服务器看到的是代理的IP)(网上搜代理IP)
# 1.proxy_support = urllib.request.ProxyHandler({'类型':'ip:端口号'})
# 2.opener = urllib.request.build_opener(proxy_support)
# 3a.urllib.request.install_opener(opener) 安装opener,一劳永逸
# 3b.opener.open(url)使用特殊opener打开网页
import urllib.request
import random  # 可以多弄几个随机访问
# Page fetched through the proxy for this demo.
# NOTE(review): the original comment described this as an "IP lookup site used
# for testing", but the URL is the CSDN home page - confirm the intended target.
url = 'https://www.csdn.net/'
# Candidate proxies in "ip:port" form; one is chosen at random on each run.
iplist = ['163.125.221.229:8118', '183.166.20.6:9999', '125.108.80.240:9000', '125.108.123.95:9000']
proxy = random.choice(iplist)
# ProxyHandler maps URL scheme -> proxy address. The target URL is https, so
# the proxy has to be registered for 'https' too; with only an 'http' entry
# the https request would not be sent through the proxy at all.
proxy_support = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58')]
# Install the opener globally so later urllib.request.urlopen() calls use it.
urllib.request.install_opener(opener)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    # If the body cannot be decoded as UTF-8, decode(..., 'ignore') can be used
    # instead (some characters may then come out garbled). The page's real
    # charset can be checked in the browser developer tools.
    html = response.read().decode('utf-8')
print(html)

爬取有道翻译(老版url = ‘http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule’)

import urllib.request
import urllib.parse  # 解析
import json
# The "_o" has been removed from "translate_o?" in the URL: the newer endpoint
# has anti-scraping checks. For a way around them see
# https://tendcode.com/article/youdao-spider/
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
content = input('输入翻译内容:')
# Anti-scraping countermeasure, method 1: pass a headers dict carrying a
# browser User-Agent when creating the Request.
# Method 2 (equivalent): create the Request without headers and call
# req.add_header('User-Agent', ...) before opening it.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58',
}
data = {'i': content,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15972989870051',
        'sign': '26393ce3bf4fc21109f3d258493f0ac1',
        'lts': '1597298987005',
        'bv': 'a612219d8ae465584a02998c7f4cede1',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTlME'}
# Build the form body and encode it as UTF-8 bytes for the POST request.
data = urllib.parse.urlencode(data).encode('utf-8')
res = urllib.request.Request(url, data, head)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(res) as response:
    html = response.read().decode('utf-8')
# Example of html:
# '{"type":"EN2ZH_CN","errorCode":0,"elapsedTime":1,
# "translateResult":[[{"src":"i love you","tgt":"我爱你"}]]}'
target = json.loads(html)  # JSON text -> Python dict
# An error reply such as {"errorCode":50} carries no 'translateResult'; report
# the error code instead of failing with a bare KeyError.
if 'translateResult' not in target:
    raise RuntimeError('Youdao returned no translation (errorCode=%s)' % target.get('errorCode'))
print('翻译结果:%s' % target['translateResult'][0][0]['tgt'])

参考后(https://tendcode.com/article/youdao-spider/):
爬取新版(url = ‘http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule’)

import urllib.request
import hashlib
import time
import json
import random


class Youdao(object):
    """Client for the Youdao web translation endpoint ``translate_o``.

    The ``salt`` and ``sign`` form fields are computed once, when the object
    is created, from the message and the current time.
    """

    def __init__(self, msg):
        """Store the text to translate and precompute ``salt`` and ``sign``.

        Args:
            msg: Text to translate (a ``str``).
        """
        self.msg = msg
        self.url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
        # Constant suffix that get_sign() appends to the text being hashed.
        self.D = "]BjuETDhU)zqSxf-=B#7m"
        # Order matters: get_sign() reads self.salt.
        self.salt = self.get_salt()
        self.sign = self.get_sign()

    def get_md(self, value):
        """Return the hexadecimal MD5 digest of ``value`` encoded as UTF-8."""
        return hashlib.md5(value.encode('utf-8')).hexdigest()

    def get_salt(self):
        """Return the ``salt`` field as a string.

        The value is the current time in milliseconds plus a random integer
        between 0 and 10 (both inclusive).
        """
        millis = int(time.time() * 1000)
        return str(millis + random.randint(0, 10))

    def get_sign(self):
        """Return the ``sign`` field.

        It is the MD5 digest of "fanyideskweb" + message + salt + ``self.D``.
        """
        return self.get_md("fanyideskweb" + self.msg + self.salt + self.D)

    @staticmethod
    def _extract_translation(target):
        """Return the first translated segment from a decoded response dict.

        Raises:
            RuntimeError: if ``target`` has no 'translateResult' entry, which
                is what an error reply such as {"errorCode":50} looks like.
        """
        if 'translateResult' not in target:
            raise RuntimeError(
                'Youdao returned no translation (errorCode=%s)' % target.get('errorCode')
            )
        # Same element the original code printed: first entry of the first list.
        return target['translateResult'][0][0]['tgt']

    def get_result(self):
        """POST the message to the endpoint, print the translation and return it.

        Returns:
            The translated text that was printed.

        Raises:
            RuntimeError: if the response JSON contains no 'translateResult'.
            Errors raised by ``urllib.request.urlopen`` propagate unchanged.
        """
        # Explicit import: this method calls urllib.parse.urlencode.
        import urllib.parse

        # Browser-like headers sent as an anti-scraping countermeasure. They
        # are passed to Request directly; calling req.add_header(name, value)
        # on a Request created without headers is an equivalent alternative.
        headers = {
            'Cookie': 'OUTFOX_SEARCH_USER_ID=-1764369496@10.108.160.18;',
            'Referer': 'http://fanyi.youdao.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58',
        }
        data = {
            'i': self.msg,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'salt': self.salt,
            'sign': self.sign,
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_CL1CKBUTTON',
            'typoResult': 'true'
        }
        # Build the form body and encode it as UTF-8 bytes for the POST request.
        data = urllib.parse.urlencode(data).encode('utf-8')
        res = urllib.request.Request(self.url, data, headers)
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(res) as response:
            html = response.read().decode('utf-8')
        # Example of html:
        # '{"type":"EN2ZH_CN","errorCode":0,"elapsedTime":1,
        # "translateResult":[[{"src":"i love you","tgt":"我爱你"}]]}'
        target = json.loads(html)  # JSON text -> Python dict
        translation = self._extract_translation(target)
        print('翻译结果:%s' % translation)
        return translation



if __name__ == '__main__':
    # Prompt for the text, then translate it and print the result.
    Youdao(input('输入翻译内容:')).get_result()
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值