A Small Web Scraping Case Study: Baidu Translate

baidufanyi1.py
import urllib.request
import urllib.parse

url = 'http://fanyi.baidu.com/#en/zh/'
word = input('Enter the word to look up: ')
# the query word must be percent-encoded before being appended to the URL
url = url + urllib.parse.quote(word)
response = urllib.request.urlopen(url)
content = response.read().decode('utf-8')
# Requesting the URL shown in the address bar usually just returns the page's
# HTML source, from which the information still has to be extracted
# (regular expressions, XPath, BeautifulSoup).
# For POST requests it is usually best to call the underlying API endpoint directly.
print(content)
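
As a quick illustration of the extraction step mentioned above, here is a minimal sketch that pulls the page title out of the returned HTML with a regular expression; the pattern is a generic example, not something specific to Baidu's markup.

import re

# minimal sketch: grab whatever sits inside the <title> tag of the HTML source;
# a real scraper would target the specific elements it needs
match = re.search(r'<title>(.*?)</title>', content, re.S)
if match:
    print(match.group(1))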
baidufanyi2.py
import urllib.request
import urllib.parse
import json

post_url = 'http://fanyi.baidu.com/sug'
# For a POST request the form data is passed via the data parameter (a dict);
# it cannot simply be appended to the URL.
data = {
    'kw': 'baby'
}
# the data of a POST request must be urlencode-d and then byte-encoded
data = urllib.parse.urlencode(data).encode('utf-8')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# pose as a regular browser
request = urllib.request.Request(url=post_url, data=data, headers=headers)
# the Request object is passed in place of a plain URL when making the request
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# print(content)
json_str = json.loads(content)  # the encoding argument was removed in Python 3.9

print(json_str)
# write the result to a JSON file
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(json_str, f, ensure_ascii=False)
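
The sug endpoint returns a small JSON document. Assuming it keeps the {"errno": 0, "data": [{"k": ..., "v": ...}]} shape it had at the time of writing, the suggestion list can be printed like this:

# sketch, assuming the response looks like
# {"errno": 0, "data": [{"k": "baby", "v": "n. 婴儿; ..."}]}
for item in json_str.get('data', []):
    print(item['k'], '->', item['v'])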
baidufanyi3.py
import urllib.request
import urllib.parse
import json

post_url = 'http://fanyi.baidu.com/v2transapi'

headers = {
    # "Accept":"*/*",
    # "Accept-Encoding":"gzip, deflate",
    # "Accept-Language":"zh-CN,zh;q=0.9",
    # "Content-Length":"118",
    "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie":"BIDUPSID=E8CEEE276186AFCBD934D48D6A11FFF4; PSTM=1521687351; BAIDUID=884F472C06A37EC31E475840309A5A65:FG=1; BDUSS=ZtRVBveEVNb0FCTEIwMmNwMGxGNjR1RHh6RVJhMGp2a1cwc21VSlh2fnlpbFZiQVFBQUFBJCQAAAAAAAAAAAEAAACgCkw4MjAxNTEyv7zR0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPL9LVvy~S1bbX; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1529821868; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1529821868; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D",
    # "Host":"fanyi.baidu.com",
    # "Origin":"http://fanyi.baidu.com",
    # "Proxy-Connection":"keep-alive",
    # "Referer":"http://fanyi.baidu.com/",
    # "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
    # "X-Requested-With":"XMLHttpRequest",}
# Extras
# 1. In Sublime Text you can quickly convert copied request headers into dict
#    entries with a regex find-and-replace (not covered here; message me if interested).
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'baby',
    # 'transtype': 'enter',
    # 'simple_means_flag': '3',
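    # sign and token are generated by the page's JavaScript and appear to be
    # tied to the Cookie above; the values below came from one captured request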
    'sign': '814534.560887',
    'token': 'd97fe247f748747f2c3495c43378eb25'
}
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=post_url, headers=headers, data=data)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
json_str = json.loads(content)  # the encoding argument was removed in Python 3.9
print(json_str)
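
If the sign and token pass validation, the translation itself should sit under trans_result in the response. A minimal sketch, assuming the {"trans_result": {"data": [{"dst": ...}]}} shape observed at the time:

# an {"errno": ...} payload instead of trans_result usually means the
# sign/token/Cookie combination was rejected by the server
if 'trans_result' in json_str:
    print(json_str['trans_result']['data'][0]['dst'])
else:
    print('request rejected:', json_str)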
Summary: the hard parts of scraping are as follows
# How to pass as a normal user: User-Agent, Cookie, proxies, captchas,
# dynamically rendered pages (Selenium), JS encryption (e.g. the sign parameter).
# Test every request header and parameter to find out which ones the server
# actually checks and which it ignores.
# Only once a parameter is confirmed necessary is it worth working out how it is generated.
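
One mechanical way to run that check is to drop one header at a time and see whether the endpoint still answers. The sketch below reuses post_url, data and headers from baidufanyi3.py; the success test inside still_works is a hypothetical placeholder you would adapt to the endpoint's real behaviour.

import urllib.request

def still_works(url, data, headers):
    # placeholder success check: here "no errno in the body" stands in for
    # whatever the real endpoint uses to signal success
    req = urllib.request.Request(url=url, data=data, headers=headers)
    try:
        body = urllib.request.urlopen(req).read().decode('utf-8')
        return '"errno"' not in body
    except Exception:
        return False

def required_headers(url, data, headers):
    # drop one header at a time; the ones whose removal breaks the request
    # are the ones the server actually validates
    needed = []
    for key in headers:
        trimmed = {k: v for k, v in headers.items() if k != key}
        if not still_works(url, data, trimmed):
            needed.append(key)
    return needed

print(required_headers(post_url, data, headers))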
