# baidufanyi1.py
import urllib.request
import urllib.parse

# GET request demo: fetch the Baidu Translate page for a word typed by the user.
url = 'http://fanyi.baidu.com/#en/zh/'
word = input('请输入要查询的单词')
# FIX: percent-encode the user's word before appending it to the URL.
# Without this, a Chinese word (or one containing spaces) produces a URL
# that urlopen cannot send (UnicodeEncodeError / malformed request line).
url = url + urllib.parse.quote(word)
response = urllib.request.urlopen(url)
content = response.read().decode('utf-8')
# Requesting a page URL generally returns the page's HTML source, from which
# the useful data still has to be extracted
# (regular expressions, XPath, BeautifulSoup, ...).
# For POST requests it is usually better to call the underlying API endpoint
# directly instead of scraping the page.
# NOTE(review): everything after '#' in the URL is a fragment and is not sent
# to the server, so the server response does not actually depend on the word;
# kept as-is to preserve the original demo's behavior.
print(content)
# baidufanyi2.py
import urllib.request
import urllib.parse
import json

# POST request demo: call Baidu Translate's suggestion endpoint directly
# instead of scraping the HTML page.
post_url = 'http://fanyi.baidu.com/sug'
# For a POST request the form fields are supplied via the `data` argument
# (built from a dict); they cannot be appended to the URL itself.
data = {
    'kw': 'baby'
}
# POST data must be urlencoded and then encoded to bytes.
data = urllib.parse.urlencode(data).encode('utf-8')
# A browser User-Agent header disguises the script as a normal browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# The Request object is passed to urlopen in place of a bare URL.
request = urllib.request.Request(url=post_url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# print(content)
# FIX: json.loads() no longer accepts an `encoding` argument (deprecated
# since 3.1, removed in Python 3.9 -> TypeError); `content` is already a str.
json_str = json.loads(content)
print(json_str)
# FIX: write through a context manager so the output file is reliably
# flushed and closed (the original left the handle open).
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(json_str, f, ensure_ascii=False)
# baidufanyi3.py
import urllib.request
import urllib.parse
import json

# POST demo against Baidu Translate's real translation endpoint (v2transapi).
post_url = 'http://fanyi.baidu.com/v2transapi'
# Only Content-Type and Cookie proved necessary for this endpoint; the other
# captured headers are kept commented out as a record of what was tested.
headers = {
    # "Accept": "*/*",
    # "Accept-Encoding": "gzip, deflate",
    # "Accept-Language": "zh-CN,zh;q=0.9",
    # "Content-Length": "118",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "BIDUPSID=E8CEEE276186AFCBD934D48D6A11FFF4; PSTM=1521687351; BAIDUID=884F472C06A37EC31E475840309A5A65:FG=1; BDUSS=ZtRVBveEVNb0FCTEIwMmNwMGxGNjR1RHh6RVJhMGp2a1cwc21VSlh2fnlpbFZiQVFBQUFBJCQAAAAAAAAAAAEAAACgCkw4MjAxNTEyv7zR0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPL9LVvy~S1bbX; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1529821868; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1529821868; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D",
    # "Host": "fanyi.baidu.com",
    # "Origin": "http://fanyi.baidu.com",
    # "Proxy-Connection": "keep-alive",
    # "Referer": "http://fanyi.baidu.com/",
    # "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
    # "X-Requested-With": "XMLHttpRequest",
}  # FIX: the closing brace was trapped inside the last comment -> SyntaxError
# Extension:
# 1. In Sublime, a regex find/replace can quickly reformat captured request
#    headers into dict syntax (not covered here).
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'baby',
    # 'transtype': 'enter',
    # 'simple_means_flag': '3',
    # NOTE(review): sign/token are generated by Baidu's page JavaScript and
    # appear tied to the Cookie/session above; these hard-coded values will
    # go stale — confirm how they are derived before relying on this call.
    'sign': '814534.560887',
    'token': 'd97fe247f748747f2c3495c43378eb25'
}
# POST data must be urlencoded and then encoded to bytes.
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=post_url, headers=headers, data=data)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# FIX: json.loads() no longer accepts an `encoding` argument (removed in
# Python 3.9); `content` is already a decoded str.
json_str = json.loads(content)
print(json_str)
# 总结:爬虫的难点如下
# 如何伪装成一个正常的用户(UA,cookie,proxy,验证码,动态请求(seleniun),js加密)
# 总结:所有的请求头信息和参数信息都需要校验那些有用,哪些没用
# 确定有用以后,再去研究参数如何获取的问题