(1) Scraping Youdao Translate (routing requests through a proxy + checking which IP is in use)
import urllib.request
import urllib.parse
import json
import random
# Feature 1: translate text through Youdao while routing traffic via a proxy IP
content = input("Enter the text to translate: ")
# Youdao Translate API endpoint
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
# Request headers that mimic a real browser
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
# Proxy IPs; example source: cn-proxy.com (needs a VPN to reach from mainland China)
iplist = ['111.63.135.109:80', '39.137.77.66:80',
          '124.156.108.71:82', '101.231.104.82:80']
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
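# Note: free proxy IPs go stale quickly. A minimal sanity check (kept inside a
# triple-quoted string so it does not run by default) probes the installed
# proxy with a short timeout; urllib.error.URLError signals an unreachable one.
'''
import urllib.error
try:
    urllib.request.urlopen('http://fanyi.youdao.com', timeout=5)
except urllib.error.URLError as e:
    print('Proxy unreachable:', e.reason)
'''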
# POST form data for the translate request
data = {}
data['i'] = content
data['from'] = 'AUTO'
data['to'] = 'AUTO'
data['smartresult'] = 'dict'
data['client'] = 'fanyideskweb'
data['salt'] = '15610113458719'
data['sign'] = 'fcecf3fe461cc8aa778d4f3926a2cdc1'
data['ts'] = '1561011345871'
data['bv'] = '3a019e7d0dda4bcd253903675f2209a5'
data['doctype'] = 'json'
data['version'] = '2.1'
data['keyfrom'] = 'fanyi.web'
data['action'] = 'FY_BY_CLICKBUTTION'  # sic: this misspelling is what the site itself sends
data = urllib.parse.urlencode(data).encode('utf-8')
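# The salt/sign/ts/bv values above are fixed snapshots captured from a real
# browser session, so they will eventually go stale. Below is a sketch of how
# such a signature is commonly generated client-side (an assumption based on
# Youdao's front-end JavaScript of that era; SECRET_KEY is a hypothetical
# placeholder, the real key lives in the site's JS and rotates):
'''
import hashlib
import time
ts = str(int(time.time() * 1000))       # millisecond timestamp
salt = ts + str(random.randint(0, 9))   # timestamp plus one random digit
SECRET_KEY = '<extracted from the site JS>'  # hypothetical placeholder
sign = hashlib.md5(
    ('fanyideskweb' + content + salt + SECRET_KEY).encode('utf-8')).hexdigest()
'''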
# Build the request and call the translate endpoint
res = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(res)
html = response.read().decode('utf-8')
# Parse and print the translation result
target = json.loads(html)
print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))
'''
# Feature 2: query a third-party site to report which proxy IP is in use
# IP-echo test URLs
# url2='https://ip.cn/'
# url2='http://www.bejson.com/httputil/queryip/'
url2 = 'http://ip.myhostadmin.net/'
# Proxy IPs; example source: cn-proxy.com (needs a VPN to reach from mainland China)
# www.whatismyip.com also echoes your IP but is rate-limited, so one of the URLs above is used
iplist = ['111.63.135.109:80', '39.137.77.66:80',
          '124.156.108.71:82', '101.231.104.82:80']
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
# Request headers that mimic a real browser
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
# Request the IP-echo page
res = urllib.request.Request(url2, headers=head)
response2 = urllib.request.urlopen(res)
html2 = response2.read().decode('gb2312')  # use whatever encoding the IP-echo site declares
# Print the proxy IP reported by the site
print(html2)
'''
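A practical note on both features: free public proxies fail constantly, so a robust script retries with a different IP rather than dying on the first timeout. Below is a minimal sketch under that assumption (fetch_via_proxy is a helper name introduced here, not part of the script above); building a fresh opener per attempt also avoids the process-wide state that install_opener() sets up.
import random
import urllib.error
import urllib.request

def fetch_via_proxy(url, iplist, retries=3, timeout=5):
    # Try up to `retries` randomly chosen proxies before giving up.
    for _ in range(retries):
        handler = urllib.request.ProxyHandler({'http': random.choice(iplist)})
        opener = urllib.request.build_opener(handler)
        try:
            return opener.open(url, timeout=timeout).read()
        except (urllib.error.URLError, OSError):
            continue  # dead proxy; pick another one
    raise RuntimeError('all proxy attempts failed')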
(2) Scraping images from the OOXX section of jandan.net
# coding=utf-8
import urllib.request as tt
import os
import random
# url_open_api() wraps fetching a URL and returning the raw response bytes
def url_open_api(url):
    req = tt.Request(url)
    req.add_header(
        'User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36')
    # Optional proxy support, disabled by default
    '''
    iplist = ['163.204.241.8:9999', '163.125.220.161:8118',
              '58.22.212.102:9000', '124.93.201.59:42672']
    proxy_support = tt.ProxyHandler({'http': random.choice(iplist)})
    opener = tt.build_opener(proxy_support)
    tt.install_opener(opener)
    '''
    # End of proxy block
    response = tt.urlopen(req)  # open the Request object so the User-Agent header is actually sent
    html = response.read()
    return html
def get_page(url):
    # The current page number appears in markup like
    # <span class="current-comment-page">[356]</span>; skip the marker plus
    # the three characters '">[' and read up to the closing ']'
    html = url_open_api(url).decode('utf-8')
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)
    return html[a:b]
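# A regex version of the same lookup can be less brittle than offset arithmetic
# (a sketch, assuming the markup is <span class="current-comment-page">[356]</span>):
'''
import re
def get_page_re(url):
    html = url_open_api(url).decode('utf-8')
    m = re.search(r'current-comment-page">\[(\d+)\]', html)
    return m.group(1) if m else '1'
'''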
def find_imgs(url):
    # Scan the page source for 'img src=' markers and collect every .jpg address
    html = url_open_api(url).decode('utf-8')
    img_addrs = []
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg', a, a + 255)  # look for '.jpg' within 255 chars of the marker
        if b != -1:
            img_addrs.append(html[a + 9:b + 4])  # skip 'img src="' and keep through '.jpg'
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs
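# The scan above only catches .jpg files. A regex sketch that also collects
# .png and .gif (assuming the same img src="..." markup):
'''
import re
def find_imgs_re(html):
    return re.findall(r'img src="([^"]+\.(?:jpg|png|gif))"', html)
'''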
def save_imgs(folder, img_addrs):
    # Download each image into the current directory (download_mm() has already
    # chdir'd into `folder`), named after the last segment of its URL path
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            each_url = 'http:' + each  # the scraped addresses are protocol-relative
            img = url_open_api(each_url)
            f.write(img)
def download_mm(folder='OOXX', pages=10):
    os.mkdir(folder)
    os.chdir(folder)
    url = "http://jandan.net/ooxx/"
    page_num = int(get_page(url))  # number of the newest comment page
    for i in range(pages):
        # walk backwards through consecutive pages, newest first
        page_url = url + 'page-' + str(page_num - i) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
if __name__ == '__main__':
    download_mm()
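Two caveats for running this today: jandan.net's markup may have changed since this snippet was written, and fetching many pages back-to-back is unfriendly to the server. A variant with a pause between page requests (download_mm_polite is a name introduced here for illustration):
import time

def download_mm_polite(folder='OOXX', pages=10):
    # Same flow as download_mm(), plus a one-second pause between pages.
    os.mkdir(folder)
    os.chdir(folder)
    url = 'http://jandan.net/ooxx/'
    page_num = int(get_page(url))
    for i in range(pages):
        page_url = url + 'page-' + str(page_num - i) + '#comments'
        save_imgs(folder, find_imgs(page_url))
        time.sleep(1)  # be polite: wait between page requests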