Python crawler code (Youdao Translate + jandan.net images)

(1) Scraping Youdao Translate (access through a proxy + check which IP is in use)

import urllib.request
import urllib.parse
import json
import random


# Feature 1: translate text through Youdao Translate via a proxy IP

content = input("Enter the text to translate: ")

# Youdao Translate endpoint; the plain 'translate' path (without '_o') is widely reported to skip the signature check
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'


# Request headers that mimic a browser
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'


# Proxy IPs; example list from cn-proxy.com (not reachable from mainland China without a VPN)
iplist = ['111.63.135.109:80', '39.137.77.66:80',
          '124.156.108.71:82', '101.231.104.82:80']
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
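
# Optional sketch: free proxies like the ones above go stale quickly, so it
# can help to probe the list first. pick_working_proxy is a hypothetical
# helper, and using fanyi.youdao.com as the test URL is just one choice.
def pick_working_proxy(candidates, test_url='http://fanyi.youdao.com', timeout=5):
    """Return the first proxy in candidates that can fetch test_url, else None."""
    for ip in candidates:
        probe = urllib.request.build_opener(
            urllib.request.ProxyHandler({'http': ip}))
        try:
            probe.open(test_url, timeout=timeout)
            return ip
        except OSError:  # URLError and timeouts both subclass OSError
            continue
    return None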


# Form data POSTed to the Youdao endpoint
data = {}

data['i'] = content
data['from'] = 'AUTO'
data['to'] = 'AUTO'
data['smartresult'] = 'dict'
data['client'] = 'fanyideskweb'
# salt/sign/ts/bv below were captured from one real browser session
# (see the sketch after this block for how the client derives them)
data['salt'] = '15610113458719'
data['sign'] = 'fcecf3fe461cc8aa778d4f3926a2cdc1'
data['ts'] = '1561011345871'
data['bv'] = '3a019e7d0dda4bcd253903675f2209a5'
data['doctype'] = 'json'
data['version'] = '2.1'
data['keyfrom'] = 'fanyi.web'
data['action'] = 'FY_BY_CLICKBUTTION'  # sic: this is the spelling the Youdao client sends

data = urllib.parse.urlencode(data).encode('utf-8')
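
# Sketch of how the fanyideskweb 2.1 client derived salt/ts/sign at the time:
# ts is a millisecond timestamp, salt is ts plus one random digit, and sign is
# an MD5 over client name + text + salt + a secret suffix. The secret below is
# an assumption and changes between client releases; the signed 'translate_o'
# endpoint validates these fields, while the plain 'translate' endpoint used
# above is commonly reported to ignore them.
import hashlib
import time

def youdao_sign(text, secret="n%A-rKaT5fb[Gy?;N5@Tj"):  # assumed, version-specific
    ts = str(int(time.time() * 1000))      # millisecond timestamp
    salt = ts + str(random.randint(0, 9))  # ts plus one random digit
    sign = hashlib.md5(
        ("fanyideskweb" + text + salt + secret).encode('utf-8')).hexdigest()
    return ts, salt, sign

# would replace the hard-coded values, before urlencoding:
# data['ts'], data['salt'], data['sign'] = youdao_sign(content)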

# Send the translation request

res = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(res)
html = response.read().decode('utf-8')


# Print the translation result
target = json.loads(html)
print("翻译结果: %s" % (target['translateResult'][0][0]['tgt']))

'''
# Feature 2: query a third-party site to see which proxy IP is currently in use

# IP-echo test URLs
# url2='https://ip.cn/'
# url2='http://www.bejson.com/httputil/queryip/'
url2 = 'http://ip.myhostadmin.net/'


# Proxy IPs; example list from cn-proxy.com (not reachable from mainland China without a VPN)
# www.whatismyip.com rate-limits free lookups, so use one of the sites above instead
iplist = ['111.63.135.109:80', '39.137.77.66:80',
          '124.156.108.71:82', '101.231.104.82:80']
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

# Request headers that mimic a browser
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'

# Fetch the page that echoes the caller's IP
res = urllib.request.Request(url2, headers=head)
response2 = urllib.request.urlopen(res)
html2 = response2.read().decode('gb2312')  # match the charset the IP-echo site declares


# Print the reported proxy IP
print(html2)
'''
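
# Alternative sketch for feature 2, kept in a docstring like the block above:
# httpbin.org/ip is a public echo service that returns JSON of the form
# {"origin": "<outbound IP>"}, so no per-site charset guessing is needed.
'''
req2 = urllib.request.Request('http://httpbin.org/ip', headers=head)
info = json.loads(urllib.request.urlopen(req2).read().decode('utf-8'))
print("Current outbound IP: %s" % info['origin'])
'''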

(2) Scraping images from jandan.net's ooxx section

# coding=utf-8


import urllib.request as tt
import os
import random

# url_open_api is a small wrapper: fetch a URL and return the raw response bytes


def url_open_api(url):
    req = tt.Request(url)
    req.add_header(
        'User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36')

    # Optional: route requests through a proxy (disabled by default)
    '''
    iplist = ['163.204.241.8:9999', '163.125.220.161:8118',
              '58.22.212.102:9000', '124.93.201.59:42672']
    proxy_support = tt.ProxyHandler({'http': random.choice(iplist)})
    opener = tt.build_opener(proxy_support)
    tt.install_opener(opener)
    '''
    # end of optional proxy block

    response = tt.urlopen(req)  # use the Request object so the User-Agent header is sent
    html = response.read()
    return html


def get_page(url):
    """Return the newest page number shown on the front page, as a string."""
    html = url_open_api(url).decode('utf-8')
    # the markup contains e.g. current-comment-page">[56]; the offset of 23
    # skips past 'current-comment-page">[' (20 + 3 chars) to the digits
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)
    return html[a:b]


def find_imgs(url):
    """Collect the protocol-relative .jpg addresses found on one page."""
    html = url_open_api(url).decode('utf-8')
    img_addrs = []
    a = html.find('img src=')

    while a != -1:
        b = html.find('.jpg', a, a + 255)  # look for .jpg within 255 chars of the tag
        if b != -1:
            img_addrs.append(html[a + 9:b + 4])  # skip 'img src="', keep through '.jpg'
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs
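

# Hedged alternative: find() silently drops any image that is not a .jpg.
# A regex variant (assuming jandan still emits plain <img src="//..."> tags)
# also collects .png and .gif addresses:
def find_imgs_re(url):
    """Regex variant of find_imgs; returns protocol-relative image addresses."""
    import re
    html = url_open_api(url).decode('utf-8')
    return re.findall(r'img src="(//[^"]+?\.(?:jpg|png|gif))"', html)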


def save_imgs(folder, img_addrs):
    """Download each image into the current directory (download_mm has already cd'ed into folder)."""
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            each_url = 'http:' + each  # addresses are protocol-relative ('//host/...')
            img = url_open_api(each_url)
            f.write(img)


def download_mm(folder='OOXX', pages=10):
    os.makedirs(folder, exist_ok=True)  # os.mkdir would crash on a re-run
    os.chdir(folder)
    url = "http://jandan.net/ooxx/"  # note: jandan.net, not jiandan.net
    page_num = int(get_page(url))
    for i in range(pages):
        # step backwards one page at a time from the newest page
        page_url = url + 'page-' + str(page_num - i) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()
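
For example, if get_page reports that the front page is page 56, download_mm(pages=3) fetches page-56, page-55, and page-54, saving every .jpg address it finds into ./OOXX.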
