Python 爬取全网代理 IP 网站 + 破解端口加密混淆

Python 爬取全网代理 IP 网站

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from lxml import etree
import requests


# Obfuscated ports are spelled with letters; each letter stands for the digit
# equal to its position in 'ABCDEFGHIZ' (A=0, B=1, ..., I=8, Z=9).
_PORT_DIGITS = {letter: str(digit) for digit, letter in enumerate('ABCDEFGHIZ')}


def _decode_port(fake_port):
    """Decode an obfuscated letter code into the real port, or None if invalid.

    The letters are mapped to digits via ``_PORT_DIGITS``, the digits are read
    as one decimal number, and that number is integer-divided by 8.

    :param fake_port: letter code such as ``'GEA'``.
    :return: the decoded port as a string, or ``None`` when the code is empty
        or contains a character outside the ``'ABCDEFGHIZ'`` alphabet.
    """
    if not fake_port or any(ch not in _PORT_DIGITS for ch in fake_port):
        return None
    digits = ''.join(_PORT_DIGITS[ch] for ch in fake_port)
    return str(int(digits) // 8)


def spider(timeout=10):
    """Fetch the goubanjia.com front page and print the proxies listed on it.

    The page is downloaded, the protocol type, the visible IP fragments and
    the letter-encoded port are extracted with XPath, and the result is
    printed as a list of single-entry dicts ``{type: 'ip:port'}``.

    :param timeout: seconds to wait for the server before giving up; without
        a timeout ``requests`` would block indefinitely on a stalled server.
    :raises requests.RequestException: on connection errors, timeouts, or a
        4xx/5xx response status.
    """
    url = 'http://www.goubanjia.com/'
    # NOTE(review): the Cookie below is a captured browser session and is
    # presumably stale by now - confirm whether the site still requires it.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'UM_distinctid=168132a602e1aa-03790ea652ad7b-58422116-1fa400-168132a602f615; JSESSIONID=E0498A8975ACD4E859943603D02E58F4; CNZZDATA1253707717=1042893761-1546504875-null%7C1546581709',
        'Host': 'www.goubanjia.com',
        'Referer': 'http://www.goubanjia.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }

    res = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page.
    res.raise_for_status()
    res.encoding = 'utf-8'
    html = etree.HTML(res.text)

    # Protocol type of each proxy (text of the link whose title contains 'http').
    proxy_types = html.xpath("//td/a[@class='href' and contains(@title,'http')]/text()")

    # Obfuscated port codes: the class attribute of the last <span> in each
    # ip cell with its first five characters dropped (presumably a 'port '
    # prefix - verify against the live page).
    fake_ports = [cls[5:] for cls in html.xpath('//td[@class="ip"]/span[last()]/@class')]

    # IP addresses: join the text of every child element of the first cell
    # that is neither hidden via style (contains 'none') nor the port element.
    ip_cells = html.xpath(".//table[@class='table table-hover']/tbody/tr/td[1]")
    ips = [
        "".join(cell.xpath("./*[not(contains(@style,'none')) and not(contains(@class,'port'))]/text()"))
        for cell in ip_cells
    ]

    # zip() stops at the shortest list, so a missing type/ip/port entry can no
    # longer trigger an IndexError.
    result = []
    for proxy_type, ip, fake_port in zip(proxy_types, ips, fake_ports):
        port = _decode_port(fake_port)
        if port is None:
            # Undecodable port code: skip this row but keep the others aligned.
            continue
        result.append({proxy_type: ip + ':' + port})
    print(result)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider()

 完整代码下载:https://github.com/tanjunchen/SpiderProject/tree/master/IP 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

远方的飞猪

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值