Scraping proxy IPs

import random
import time

import requests
from lxml import etree
from fake_useragent import UserAgent
import csv


# Sample proxy entry, kept only as a reference for the proxies= format; it is not used below
proxies = {
    'http': 'http://187.87.39.247:31793',
    'https': 'https://187.87.39.247:31793'
}

def get_random_ua():
    # return a random User-Agent string so each request looks different
    ua = UserAgent()
    return ua.random

def get_ip_list(url):
    headers = {'User-Agent': get_random_ua()}
    html = requests.get(url=url, headers=headers, timeout=10).text
    # parse the page
    parse_html = etree.HTML(html)
    r_list = parse_html.xpath('//tr')
    proxy_list = []
    # iterate over every row, skipping the table header
    for r in r_list[1:]:
        ip = r.xpath('./td[1]/text()')[0].strip()
        port = r.xpath('./td[2]/text()')[0].strip()
        proxy_list.append(
            {
                'http': 'http://{}:{}'.format(ip, port),
                'https': 'https://{}:{}'.format(ip, port)
            }
        )
    return proxy_list

def proxy_pool(url):
    proxy_list = get_ip_list(url)
    useful_proxy = []

    # test each proxy against httpbin and keep only the ones that respond in time
    for proxy in proxy_list:
        headers = {'User-Agent': get_random_ua()}
        try:
            res = requests.get(
                url='http://httpbin.org/get',
                headers=headers,
                proxies=proxy,
                timeout=5
            )
            print(res.text)
            useful_proxy.append(proxy)
        except Exception:
            print('{} is unusable'.format(proxy))
            continue

    # append the working proxies to ip.csv
    with open('ip.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        for i in useful_proxy:
            writer.writerow([i['http'], i['https']])


if __name__ == '__main__':
    url = 'http://www.89ip.cn/index_{}.html'
    for i in range(2, 100):
        # format a fresh URL each time; reassigning url itself would destroy
        # the '{}' placeholder after the first iteration and re-fetch one page
        page_url = url.format(i)
        proxy_pool(page_url)
        time.sleep(random.randint(1, 3))
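
Once ip.csv has been built, the saved proxies can be fed back into later requests. Below is a minimal sketch of that, assuming the two-column ip.csv layout written above; the load_proxies helper, the pick-one-at-random strategy, and the httpbin target URL are just illustrative choices, not part of the original script.

import csv
import random

import requests


def load_proxies(path='ip.csv'):
    # read the csv written by proxy_pool(); each row is [http_url, https_url]
    with open(path, newline='') as f:
        return [{'http': row[0], 'https': row[1]} for row in csv.reader(f) if row]


if __name__ == '__main__':
    pool = load_proxies()
    proxy = random.choice(pool)  # pick one saved proxy at random
    res = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
    print(res.text)

Free proxies go stale quickly, so re-running proxy_pool() periodically and reloading the file keeps the pool usable.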