WebMagic 代理池

通过抓取并使用高匿代理发起请求,防止本机 IP 被封禁。

一个小小的例子

pom

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

代码

package com.example.csdn.bean;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;

import java.io.IOException;
import java.util.*;


/**
 * Small proxy-pool demo: scrapes free proxies with WebMagic and then requests
 * randomly chosen blog posts through each live proxy with jsoup.
 *
 * <p>The class doubles as the WebMagic extraction model: {@link #afterProcess(Page)}
 * is invoked for every crawled proxy-listing page and fills the shared pool.
 *
 * <p>All state is held in static fields and is not synchronized.
 */
public class Main implements AfterExtractor {
    // Number of failed requests since the proxy pool was last rebuilt.
    private static int errorCount = 0;
    // Reload threshold: the pool is rebuilt once errorCount exceeds maxProxySize * reLoadProxy.
    private static final float reLoadProxy = 0.45f;
    // Maximum number of proxies kept in the pool.
    private static final int maxProxySize = 200;
    // Number of table rows read from each proxy-listing page.
    private static final int proxiesPerPage = 15;
    // Proxy pool, keyed by proxy address.
    private static final Map<String, MyProxy> proxyArr = new HashMap<>();
    // Blog post URLs, keyed by a dense index in [0, blogUrlSize).
    private static final Map<Integer, String> blogUrl = new HashMap<>();
    // Number of usable entries in blogUrl.
    private static int blogUrlSize = 0;
    // Blog home page from which the post links are collected.
    private static final String blogHome = "https://blog.csdn.net/qq_36183235";
    // Shared random source (avoids allocating a new Random per call).
    private static final Random RANDOM = new Random();

    private static final Logger logger = Logger.getLogger(Main.class);

    /**
     * Runs until the thread is interrupted: refreshes the blog links and the
     * proxy pool, then visits one random blog post through every live proxy.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        while (!Thread.currentThread().isInterrupted()) {
            // Load the blog links.
            loadBlogUrl();
            if (blogUrlSize == 0) {
                // Without links randomBlogUrl() would throw; wait and retry instead.
                logger.warn("No blog links loaded from " + blogHome + ", retrying later");
                sleepThread(30);
                continue;
            }
            // Crawl proxies (no-op while the pool is full).
            loadProxy();

            for (MyProxy proxy : proxyArr.values()) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                // Skip proxies already marked dead, but keep trying the remaining ones.
                if (!Boolean.TRUE.equals(proxy.getState())) {
                    continue;
                }
                System.setProperty("https.proxyHost", proxy.getAddr());
                System.setProperty("https.proxyPort", proxy.getPort());
                try {
                    Jsoup.connect(blogUrl.get(randomBlogUrl()))
                            .userAgent("Mozilla")
                            .cookie("auth", "token")
                            .timeout(5000)
                            .get();
                    System.out.println("complete !");
                } catch (IOException e) {
                    errorCount++;
                    proxy.setState(false);
                    logger.warn("Request through proxy " + proxy.getAddr() + ":" + proxy.getPort() + " failed", e);
                }
                // Random pause of 30-129 seconds between requests.
                sleepThread(RANDOM.nextInt(100) + 30);
            }
        }
    }

    /**
     * Fetches the blog home page and stores the link of every {@code h4}
     * heading inside the {@code article-list} element. Headings without a
     * link are skipped. On an I/O failure the previously loaded links are kept.
     */
    private static void loadBlogUrl() {
        try {
            // NOTE(review): the page is requested with POST; confirm GET is not what was intended.
            Document doc = Jsoup.connect(blogHome).post();
            Elements h4 = doc.body().getElementsByClass("article-list").select("h4");
            blogUrl.clear();
            int count = 0;
            for (int i = 0; i < h4.size(); i++) {
                String href = h4.get(i).select("a").attr("href");
                // attr() returns "" when there is no link; Jsoup.connect("") would throw later.
                if (!href.isEmpty()) {
                    blogUrl.put(count++, href);
                }
            }
            blogUrlSize = count;
            System.out.println("blog !");
        } catch (IOException e) {
            logger.warn("Failed to load blog links from " + blogHome, e);
        }
    }

    /**
     * Rebuilds the proxy pool when too many requests have failed and crawls
     * the proxy-listing pages unless the pool is already full.
     */
    public static void loadProxy() {
        if (errorCount > maxProxySize * reLoadProxy) {
            proxyArr.clear();
            // Start counting afresh, otherwise the pool would be wiped on every call.
            errorCount = 0;
        }

        if (proxyArr.size() >= maxProxySize) {
            return;
        }

        OOSpider.create(Site.me().setSleepTime(60 * 1000)
                , Main.class)
                .setIsExtractLinks(false)
                .addUrl(links().toArray(new String[0]))
                .run();
    }

    /**
     * Builds the proxy-listing page URLs to crawl.
     *
     * @return URLs of pages 1..ceil(maxProxySize / proxiesPerPage), enough
     *         pages to fill the pool when every row yields a distinct proxy
     */
    public static List<String> links() {
        int pages = (maxProxySize + proxiesPerPage - 1) / proxiesPerPage;
        List<String> urls = new ArrayList<>(pages);
        for (int page = 1; page <= pages; page++) {
            urls.add(String.format("https://www.kuaidaili.com/free/inha/%s/", page));
        }
        return urls;
    }

    /**
     * Sleeps for the given number of seconds to avoid getting banned.
     * If the thread is interrupted the interrupt flag is restored and the
     * method returns early.
     *
     * @param s seconds to sleep
     */
    public static void sleepThread(int s) {
        try {
            // Multiply as long so large values cannot overflow int.
            Thread.sleep(s * 1000L);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.warn("Sleep interrupted", e);
        }
    }

    /**
     * Picks a random key of the blog URL map.
     *
     * @return a random index in {@code [0, blogUrlSize)}
     * @throws IllegalArgumentException if no blog links have been loaded
     */
    public static int randomBlogUrl() {
        return RANDOM.nextInt(blogUrlSize);
    }

    /**
     * WebMagic callback: reads address and port from each table row of the
     * crawled listing page and adds them to the pool as live proxies, until
     * the pool is full. Rows with a missing address or port are skipped.
     *
     * @param page the crawled proxy-listing page
     */
    @Override
    public void afterProcess(Page page) {
        if (proxyArr.size() >= maxProxySize) {
            return;
        }
        // XPath row positions are 1-based, so the last row is proxiesPerPage itself.
        for (int row = 1; row <= proxiesPerPage && proxyArr.size() < maxProxySize; row++) {
            String addr = cellText(page, row, 1);
            String port = cellText(page, row, 2);
            if (addr == null || port == null) {
                continue;
            }
            MyProxy proxy = new MyProxy();
            proxy.setAddr(addr);
            proxy.setPort(port);
            proxy.setState(true);
            proxyArr.put(addr, proxy);
        }
        System.out.println("get proxy! Size : " + proxyArr.size());
    }

    // Returns the trimmed text of one cell of the proxy table, or null when it is missing or blank.
    private static String cellText(Page page, int row, int column) {
        String text = page.getHtml()
                .xpath(String.format("//*[@id=\"list\"]/table/tbody/tr[%s]/td[%s]/text()", row, column))
                .get();
        if (text == null) {
            return null;
        }
        text = text.trim();
        return text.isEmpty() ? null : text;
    }

    /** One proxy of the pool: address, port and whether it is still considered usable. */
    static class MyProxy {

        private String addr;
        private String port;
        // TRUE while the proxy is usable; set to FALSE after a failed request.
        private Boolean state;

        public String getAddr() {
            return addr;
        }

        public void setAddr(String addr) {
            this.addr = addr;
        }

        public String getPort() {
            return port;
        }

        public void setPort(String port) {
            this.port = port;
        }

        public Boolean getState() {
            return state;
        }

        public void setState(Boolean state) {
            this.state = state;
        }
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值