Java多线程爬虫

package 爬虫;
import 多线程爬虫.DownloadImage;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Entry point of the crawler: reads one listing page line by line, extracts the
 * image URLs found on it and hands each one to a thread pool as a
 * {@link DownloadImage} task.
 */
public class pachong2 {
    /** Page whose HTML is scanned for image links. */
    private static final String LISTING_URL = "http://www.bookschina.com/kinder/27000000/";

    /** Only lines containing this attribute are searched for image URLs. */
    private static final String LAZY_IMAGE_MARKER = "class=\"lazyImg\"";

    /**
     * Matches an {@code http:} URL ending in {@code .jpg} that directly follows
     * {@code data-original="}. Compiled once because it is reused for every line;
     * the dot before {@code jpg} is escaped so it only matches a literal dot.
     */
    private static final Pattern IMAGE_URL_PATTERN =
            Pattern.compile("(?<=data-original=\")http:.*?\\.jpg");

    /**
     * Creates a fixed pool of 100 worker threads, schedules the downloads and then
     * requests an orderly shutdown of the pool.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ExecutorService pool = Executors.newFixedThreadPool(100);
        try {
            getUrl(pool);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // shutdown() lets already submitted downloads finish; without it the
            // pool's worker threads would keep the JVM alive.
            pool.shutdown();
        }
    }

    /**
     * Downloads the listing page and submits one {@link DownloadImage} task to
     * {@code pool} for every image URL found on it. Any failure is printed as a
     * stack trace and ends the scan; nothing is rethrown to the caller.
     *
     * @param pool executor that runs the download tasks
     */
    public static void getUrl(ExecutorService pool) {
        try {
            URL url = new URL(LISTING_URL);
            URLConnection connection = url.openConnection();
            // try-with-resources closes the reader and the underlying connection
            // stream even when reading or scheduling fails part-way through.
            try (InputStream body = connection.getInputStream();
                 BufferedReader reader = new BufferedReader(new InputStreamReader(body, "GBK"))) {
                // Prints the Content-Encoding response header (may be null); this
                // is not the character set, the page is always decoded as GBK here.
                System.out.println(connection.getContentEncoding());

                String line;
                while ((line = reader.readLine()) != null) {
                    if (!line.contains(LAZY_IMAGE_MARKER)) {
                        continue;
                    }
                    Matcher matcher = IMAGE_URL_PATTERN.matcher(line);
                    while (matcher.find()) {
                        String imageUrl = matcher.group();
                        System.out.println(imageUrl);
                        pool.execute(new DownloadImage(imageUrl));
                        System.out.println(pool);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

package 爬虫;
import java.io.*;
import java.net.*;


/**
 * Task that downloads the resource at one URL and stores it as a file.
 *
 * <p>{@code Thread} itself implements {@code Runnable}, so the two are not
 * alternatives of the same kind. Implementing {@code Runnable} is enough for a
 * simple unit of work like this one and lets the task be handed to an executor;
 * extending {@code Thread} is only worth it when the thread itself needs
 * customised behaviour.
 */
public class DownloadImage implements Runnable{
    /** Address of the resource to download. */
    String downUrl;

    /**
     * @param downUrl address of the resource to download; the text after its last
     *                {@code /} is used as the local file name
     */
    public DownloadImage(String downUrl){
        this.downUrl = downUrl;
    }

    /**
     * Copies the content behind {@link #downUrl} into {@code E:\text\<file name>}.
     * Both streams are closed when the copy ends, whether it succeeds or fails.
     * Failures are reported on standard error and not rethrown, so one failed
     * download does not affect other tasks.
     */
    @Override
    public void run(){
        try{
            URL url = new URL(downUrl);
            URLConnection uc = url.openConnection();
            // The last path segment of the URL becomes the local file name.
            String[] segments = downUrl.split("/");
            String path = "E:\\text\\"+segments[segments.length-1];
            // The input stream is opened first so that no empty file is created
            // when the remote resource cannot be read.
            try (InputStream enter = uc.getInputStream();
                 OutputStream writer = new FileOutputStream(path)) {
                // Copy in chunks instead of one byte (and one system call) at a time.
                byte[] buffer = new byte[8192];
                int count;
                while((count=enter.read(buffer))!=-1){
                    writer.write(buffer, 0, count);
                }
            }
        }catch(Exception e){
            // Include the URL: many tasks run concurrently and the bare exception
            // does not say which download failed.
            System.err.println("Failed to download " + downUrl + ": " + e);
        }
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值