Java也可以爬取网页资源

任务:使用 Java 基于线程池,多线程爬取豆瓣网热门电影 Top 250。
废话不多说,直接上源码。
实体类:Movie

/**
 * Simple mutable holder for one movie entry: its title, its rating and a URL.
 *
 * <p>All three properties are plain strings and default to {@code null} until set.
 *
 * @author Wlient
 * @date 2020/9/15 14:30
 */
public class Movie {
    private String moviename;
    private String rate;
    private String url;

    /** @return the movie title, or {@code null} if it has not been set */
    public String getMoviename() {
        return this.moviename;
    }

    /** @param moviename the movie title to store */
    public void setMoviename(String moviename) {
        this.moviename = moviename;
    }

    /** @return the rating text, or {@code null} if it has not been set */
    public String getRate() {
        return this.rate;
    }

    /** @param rate the rating text to store */
    public void setRate(String rate) {
        this.rate = rate;
    }

    /** @return the stored URL, or {@code null} if it has not been set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders the three properties in the form
     * {@code Movie{moviename='...', rate='...', url='...'}}; unset properties print as
     * {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("moviename='").append(moviename).append('\'');
        text.append(", rate='").append(rate).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append('}');
        return text.toString();
    }
}

爬取信息:MoreMovice

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

/**
 * Task that fetches one page of a JSON movie listing and downloads every cover image on it.
 *
 * <p>The response body is expected to be a JSON object with a {@code subjects} array whose
 * elements carry {@code title}, {@code rate} and {@code cover} string fields.
 *
 * @author Wlient
 * @date 2020/9/15 14:54
 */
public class MoreMovice implements Runnable {
    private String url;

    /** @return the listing URL this task fetches */
    public String getUrl() {
        return url;
    }

    /** @param url the listing URL this task should fetch */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @param url the listing URL this task should fetch */
    public MoreMovice(String url) {
        this.url = url;
    }

    /**
     * Fetches the listing, prints the raw body and each parsed movie, and saves each cover image
     * through {@link #write(String, String)}.
     *
     * <p>Failures are printed and never propagated. A failed cover download only skips that one
     * movie; a failure to fetch or parse the listing abandons the whole page.
     */
    @Override
    public void run() {
        try {
            Document doc = Jsoup.connect(this.url).ignoreContentType(true).timeout(20000).get();
            String data = doc.getElementsByTag("body").text();
            System.out.println(data);
            JSONArray jsonArray = new JSONObject(data).getJSONArray("subjects");
            for (int i = 0; i < jsonArray.length(); i++) {
                JSONObject mov = jsonArray.getJSONObject(i);
                // A fresh instance per entry, so no state carries over between iterations.
                Movie movie = new Movie();
                movie.setMoviename(mov.getString("title"));
                movie.setRate(mov.getString("rate"));
                movie.setUrl(mov.getString("cover"));
                System.out.println(movie);
                try {
                    write(movie.getUrl(), movie.getMoviename());
                } catch (Exception e) {
                    // One broken image must not abort the remaining downloads of this page.
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            // Thread boundary: report the failure instead of letting it kill the worker.
            e.printStackTrace();
        }
    }

    /**
     * Downloads {@code urlString} and stores it as {@code 豆瓣热门电影top250/<title>.jpg},
     * replacing any file already present under that name.
     *
     * @param urlString address of the image to download
     * @param title     movie title used as the file name; characters that are not legal in file
     *                  names on common file systems are replaced with {@code _}
     * @throws Exception if the URL is malformed or the download or the file write fails
     */
    public static void write(String urlString, String title) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        // Without timeouts a stalled server would block the calling thread indefinitely.
        con.setConnectTimeout(20000);
        con.setReadTimeout(20000);

        // Titles may contain path separators or other characters that are invalid in file names.
        String safeTitle = title.replaceAll("[\\\\/:*?\"<>|]", "_");
        File file = new File("豆瓣热门电影top250/" + safeTitle + ".jpg");
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }

        // try-with-resources closes both streams even when the copy fails half-way.
        // The file is opened in overwrite mode: appending would corrupt an image that already
        // exists from an earlier run.
        try (InputStream is = con.getInputStream();
             FileOutputStream os = new FileOutputStream(file)) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        }
    }
}

主类:SpiderDemo

/**
 * @author Wlient
 * @date 2020/9/14 23:29
 */

import org.json.JSONException;

import java.io.IOException;
import java.net.URL;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Entry point: submits one {@code MoreMovice} task per listing page to a fixed pool of five
 * worker threads.
 */
public class SpiderDemo {
    /**
     * Queues the page tasks for offsets 0, 50, 100, 150 and 200 (50 entries per page, 250 in
     * total) and then shuts the pool down so the JVM can exit once every task has finished.
     *
     * @param args unused
     * @throws IOException   declared for compatibility with existing callers
     * @throws JSONException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException, JSONException {
        ExecutorService pool = Executors.newFixedThreadPool(5);
        try {
            // i < 250: with page_limit=50 the last needed offset is 200 (entries 200-249).
            for (int i = 0; i < 250; i += 50) {
                String url = "https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit=50&page_start="+i;
                // The pool runs the Runnable on its own workers; wrapping it in a Thread is unnecessary.
                pool.execute(new MoreMovice(url));
            }
        } finally {
            // shutdown() lets already-submitted tasks finish but releases the non-daemon
            // workers afterwards; without it the process never terminates.
            pool.shutdown();
        }
    }
}

结果:
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值