任务:使用 Java 线程池,以多线程方式爬取豆瓣网热门电影 Top 250 的信息及封面图片。
废话不多说,直接上源码。
实体类:Movie
/**
 * Mutable data holder for a single movie: its name, its rating and a URL,
 * all kept as plain strings.
 *
 * @author Wlient
 * @date 2020/9/15 14:30
 */
public class Movie {
    private String moviename;
    private String rate;
    private String url;

    /**
     * @return the movie name, or {@code null} if it has not been set
     */
    public String getMoviename() {
        return this.moviename;
    }

    /**
     * @param moviename the movie name to store
     */
    public void setMoviename(String moviename) {
        this.moviename = moviename;
    }

    /**
     * @return the rating string, or {@code null} if it has not been set
     */
    public String getRate() {
        return this.rate;
    }

    /**
     * @param rate the rating string to store
     */
    public void setRate(String rate) {
        this.rate = rate;
    }

    /**
     * @return the URL string, or {@code null} if it has not been set
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @param url the URL string to store
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders all three fields, each wrapped in single quotes, in the form
     * {@code Movie{moviename='..', rate='..', url='..'}}.
     *
     * @return the textual representation of this movie
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("moviename='").append(moviename).append('\'');
        text.append(", rate='").append(rate).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append('}');
        return text.toString();
    }
}
爬取信息:MoreMovice
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
 * Runnable task that fetches one JSON page from the configured URL, prints every
 * entry of its {@code "subjects"} array as a {@link Movie}, and downloads each
 * entry's cover image into the local output directory.
 *
 * @author Wlient
 * @date 2020/9/15 14:54
 */
public class MoreMovice implements Runnable {
    /** Timeout in milliseconds applied to the page request and to image downloads. */
    private static final int TIMEOUT_MILLIS = 20000;

    /** Directory (relative to the working directory) that receives the cover images. */
    private static final String OUTPUT_DIR = "豆瓣热门电影top250/";

    private String url;

    /**
     * @param url address of the JSON page this task will fetch
     */
    public MoreMovice(String url) {
        this.url = url;
    }

    /**
     * @return address of the JSON page this task fetches
     */
    public String getUrl() {
        return url;
    }

    /**
     * @param url address of the JSON page this task should fetch
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Fetches the page, parses it as JSON and processes every element of the
     * {@code "subjects"} array. Failures are printed to standard error and never
     * propagate out of the task.
     */
    @Override
    public void run() {
        try {
            // Read the raw response body: the payload is JSON, so routing it through
            // the HTML parser and text() could alter it before it is parsed.
            String data = Jsoup.connect(this.url)
                    .ignoreContentType(true)
                    .timeout(TIMEOUT_MILLIS)
                    .execute()
                    .body();
            System.out.println(data);
            JSONArray subjects = new JSONObject(data).getJSONArray("subjects");
            for (int i = 0; i < subjects.length(); i++) {
                handleSubject(subjects.getJSONObject(i));
            }
        } catch (Exception e) {
            // Covers I/O failures and malformed JSON alike; the task just reports them.
            e.printStackTrace();
        }
    }

    /**
     * Prints one subject and downloads its cover. An error on one entry is
     * reported but does not stop the remaining entries of the page.
     */
    private void handleSubject(JSONObject mov) {
        try {
            // A fresh instance per entry, so no state is carried between iterations.
            Movie movie = new Movie();
            movie.setMoviename(mov.getString("title"));
            movie.setRate(mov.getString("rate"));
            movie.setUrl(mov.getString("cover"));
            System.out.println(movie);
            write(movie.getUrl(), movie.getMoviename());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the resource at {@code urlString} and stores it as
     * {@code <title>.jpg} inside the output directory, replacing any file of the
     * same name left over from an earlier run.
     *
     * @param urlString address of the image to download
     * @param title     movie title, used verbatim as the file name
     * @throws Exception if the URL is malformed or the download or write fails
     */
    public static void write(String urlString, String title) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        // Without timeouts a stalled server would block this worker thread forever.
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);

        // NOTE(review): title is not sanitized; a title containing a path separator
        // would be written into a subdirectory of the output directory.
        File file = new File(OUTPUT_DIR + title + ".jpg");
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }

        // try-with-resources closes both streams even when the copy fails. The file
        // is opened in overwrite mode: appending would corrupt an image that already
        // exists from a previous run.
        try (InputStream is = con.getInputStream();
             FileOutputStream os = new FileOutputStream(file)) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        }
    }
}
主类:SpiderDemo
/**
* @author Wlient
* @date 2020/9/14 23:29
*/
import org.json.JSONException;
import java.io.IOException;
import java.net.URL;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Entry point: splits the 250-entry listing into pages and submits one
 * {@link MoreMovice} task per page to a fixed-size thread pool.
 */
public class SpiderDemo {
    /** Number of entries requested per page; also the step between page offsets. */
    private static final int PAGE_SIZE = 50;

    /** Total number of entries to request across all pages. */
    private static final int TOTAL_MOVIES = 250;

    /** Number of worker threads in the pool. */
    private static final int THREAD_COUNT = 5;

    /**
     * Submits one crawl task per page offset (0, 50, ..., 200) and then shuts
     * the pool down so the JVM can exit once all submitted tasks have finished.
     *
     * @param args unused
     */
    public static void main(String[] args) throws IOException, JSONException {
        ExecutorService pool = Executors.newFixedThreadPool(THREAD_COUNT);
        try {
            // Strictly less than TOTAL_MOVIES: an offset of 250 would request a
            // sixth page, i.e. entries beyond the first 250.
            for (int start = 0; start < TOTAL_MOVIES; start += PAGE_SIZE) {
                String url = "https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit="
                        + PAGE_SIZE + "&page_start=" + start;
                // The pool supplies the threads; the task is handed over directly.
                pool.execute(new MoreMovice(url));
            }
        } finally {
            // shutdown() lets already-submitted tasks run to completion but releases
            // the worker threads afterwards; without it the JVM never terminates.
            pool.shutdown();
        }
    }
}
结果: