使用Java实现爬虫
话不多说，直接开干。
1.1 引入依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.8</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
1.2 编写主要代码
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
/**
 * Entry point: crawls the Douban Top-250 movie list (10 pages of 25 entries),
 * extracts each movie's title and poster image URL, and hands the image
 * downloads to a fixed-size thread pool via {@link DownloadRunnable}.
 */
public class Test {

    /** Movies per result page on Douban's top-250 listing. */
    private static final int PAGE_SIZE = 25;
    /** Pages to crawl: 10 * 25 covers the full top 250. */
    private static final int PAGE_COUNT = 10;
    /** Target directory for downloaded poster images. */
    private static final String IMG_DIR = "/Users/XXX/Desktop/img";

    public static void main(String[] args) throws Exception {
        // Fixed pool of 20 workers; unbounded queue holds pending downloads.
        ThreadPoolExecutor executor = new ThreadPoolExecutor(
                20, 20, 1, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());

        int rank = 1; // running movie rank across all pages
        for (int page = 0; page < PAGE_COUNT; page++) {
            // Pagination via the "start" offset: 0, 25, 50, ...
            String url = "https://movie.douban.com/top250?start=" + (page * PAGE_SIZE) + "&filter=";
            // Fetch and parse the page (Jsoup returns a browser-like Document).
            // The User-Agent header avoids the site's basic bot filtering.
            Document document = Jsoup.connect(url)
                    .header("User-Agent",
                            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0")
                    .get();
            // Every <li> inside the "article" container is one movie entry.
            for (Element article : document.getElementsByClass("article")) {
                for (Element item : article.getElementsByTag("li")) {
                    // Title comes from the poster <img alt="...">; prefix with rank.
                    String title = rank + "-" + item.getElementsByTag("img").attr("alt");
                    String imgUrl = item.getElementsByTag("img").attr("src");
                    executor.submit(new DownloadRunnable(imgUrl, IMG_DIR, title));
                    rank++;
                }
            }
        }
        // Stop accepting new tasks, then wait for queued downloads to finish
        // so main() does not return while downloads are still in flight.
        executor.shutdown();
        executor.awaitTermination(10, TimeUnit.MINUTES);
    }
}
1.3 编写多线程下载任务
import org.springframework.util.StringUtils;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
/**
 * Runnable task that downloads a single file over HTTP into a target
 * directory, deriving the file name from the Content-Disposition header,
 * the URL path, or a caller-supplied title (in that order of preference).
 */
public class DownloadRunnable implements Runnable {

    private final String urlPath;          // source URL of the file to download
    private final String targetDirectory;  // destination directory on disk
    private final String title;            // optional base name override (may be null/empty)

    public DownloadRunnable(String urlPath, String targetDirectory, String title) {
        this.urlPath = urlPath;
        this.targetDirectory = targetDirectory;
        this.title = title;
    }

    @Override
    public void run() {
        try {
            download(urlPath, targetDirectory, title);
        } catch (Exception e) {
            // Include the URL so a failed task is traceable in the pool's logs.
            throw new RuntimeException("download failed: " + urlPath, e);
        }
    }

    /**
     * Downloads {@code urlPath} into {@code targetDirectory}.
     *
     * @param urlPath         source URL
     * @param targetDirectory destination directory (must exist)
     * @param title           optional base name for the saved file
     * @throws Exception on connection or I/O failure
     */
    public void download(String urlPath, String targetDirectory, String title) throws Exception {
        System.out.println("url:" + urlPath);
        URL url = new URL(urlPath);
        HttpURLConnection http = (HttpURLConnection) url.openConnection();
        http.setConnectTimeout(3000);
        // Also bound the read: without this a stalled server hangs the worker forever.
        http.setReadTimeout(10000);
        // Set a User-Agent to avoid being blocked by basic bot filtering.
        http.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)");
        try {
            String contentType = http.getContentType();
            System.out.println("contentType: " + contentType);
            // Reported size (-1 when the server omits Content-Length).
            long length = http.getContentLengthLong();
            System.out.println("文件大小:" + (length / 1024) + "KB");
            String fileName = getFileName(http, urlPath, title);

            int count = 0; // number of chunks copied
            // try-with-resources guarantees both streams close even if the copy fails.
            try (InputStream in = http.getInputStream();
                 OutputStream out = new FileOutputStream(new File(targetDirectory, fileName))) {
                byte[] buff = new byte[1024 * 10];
                int len;
                while ((len = in.read(buff)) != -1) {
                    out.write(buff, 0, len);
                    ++count;
                }
            }
            System.out.println("count:" + count);
        } finally {
            // Always release the connection, even when the download fails.
            http.disconnect();
        }
    }

    /**
     * Resolves the name to save the file under: Content-Disposition header
     * first, then the last URL path segment; if {@code title} is non-empty it
     * replaces the base name while keeping the original extension.
     */
    private String getFileName(HttpURLConnection http, String urlPath, String title)
            throws UnsupportedEncodingException {
        String headerField = http.getHeaderField("Content-Disposition");
        String fileName = null;
        if (headerField != null) {
            String decoded = URLDecoder.decode(headerField, "UTF-8");
            // Expected shape: attachment; filename="xxx" — guard against malformed headers.
            String[] parts = decoded.split(";");
            if (parts.length > 1 && parts[1].contains("=")) {
                fileName = parts[1].split("=")[1].replaceAll("\"", "");
                System.out.println("文件名是: " + fileName);
            }
        }
        if (fileName == null || fileName.isEmpty()) {
            // Fall back to the last segment of the URL path.
            String[] arr = urlPath.split("/");
            fileName = arr[arr.length - 1];
            System.out.println("url中获取文件名:" + fileName);
        }
        if (title != null && !title.isEmpty()) {
            // Keep the real extension (lastIndexOf handles multi-dot names and
            // no-dot names, which the old split("\\.")[1] approach crashed on).
            int dot = fileName.lastIndexOf('.');
            fileName = (dot >= 0) ? title + fileName.substring(dot) : title;
        }
        return fileName;
    }
}