import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
/**
 * Minimal page crawler built on jsoup (HTML parsing) and Apache HttpClient (downloads).
 *
 * <p>Pages are fetched through the project helper {@code TestHTTPClient}; link and image
 * addresses are extracted with CSS selectors, and images are written to a local directory.
 * All methods are static and write progress messages to standard output.
 *
 * Created by Administrator on 2016/3/10.
 */
public class Crawler {
    /** Directory that downloaded images are written to. */
    private static final String IMG_DIR = "E:/img";
    /** Base address of the photo channel that {@link #main(String[])} crawls. */
    private static final String PHOTO_BASE_URL = "http://photo.firefox.163.com/";
    /** Number of "roll_N.html" pages that {@link #main(String[])} walks through. */
    private static final int PAGE_COUNT = 10;

    /**
     * Fetches {@code url} through {@code TestHTTPClient.get}.
     *
     * @param url address to request
     * @return whatever {@code TestHTTPClient.get} returns (presumably the response body —
     *         confirm against that class), or {@code null} when the call throws
     */
    public static String getRequestByURL(String url) {
        System.out.println("打开网页----");
        try {
            return TestHTTPClient.get(url);
        } catch (Exception e) {
            System.out.println("打开网页出错---");
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Collects the {@code href} of every {@code <a>} found under {@code span.link_title}.
     *
     * @param pageContext HTML of the page; may be {@code null} (e.g. after a failed fetch)
     * @return the extracted hrefs in document order; empty when nothing matches or the
     *         input is {@code null}, never {@code null}
     */
    public static List<String> getArticleURLs(String pageContext) {
        List<String> articleURLs = new ArrayList<String>();
        System.out.println("寻找专题");
        if (pageContext == null) {
            // getRequestByURL reports failure as null; jsoup rejects a null document.
            return articleURLs;
        }
        Document document = Jsoup.parseBodyFragment(pageContext);
        Elements links = document.select("span.link_title").select("a");
        for (Element link : links) {
            articleURLs.add(link.attr("href"));
        }
        return articleURLs;
    }

    /**
     * Collects the {@code src} of every {@code <img src>} nested inside an
     * {@code <a target="_blank">}.
     *
     * @param pageContext HTML of the page; may be {@code null} (e.g. after a failed fetch)
     * @return the extracted image addresses in document order; empty when nothing matches
     *         or the input is {@code null}, never {@code null}
     */
    public static List<String> getImgURLS(String pageContext) {
        System.out.println("开始查找图片");
        List<String> imgURLs = new ArrayList<String>();
        if (pageContext == null) {
            // getRequestByURL reports failure as null; jsoup rejects a null document.
            return imgURLs;
        }
        Document document = Jsoup.parseBodyFragment(pageContext);
        for (Element img : document.select("a[target=_blank] img[src]")) {
            imgURLs.add(img.attr("src"));
        }
        return imgURLs;
    }

    /**
     * Downloads {@code imgURL} and stores it in {@code imgPath} under the last path segment
     * of the URL (query string and fragment removed).
     *
     * <p>Failures are reported on standard output and never propagate to the caller, so one
     * bad image does not stop a crawl. A {@code null} URL is ignored silently.
     *
     * @param imgURL  address of the image; {@code null} is a no-op
     * @param imgPath existing directory to write the file into
     */
    public static void savePic(String imgURL, String imgPath) {
        if (imgURL == null) {
            return;
        }
        String fileName = fileNameFromUrl(imgURL);
        if (fileName.length() == 0) {
            // Nothing usable to name the file after (e.g. the URL is "" or only slashes).
            System.out.println("图片保存失败!");
            return;
        }
        String savePath = imgPath + File.separator + fileName;
        CloseableHttpClient httpClient = TestHTTPClient.getHttpClient();
        try {
            // Built inside the try: HttpGet rejects syntactically invalid URIs by throwing.
            HttpGet httpGet = new HttpGet(imgURL);
            // Closing the response releases the underlying connection even on failure.
            try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
                HttpEntity entity = httpResponse.getEntity();
                if (entity == null) {
                    // A response may legitimately carry no body.
                    System.out.println("图片保存失败!");
                    return;
                }
                try (InputStream inputStream = entity.getContent();
                     OutputStream outputStream = new FileOutputStream(savePath)) {
                    IOUtils.copy(inputStream, outputStream);
                }
            }
            System.out.println("保存图片成功!");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("图片保存失败!");
        }
    }

    /**
     * Derives a local file name from an image URL: drops the query string and fragment,
     * then takes the last non-empty {@code '/'}-separated segment.
     *
     * @param imgURL non-null image address
     * @return the file name, or {@code ""} when the URL has no usable segment
     */
    private static String fileNameFromUrl(String imgURL) {
        String path = imgURL;
        int cut = path.indexOf('?');
        if (cut >= 0) {
            path = path.substring(0, cut);
        }
        cut = path.indexOf('#');
        if (cut >= 0) {
            path = path.substring(0, cut);
        }
        // String.split drops trailing empty strings, so "/" yields an empty array.
        String[] segments = path.split("/");
        return segments.length == 0 ? "" : segments[segments.length - 1];
    }

    /**
     * Builds ten worker threads whose task repeatedly extracts image URLs from
     * {@code pageContext}.
     *
     * <p>NOTE(review): the threads are constructed but {@code start()} is never called, so
     * this method currently has no observable effect. The task body also loops forever and
     * discards its result, so simply starting the threads would spin ten cores; the
     * intended work distribution needs to be designed before this is enabled.
     *
     * @param pageContext HTML of the page the workers would scan
     */
    public static void begin(final String pageContext) {
        for (int i = 0; i < 10; i++) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    while (true) {
                        System.out.println("当前进入线程的是:" + Thread.currentThread().getName());
                        List<String> imgURLS = getImgURLS(pageContext);
                        if (imgURLS != null && imgURLS.size() > 0) {
                            // Intentionally empty in the original: no work is done yet.
                        }
                    }
                }
            });
        }
    }

    /**
     * Crawls pages {@code roll_1.html} .. {@code roll_10.html} of the photo channel, saves
     * every image found, and prints the elapsed time in milliseconds.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Crawl every article in a personal CSDN blog's list view (kept for reference):
        // String url="http://blog.csdn.net/huxiweng/article/list/";
        // int maxPage=6;
        // for (int i=0;i<maxPage;i++){
        // String pageContext= getRequestByURL(url+""+(i+1));
        // System.out.println("开始寻找第"+(i+1)+"页面文章");
        // List<String> articleURLs = getArticleURLs(pageContext);
        // for (String articleURL : articleURLs) {
        // System.out.println(articleURL);
        // }
        // }

        // Crawl the images of the Firefox browser photo channel.
        File targetDir = new File(IMG_DIR);
        // mkdirs also creates missing parents; stop early if the directory is unusable.
        if (!targetDir.exists() && !targetDir.mkdirs()) {
            System.out.println("无法创建图片目录:" + IMG_DIR);
            return;
        }
        System.out.println("begin");
        long begin = System.currentTimeMillis();
        for (int i = 0; i < PAGE_COUNT; i++) {
            // Compute the 1-based page number once; "..."+i+1 would concatenate as text.
            int pageNo = i + 1;
            System.out.println("爬取第" + pageNo + "图片");
            String pageContext = getRequestByURL(PHOTO_BASE_URL + "roll_" + pageNo + ".html");
            for (String imgURL : getImgURLS(pageContext)) {
                savePic(imgURL, IMG_DIR);
            }
        }
        System.out.println("耗时:" + String.valueOf(System.currentTimeMillis() - begin));
    }
}
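// Note: TestHTTPClient is the helper class from the previous article.
// Article title: "Crawling the URLs and images of a given web page with jsoup".
// (Blog page footer: latest recommended article published 2023-11-03 14:57:57.)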