jsoup爬取指定网页的url和图片

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;

/**
 * Created by Administrator on 2016/3/10.
 */
/**
 * Created by Administrator on 2016/3/10.
 *
 * Simple single-purpose crawler:
 *  - fetches page HTML via {@code TestHTTPClient} (defined in the previous article),
 *  - extracts article / image URLs with jsoup selectors,
 *  - downloads images with Apache HttpClient into a local folder.
 */
public class Crawler {

    // NOTE(review): these two fields were unused in the original; kept for
    // compatibility but nothing reads them.
    private static final Object signal = new Object();
    private static int count = 0;

    /**
     * Fetches the raw HTML of the given URL.
     *
     * @param url page URL to download
     * @return the response body, or {@code null} if the request failed
     */
    public static String getRequestByURL(String url) {
        System.out.println("打开网页----");
        try {
            return TestHTTPClient.get(url);
        } catch (Exception e) {
            System.out.println("打开网页出错---");
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts article links from a CSDN article-list page.
     *
     * @param pageContext page HTML (CSDN blog list page)
     * @return href values of every anchor inside {@code span.link_title}
     */
    public static List<String> getArticleURLs(String pageContext) {
        List<String> articleURLs = new ArrayList<String>();
        System.out.println("寻找专题");
        Document document = Jsoup.parseBodyFragment(pageContext);
        // Anchors nested in the title span carry the article URLs.
        Elements elements = document.select("span.link_title").select("a");
        for (Element element : elements) {
            articleURLs.add(element.attr("href"));
        }
        return articleURLs;
    }

    /**
     * Extracts image URLs from a page.
     *
     * @param pageContext page HTML
     * @return src values of every {@code <img>} inside an {@code <a target=_blank>}
     */
    public static List<String> getImgURLS(String pageContext) {
        System.out.println("开始查找图片");
        List<String> imgURLs = new ArrayList<String>();
        Document document = Jsoup.parseBodyFragment(pageContext);
        Elements elements = document.select("a[target=_blank] img[src]");
        for (Element element : elements) {
            imgURLs.add(element.attr("src"));
        }
        return imgURLs;
    }

    /**
     * Downloads one image to {@code imgPath}, using the last URL path segment
     * as the file name. Failures are logged, not thrown.
     *
     * @param imgURL  image URL (ignored when null)
     * @param imgPath target directory (must already exist)
     */
    public static void savePic(String imgURL, String imgPath) {
        if (imgURL == null) {
            return;
        }
        String[] str = imgURL.split("/");
        String fileName = str[str.length - 1];
        String savePath = imgPath + File.separator + fileName;
        HttpGet httpGet = new HttpGet(imgURL);
        CloseableHttpClient httpClient = TestHTTPClient.getHttpClient();
        // try-with-resources guarantees the response and both streams are
        // closed even when the copy fails (the original leaked the response
        // always, and leaked both streams on any exception).
        try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
             InputStream inputStream = httpResponse.getEntity().getContent();
             OutputStream outputStream = new FileOutputStream(savePath)) {
            IOUtils.copy(inputStream, outputStream);
            System.out.println("保存图片成功!");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("图片保存失败!");
        }
    }

    /**
     * Parses {@code pageContext} for image URLs on 10 worker threads.
     * The original version never called {@code start()} (threads were created
     * and discarded) and its run loop was an empty infinite busy-spin; each
     * worker now performs a single parse pass.
     *
     * @param pageContext page HTML shared by all workers
     */
    public static void begin(final String pageContext) {
        for (int i = 0; i < 10; i++) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    System.out.println("当前进入线程的是:" + Thread.currentThread().getName());
                    List<String> imgURLS = getImgURLS(pageContext);
                    if (imgURLS != null && imgURLS.size() > 0) {
                        // NOTE(review): the original left this branch empty;
                        // result handling is still TODO.
                    }
                }
            }).start(); // BUGFIX: start() was missing — the threads never ran.
        }
    }

    public static void main(String[] args) {
//        爬取个人csdn目录视图所有文章
//        String url="http://blog.csdn.net/huxiweng/article/list/";
//        int maxPage=6;
//        for (int i=0;i<maxPage;i++){
//            String pageContext= getRequestByURL(url+""+(i+1));
//            System.out.println("开始寻找第"+(i+1)+"页面文章");
//            List<String> articleURLs = getArticleURLs(pageContext);
//            for (String articleURL : articleURLs) {
//                System.out.println(articleURL);
//            }
//        }

        // Crawl the NetEase/Firefox photo channel pages and save every image.
        final String imgPath = "E:/img";
        File file = new File(imgPath);
        if (!file.exists()) {
            file.mkdir();
        }
        String url = "http://photo.firefox.163.com/";
        System.out.println("begin");
        long begin = System.currentTimeMillis();
        for (int i = 0; i < 10; i++) {
            // BUGFIX: "爬取第"+i+1 concatenated i and 1 as strings ("01", "11", ...);
            // parenthesize so the page number prints correctly.
            System.out.println("爬取第" + (i + 1) + "图片");
            String pageContext = getRequestByURL(url + "roll_" + (i + 1) + ".html");
            // Guard against a failed fetch (getRequestByURL returns null) —
            // the original passed null straight into jsoup and crashed.
            if (pageContext == null) {
                continue;
            }
            List<String> imgURLS = getImgURLS(pageContext);
            for (String imgURL : imgURLS) {
                savePic(imgURL, imgPath);
            }
        }

        System.out.println("耗时:" + String.valueOf(System.currentTimeMillis() - begin));
    }
}
其中 TestHTTPClient 用的是上一篇文章中定义的类!
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值