JAVA爬虫

此种方法只适用于网页源代码中直接含有所要提取信息的情况。
此例中,我们所要获取信息的网页是:“https://blog.csdn.net/”,该页面中包含多篇不同的文章;程序先取得各篇文章的链接,再进一步获取每篇文章的信息。

   import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import javax.net.ssl.*;
    import java.io.IOException;
    import java.security.KeyManagementException;
    import java.security.NoSuchAlgorithmException;
    import java.security.cert.CertificateException;
    import java.security.cert.X509Certificate;
    import java.util.ArrayList;
    import java.util.List;

/**
 * Entry point of the crawler.
 *
 * <p>Fetches a listing page, collects the absolute links of the articles found on it and hands
 * them to {@code testThread}, which saves every article.
 *
 * <p><b>Security note:</b> the static initializer installs a trust manager and a hostname
 * verifier that accept <em>every</em> HTTPS certificate and host name for the whole JVM. This
 * disables TLS authentication and is only acceptable for a throw-away demo crawler.
 */
public class test {
    /** Browser-like User-Agent sent with the listing request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                    + "Chrome/63.0.3239.132 Safari/537.36";

    static
    {
        try
        {
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier
                    (
                            new HostnameVerifier()
                            {
                                @Override
                                public boolean verify(String urlHostName, SSLSession session)
                                {
                                    // Accepts any host name; see the class-level security note.
                                    return true;
                                }
                            }
                    );
        } catch (Exception e)  {
            // Best effort: keep the class loadable, but do not hide the failure.
            System.err.println("Could not install the permissive HTTPS settings: " + e);
        }
    }

    /**
     * Makes {@link HttpsURLConnection} accept any server certificate by default.
     *
     * @throws NoSuchAlgorithmException if no provider supports the requested protocol
     * @throws KeyManagementException if the SSL context cannot be initialised
     */
    private static void trustAllHttpsCertificates()
            throws NoSuchAlgorithmException, KeyManagementException
    {
        TrustManager[] trustAllCerts = new TrustManager[] { new TrustAllManager() };
        // "TLS" is the standard protocol name for SSLContext.
        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        HttpsURLConnection.setDefaultSSLSocketFactory(
                sc.getSocketFactory());
    }

    /** Trust manager that performs no validation at all (insecure, demo only). */
    private static class TrustAllManager
            implements X509TrustManager
    {
        @Override
        public X509Certificate[] getAcceptedIssuers()
        {
            // The X509TrustManager contract requires a non-null (possibly empty) array.
            return new X509Certificate[0];
        }

        @Override
        public void checkServerTrusted(X509Certificate[] certs,
                                       String authType)
                throws CertificateException
        {
            // Intentionally empty: every server certificate is accepted.
        }

        @Override
        public void checkClientTrusted(X509Certificate[] certs,
                                       String authType)
                throws CertificateException
        {
            // Intentionally empty: every client certificate is accepted.
        }
    }

    /**
     * Collects the article links found on a listing page.
     *
     * @param Url address of the listing page to download
     * @return absolute URLs of all {@code <a>} elements inside {@code <h2>} elements that sit
     *         under elements with class {@code title}; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public static List<String>  getText(String Url) throws IOException {
        List<String> urlList = new ArrayList<>();
        String rule = "abs:href"; // the "abs:" prefix makes jsoup return the absolute address
        // Download and parse the page source.
        Document document = Jsoup.connect(Url)
                .timeout(4000)
                .ignoreContentType(true)
                .userAgent(USER_AGENT)
                .get();
        Elements urlNode = document.getElementsByClass("title").select("h2").select("a");
        for(Element element : urlNode){
            urlList.add(element.attr(rule)); // link of the article
        }
        return urlList;
    }

    /**
     * Crawls the CSDN blog front page and saves every article linked from it.
     *
     * @param args unused
     * @throws IOException if the front page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        List<String> urlist = getText("https://blog.csdn.net/");
        testThread testThread = new testThread(urlist);
        // NOTE: run() is invoked directly, so the articles are processed on the calling thread.
        testThread.run();
    }
}
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a list of article pages, stores the text of each article as a {@code .txt} file under
 * {@link #Path} and schedules a {@code DownloadTask} for every image found in the article body.
 */
public class testThread extends Thread {
    /** Matches the HTML non-breaking-space entity, with or without its trailing semicolon. */
    private static final Pattern NBSP = Pattern.compile("&nbsp;?", Pattern.CASE_INSENSITIVE);

    /** Characters that are not allowed in file names (Windows rules plus control characters). */
    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /** Number of worker threads used for image downloads. */
    private static final int IMAGE_THREADS = 9;

    /** URLs of the articles to save. */
    List<String> urlist;

    /**
     * @param urlist URLs of the articles to save
     */
    public testThread(List<String> urlist){
        this.urlist = urlist;
    }

    /** Output directory for the saved articles. */
    public static String Path = "D:\\CSDN\\";

    /**
     * Creates the given file, including missing parent directories, if it does not exist yet.
     *
     * @param fileName file to create
     * @throws Exception if a parent directory or the file itself cannot be created
     */
    public static void createFile(File fileName) throws Exception {
        File parent = fileName.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            throw new IOException("Cannot create directory " + parent);
        }
        if (!fileName.exists()) {
            fileName.createNewFile();
        }
    }

    /**
     * Writes text to a file as UTF-8, replacing any previous content.
     *
     * @param content text to write
     * @param fileName target file
     * @throws Exception if the file cannot be opened or written
     */
    public static void writeTxtFile(String content, File fileName) throws Exception {
        // try-with-resources closes the stream even when write() fails.
        try (FileOutputStream out = new FileOutputStream(fileName)) {
            out.write(content.getBytes("UTF-8"));
        }
    }

    /**
     * Processes every URL in {@link #urlist}: saves the article text and queues its images for
     * download. A failure on one article is reported and does not stop the remaining ones.
     */
    @Override
    public void run(){
        if (urlist == null || urlist.isEmpty()) {
            return;
        }
        // One pool shared by all articles instead of a new pool per article.
        ExecutorService pool = Executors.newFixedThreadPool(IMAGE_THREADS);
        try {
            for (String url : urlist) {
                try {
                    saveArticle(url, pool);
                } catch (Exception e) {
                    System.err.println("Failed to save article " + url + ": " + e);
                }
            }
        } finally {
            // Already submitted downloads still finish; no new ones are accepted.
            pool.shutdown();
        }
    }

    /** Saves the text of one article and submits its images to the given pool. */
    private void saveArticle(String url, ExecutorService pool) throws Exception {
        Document document = Jsoup.connect(url).timeout(6000).get();
        String title = toSafeFileName(document.title());

        Elements paragraphs = document.select("#article_content").select("p");
        String content = paragraphs.html();
        for (Element image : paragraphs.select("img")) {
            // absUrl resolves relative links against the page URL and yields "" when it cannot.
            String src = image.absUrl("src");
            if (!src.isEmpty()) {
                pool.execute(new DownloadTask(src));
            }
        }

        File file = new File(Path + title + ".txt");
        createFile(file);
        System.out.println("创建文件:"+file.getPath());
        writeTxtFile(FileterHtml(content), file);
    }

    /** Turns a page title into a string that is usable as a file name. */
    private static String toSafeFileName(String title) {
        String cleaned = title == null ? "" : ILLEGAL_FILE_CHARS.matcher(title).replaceAll("_").trim();
        return cleaned.isEmpty() ? "untitled" : cleaned;
    }

    /**
     * Strips a few HTML remnants from article text: {@code <br>} becomes a line break, and
     * {@code <strong>}, {@code </strong>}, {@code </br>} and {@code &nbsp;} are removed.
     *
     * @param content HTML fragment to clean; {@code null} is treated as empty
     * @return the cleaned text
     */
    public String FileterHtml(String content){
        if (content == null) {
            return "";
        }
        String cleaned = content
                .replace("<br>", "\n")
                .replace("<strong>", "")
                .replace("</br>", "")
                .replace("</strong>", "");
        // Remove the whole entity, including the ';' that a plain "&nbsp" match leaves behind.
        return NBSP.matcher(cleaned).replaceAll("");
    }
}
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Downloads one image over HTTP(S) and stores it in the output directory.
 *
 * <p>Failures (bad URL, unsupported protocol, I/O error) are reported on standard error and never
 * propagate out of {@link #run()}.
 */
public class DownloadTask implements Runnable{
    /** Directory the downloaded images are written to. */
    private static final String OUTPUT_DIR = "D:\\CSDN\\";

    /** Connect and read timeout, so an unresponsive server cannot block a worker forever. */
    private static final int TIMEOUT_MILLIS = 6000;

    /** Address of the image to download. */
    String str;

    /**
     * @param str address of the image to download
     */
    public DownloadTask(String str){
        this.str = str;
    }

    /**
     * Derives the local file name for an image URL: the last path segment without query string or
     * fragment, with characters that are illegal in file names replaced by {@code _}, followed by
     * {@code .jpg}. A URL without a last segment yields {@code image.jpg}.
     *
     * @param url image address
     * @return file name (without directory) to store the image under
     */
    static String fileNameFor(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        String name = path.substring(path.lastIndexOf('/') + 1)
                .replaceAll("[\\\\:*?\"<>|]", "_");
        if (name.isEmpty()) {
            name = "image";
        }
        return name + ".jpg";
    }

    /** Downloads the image and writes it to the output directory. */
    @Override
    public void run(){
        if (str == null || str.isEmpty()) {
            System.err.println("Skipping image download: no URL given");
            return;
        }
        HttpURLConnection conn = null;
        try {
            Object raw = new URL(str).openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. file: or ftp: URLs; a blind cast would throw ClassCastException.
                System.err.println("Skipping non-HTTP image URL: " + str);
                return;
            }
            conn = (HttpURLConnection) raw;
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);

            // Make sure the target directory exists; a failure surfaces when the file is opened.
            new java.io.File(OUTPUT_DIR).mkdirs();
            String file = OUTPUT_DIR + fileNameFor(str);

            // Both streams are closed automatically, also when the copy fails half-way.
            try (InputStream in = conn.getInputStream();
                 FileOutputStream out = new FileOutputStream(file)) {
                byte[] buf = new byte[8192];
                int size;
                while(-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
            }
            // Download finished.
            String name = Thread.currentThread().getName();
            System.out.println(name + "下载" + file);

        } catch (MalformedURLException e) {
            System.err.println("Invalid image URL '" + str + "': " + e.getMessage());
        } catch (IOException e) {
            System.err.println("Failed to download " + str + ": " + e);
        } finally {
            if(conn != null) {
                conn.disconnect();
            }
        }
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值