使用 Jsoup 解析并下载 CSDN、博客园文章

本文参考了其他博主的文章,并在其基础上增加了博客园文章的下载方式。本来想引用原文,但暂时找不到原文链接了,等找到后再补上。


import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Saves blog articles from CSDN and cnblogs.com to a local file, using jsoup to
 * fetch and parse the page.
 *
 * <p>For each article the inline {@code <style>} elements are copied, every linked
 * stylesheet is downloaded and inlined, and the article container is extracted, so
 * the resulting HTML is self-contained as far as CSS is concerned.
 *
 * <p>Maven dependencies used by this class:
 * <pre>{@code
 * <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
 * <dependency>
 *     <groupId>org.jsoup</groupId>
 *     <artifactId>jsoup</artifactId>
 *     <version>1.13.1</version>
 * </dependency>
 * <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
 * <dependency>
 *     <groupId>org.apache.poi</groupId>
 *     <artifactId>poi</artifactId>
 *     <version>4.1.2</version>
 * </dependency>
 * <!-- https://mvnrepository.com/artifact/org.apache.directory.studio/org.apache.commons.io -->
 * <dependency>
 *     <groupId>org.apache.directory.studio</groupId>
 *     <artifactId>org.apache.commons.io</artifactId>
 *     <version>2.4</version>
 * </dependency>
 * }</pre>
 * NOTE(review): Commons IO is usually pulled in as {@code commons-io:commons-io};
 * confirm which artifact the build should use.
 */
public class JsoupDownload {

    /** Character encoding used for every download and every file written by this class. */
    private static final String UTF_8 = "UTF-8";

    /** Timeout, in milliseconds, for fetching the article page. */
    private static final int FETCH_TIMEOUT_MILLIS = 10000;

    /** Opening markup shared by every generated document. */
    private static final String HTML_HEAD_OPEN =
            "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">";

    /**
     * Example entry point: downloads one CSDN article to a fixed local path.
     *
     * @param args unused
     * @throws IOException if the HTML file cannot be written
     * @throws URISyntaxException declared for source compatibility; not thrown by the current body
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        // Target file for the downloaded article.
        String download = "F:/upload/xxxx.html";

        // URL of the CSDN article to download.
        String uri = "https://blog.csdn.net/Sun_Loading/article/details/125892377";
        writeCSDNHtmlFile(uri, new File(download));

        // Other usages:
        //   writeCSDNWordFile(uri, new File("F:/upload/jxl-excel.doc"));
        //   String bkyUri = "https://www.cnblogs.com/glb79809-glb/p/14365629.html";
        //   writeBKYHtmlFile(bkyUri, new File(download));
        //   writeBKYWordFile(bkyUri, new File("F:/upload/article.doc"));
    }

    /**
     * Connectivity smoke test: fetches a fixed page and prints its title.
     *
     * @throws IOException if the page cannot be fetched
     */
    public static void test1() throws IOException {
        Document document = Jsoup.connect("http://www.baidu.com").get();
        System.out.println(document.title());
    }

    /**
     * Downloads a CSDN article and writes it as a UTF-8 HTML file.
     *
     * <p>If fetching or parsing the article fails, the failure is logged and the file
     * contains only the markup produced up to that point.
     *
     * @param uriStr absolute URL of the CSDN article
     * @param localFile file to write
     * @throws IOException if the file cannot be written
     */
    public static void writeCSDNHtmlFile(String uriStr, File localFile) throws IOException {
        FileUtils.writeStringToFile(localFile, readHtml(uriStr), UTF_8);
    }

    /**
     * Downloads a cnblogs article and writes it as a UTF-8 HTML file.
     *
     * <p>If fetching or parsing the article fails, the failure is logged and the file
     * contains only the markup produced up to that point.
     *
     * @param uriStr absolute URL of the cnblogs article
     * @param localFile file to write
     * @throws IOException if the file cannot be written
     */
    public static void writeBKYHtmlFile(String uriStr, File localFile) throws IOException {
        FileUtils.writeStringToFile(localFile, readHtmlBKY(uriStr), UTF_8);
    }

    /**
     * Downloads a CSDN article and stores it through {@link #writeDocFile}.
     * Failures are logged, not thrown.
     *
     * @param uriStr absolute URL of the CSDN article
     * @param wordFile file to write
     */
    public static void writeCSDNWordFile(String uriStr, File wordFile) {
        writeDocFile(wordFile, readHtml(uriStr), UTF_8);
    }

    /**
     * Downloads a cnblogs article and stores it through {@link #writeDocFile}.
     * Failures are logged, not thrown.
     *
     * @param uriStr absolute URL of the cnblogs article
     * @param wordFile file to write
     */
    public static void writeBKYWordFile(String uriStr, File wordFile) {
        writeDocFile(wordFile, readHtmlBKY(uriStr), UTF_8);
    }

    /**
     * Builds a self-contained HTML document for a CSDN article.
     *
     * @param uriStr absolute URL of the article
     * @return the generated markup; truncated at the point of failure if the page
     *         could not be fetched or processed
     */
    private static String readHtml(String uriStr) {
        StringBuilder sb = new StringBuilder(HTML_HEAD_OPEN);
        try {
            Document doc = fetch(uriStr);
            // The first <style> element is dropped so that text in the saved page can
            // be copied. Guarded: a page without any <style> must not abort the export.
            Elements styles = doc.select("style");
            if (!styles.isEmpty()) {
                styles.remove(0);
            }
            appendHeadAndBanner(sb, doc, styles, uriStr);
            doc.select("#article_content p").toggleClass("fontclass");
            sb.append(doc.select(".blog-content-box").html());
            sb.append("</body></html>");
        } catch (Exception e) {
            // Best effort by design: callers still receive whatever was built so far.
            System.err.println("Failed to read article " + uriStr);
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Builds a self-contained HTML document for a cnblogs article.
     *
     * @param uriStr absolute URL of the article
     * @return the generated markup; truncated at the point of failure if the page
     *         could not be fetched or processed
     */
    private static String readHtmlBKY(String uriStr) {
        StringBuilder sb = new StringBuilder(HTML_HEAD_OPEN);
        try {
            Document doc = fetch(uriStr);
            appendHeadAndBanner(sb, doc, doc.select("style"), uriStr);
            doc.select("#article_content p").toggleClass("fontclass");
            sb.append(doc.select("#topics").outerHtml());
            sb.append("</body></html>");
        } catch (Exception e) {
            // Best effort by design: callers still receive whatever was built so far.
            System.err.println("Failed to read article " + uriStr);
            e.printStackTrace();
        }
        return sb.toString();
    }

    /** Fetches and parses the page at {@code uriStr}. */
    private static Document fetch(String uriStr) throws IOException, URISyntaxException {
        return Jsoup.parse(new URI(uriStr).toURL(), FETCH_TIMEOUT_MILLIS);
    }

    /**
     * Appends the rest of the {@code <head>} (inline styles, inlined stylesheets, font
     * override), opens the {@code <body>} and adds the source-URL heading.
     */
    private static void appendHeadAndBanner(
            StringBuilder sb, Document doc, Elements inlineStyles, String uriStr) {
        sb.append(inlineStyles.outerHtml());
        for (Element link : doc.select("link[rel=\"stylesheet\"]")) {
            // absUrl resolves relative and protocol-relative hrefs against the page URL
            // and leaves absolute ones alone, so no site-specific prefix is needed.
            appendInlinedStylesheet(sb, link.absUrl("href"));
        }
        sb.append("<style type=\"text/css\">");
        sb.append("#article_content,.fontclass{font-family:\"Microsoft YaHei\"}");
        sb.append("</style>");
        sb.append("</head><body style=\"padding: 100px\">");
        sb.append("<h1 class='title-article'>下载地址:<b>").append(uriStr).append("</b></h1>");
    }

    /**
     * Downloads one stylesheet and appends it as a {@code <style>} element. A stylesheet
     * that cannot be resolved or downloaded is skipped so it does not cost the article.
     */
    private static void appendInlinedStylesheet(StringBuilder sb, String cssUrl) {
        if (cssUrl.isEmpty()) {
            // href was missing or could not be turned into an absolute URL.
            return;
        }
        String css;
        try {
            css = IOUtils.toString(new URI(cssUrl), UTF_8);
        } catch (IOException | URISyntaxException | IllegalArgumentException e) {
            System.err.println("Skipping stylesheet " + cssUrl + ": " + e);
            return;
        }
        sb.append("<style type=\"text/css\" url='");
        sb.append(cssUrl);
        sb.append("'>");
        sb.append(css);
        sb.append("</style>");
        sb.append("\r\n");
    }

    /**
     * Writes {@code html} into an OLE2 (POIFS) container as a stream named
     * {@code WordDocument}, creating the parent directory if necessary.
     *
     * @param file target file
     * @param html markup to store
     * @param encoding charset name used to encode {@code html}
     * @return {@code true} if the file was written, {@code false} if an I/O error
     *         occurred (the error is logged)
     */
    public static boolean writeDocFile(File file, String html, String encoding) {
        // getParentFile() is null for a bare file name; nothing to create in that case.
        File parentDir = file.getParentFile();
        if (parentDir != null && !parentDir.exists()) {
            parentDir.mkdirs();
        }
        // try-with-resources closes all three resources even when a write fails.
        try (POIFSFileSystem poifs = new POIFSFileSystem();
             ByteArrayInputStream content = new ByteArrayInputStream(html.getBytes(encoding));
             FileOutputStream out = new FileOutputStream(file)) {
            DirectoryEntry root = poifs.getRoot();
            root.createDocument("WordDocument", content);
            poifs.writeFilesystem(out);
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }
}

csdn下载效果
在这里插入图片描述

博客园下载效果
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值