CSDN 博客正文中的图片一般不能直接复制,需要先复制到微信等工具再粘贴出来,比较麻烦。
因此写了个工具类来下载博客正文,可以方便地生成 HTML 和 Word 文件。
最关键的是,可以基本原封不动地保留格式,顶多细节上有小出入。
依赖:jsoup 和 POI(代码中还用到了 commons-io)。
package test.test2019;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
/**
 * Downloads the body of a CSDN blog article and saves it either as a
 * standalone HTML file or as a {@code .doc} file wrapping the same HTML.
 *
 * <p>The page's stylesheets are inlined so the saved copy keeps the original
 * formatting without further network access for CSS.
 *
 * <p>Requires jsoup, Apache POI and commons-io on the classpath.
 *
 * Created by admin on 2019/1/15.
 */
public class JsoupTest {

    /** Timeout, in milliseconds, for fetching the article page. */
    private static final int FETCH_TIMEOUT_MILLIS = 10000;

    /**
     * Fetches a CSDN article and builds a self-contained HTML document from it.
     *
     * <p>The result contains the page's inline {@code <style>} elements, the
     * content of every linked stylesheet that could be downloaded, a heading
     * with the source address, and the {@code .blog-content-box} element.
     *
     * <p>Failures are best-effort: a stylesheet that cannot be downloaded is
     * skipped; if the page itself cannot be fetched the stack trace is printed
     * and the HTML built so far is returned.
     *
     * @param uriStr absolute URL of the article
     * @return the assembled HTML (possibly incomplete if the fetch failed)
     */
    private static String readHtml(String uriStr) {
        // Purely local buffer, so the unsynchronized StringBuilder is sufficient.
        StringBuilder sb = new StringBuilder();
        sb.append("<html><head>");
        try {
            URI uri = new URI(uriStr);
            Document doc = Jsoup.parse(uri.toURL(), FETCH_TIMEOUT_MILLIS);
            sb.append(doc.select("style").outerHtml());
            Elements links = doc.select("link[rel=\"stylesheet\"]");
            for (Element link : links) {
                appendInlinedStylesheet(sb, link);
            }
            sb.append("<style type=\"text/css\">");
            sb.append("#article_content,.fontclass{font-family:\"Microsoft YaHei\"}");
            sb.append("</style>");
            sb.append("</head><body>");
            sb.append("<h1 class='title-article'>下载地址:<b>").append(uriStr).append("</b></h1>");
            // addClass rather than toggleClass: toggling would strip the class
            // from any paragraph that already carries it.
            doc.select("#article_content p").addClass("fontclass");
            sb.append(doc.select(".blog-content-box").outerHtml());
            sb.append("</body></html>");
        } catch (IOException | URISyntaxException | IllegalArgumentException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Downloads the stylesheet referenced by a {@code <link>} element and
     * appends it to {@code sb} as an inline {@code <style>} element.
     *
     * <p>Nothing is appended when the href cannot be resolved to an absolute
     * URL or the download fails, so one broken link does not truncate the page.
     *
     * @param sb   buffer receiving the generated markup
     * @param link the {@code <link rel="stylesheet">} element
     */
    private static void appendInlinedStylesheet(StringBuilder sb, Element link) {
        // absUrl resolves relative and protocol-relative hrefs against the
        // document's base URI; it yields "" when no absolute URL can be formed.
        String url = link.absUrl("href");
        if (url.isEmpty()) {
            return;
        }
        String css;
        try {
            css = IOUtils.toString(new URI(url), "UTF-8");
        } catch (IOException | URISyntaxException | IllegalArgumentException e) {
            e.printStackTrace();
            return;
        }
        // Fetch first, append afterwards, so a failure never leaves a dangling tag.
        sb.append("<style type=\"text/css\" url='");
        sb.append(url);
        sb.append("'>");
        sb.append(css);
        sb.append("</style>");
        sb.append("\r\n");
    }

    /**
     * Writes HTML into an OLE2 container as its {@code WordDocument} entry and
     * saves the container to {@code file}.
     *
     * <p>Missing parent directories are created. I/O errors are printed to
     * standard error and reported through the return value.
     *
     * @param file     destination file
     * @param html     HTML content to store
     * @param encoding name of the charset used to encode {@code html}
     * @return {@code true} if the file was written, {@code false} on an I/O error
     */
    public static boolean writeDocFile(File file, String html, String encoding) {
        File fileDir = file.getParentFile();
        // getParentFile() is null for a bare file name; nothing to create then.
        if (fileDir != null && !fileDir.exists()) {
            fileDir.mkdirs();
        }
        try {
            byte[] bytes = html.getBytes(encoding);
            try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
                 POIFSFileSystem poifs = new POIFSFileSystem()) {
                DirectoryEntry directory = poifs.getRoot();
                directory.createDocument("WordDocument", bais);
                // Open the target only once the container is ready, so a failure
                // above does not leave an empty file behind.
                try (FileOutputStream ostream = new FileOutputStream(file)) {
                    poifs.writeFilesystem(ostream);
                }
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Downloads a CSDN article and saves it as a Word-readable {@code .doc} file.
     *
     * <p>Errors are printed to standard error rather than thrown.
     *
     * @param uriStr   absolute URL of the article
     * @param wordFile destination file
     */
    public static void writeCSDNWordFile(String uriStr, File wordFile) {
        writeDocFile(wordFile, readHtml(uriStr), "UTF-8");
    }

    /**
     * Downloads a CSDN article and saves it as a UTF-8 encoded HTML file.
     *
     * @param uriStr    absolute URL of the article
     * @param localFile destination file
     * @throws IOException if the file cannot be written
     */
    public static void writeCSDNHtmlFile(String uriStr, File localFile) throws IOException {
        FileUtils.writeStringToFile(localFile, readHtml(uriStr), "UTF-8");
    }

    /**
     * Demo entry point: saves one sample article as both HTML and Word files.
     *
     * @param args unused
     * @throws IOException        if the HTML file cannot be written
     * @throws URISyntaxException declared for compatibility; not thrown by this method's calls
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        String html = "D:/test/word/jxl-excel.html";
        String doc = "D:/test/word/jxl-excel.doc";
        String uri = "https://blog.csdn.net/a1091662876/article/details/87722035";
        writeCSDNHtmlFile(uri, new File(html));
        writeCSDNWordFile(uri, new File(doc));
    }
}