本文参考了其他博主的文章，并在原有基础上增加了博客园文章的下载方式。本想引用原文，但目前已找不到原博客；日后找到了再把原博客链接补上。
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
/**
*
* Jsoup解析网页
* maven依赖
* <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
* <dependency>
* <groupId>org.jsoup</groupId>
* <artifactId>jsoup</artifactId>
* <version>1.13.1</version>
* </dependency>
* <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
* <dependency>
* <groupId>org.apache.poi</groupId>
* <artifactId>poi</artifactId>
* <version>4.1.2</version>
* </dependency>
* <!-- https://mvnrepository.com/artifact/org.apache.directory.studio/org.apache.commons.io -->
* <dependency>
* <groupId>org.apache.directory.studio</groupId>
* <artifactId>org.apache.commons.io</artifactId>
* <version>2.4</version>
* </dependency>
*
*/
public class JsoupDownload {
public static void main(String[] args) throws IOException, URISyntaxException {
//下载地址
String download="F:/upload/xxxx.html";
// String doc="F:/upload/jxl-excel.doc";
//CSDN网页url
String uri="https://blog.csdn.net/Sun_Loading/article/details/125892377";
//下载
writeCSDNHtmlFile(uri,new File(download));
//writeCSDNWordFile(uri,new File(doc));
//博客园下载
//String b_uri="https://www.cnblogs.com/glb79809-glb/p/14365629.html";
//writeBKYHtmlFile(b_uri,new File(download));
//下载doc
//String doc="F:/upload/nodejs的版本管理工具nvm1.docx";
//writeBKYWordFile(b_uri,new File(doc));
}
public static void test1() throws IOException {
Document document = Jsoup.connect("http://www.baidu.com").get();
System.out.println(document.title());
}
/**从csdn截取正文
* @param uriStr
* @throws IOException
*/
public static void writeCSDNHtmlFile(String uriStr,File localFile) throws IOException {
FileUtils.writeStringToFile(localFile,readHtml(uriStr),"UTF-8");
}
/**从bky截取正文
* @param uriStr
* @throws IOException
*/
public static void writeBKYHtmlFile(String uriStr,File localFile) throws IOException {
FileUtils.writeStringToFile(localFile,readHtmlBKY(uriStr),"UTF-8");
}
/**从csdn截取正文
* @param uriStr
* @throws IOException
*/
public static void writeCSDNWordFile(String uriStr,File wordFile) {
writeDocFile(wordFile,readHtml(uriStr),"UTF-8");
}
/**从博客园截取正文
* @param uriStr
* @throws IOException
*/
public static void writeBKYWordFile(String uriStr,File wordFile) {
writeDocFile(wordFile,readHtmlBKY(uriStr),"UTF-8");
}
/**
* 从csdn读取和加工正文
* @param uriStr
* @return
*/
private static String readHtml(String uriStr){
StringBuffer sb=new StringBuffer();
sb.append("<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">");
try {
URI uri=new URI(uriStr);
Document doc= Jsoup.parse(uri.toURL(),10000);
//解决页面不能复制的问题
Elements style = doc.select("style");
style.remove(0);
sb.append(style.outerHtml());
Elements elements=doc.select("link[rel=\"stylesheet\"]");
String url=null;
for(Element element:elements){
url=element.attr("href");
sb.append("<style type=\"text/css\" url='");
sb.append(url);
sb.append("'>");
sb.append(IOUtils.toString(new URI(url),"UTF-8"));
sb.append("</style>");
sb.append("\r\n");
}
sb.append("<style type=\"text/css\">");
sb.append("#article_content,.fontclass{font-family:\"Microsoft YaHei\"}");
sb.append("</style>");
sb.append("</head><body style=\"padding: 100px\">");
sb.append("<h1 class='title-article'>下载地址:<b>").append(uriStr).append("</b></h1>");
doc.select("#article_content p").toggleClass("fontclass");
sb.append(doc.select(".blog-content-box").html());
sb.append("</body></html>");
} catch (Exception e) {
e.printStackTrace();
}
return sb.toString();
}
/**
* 博客园
* @param uriStr
* @return
*/
private static String readHtmlBKY(String uriStr){
StringBuffer sb=new StringBuffer();
sb.append("<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">");
try {
URI uri=new URI(uriStr);
Document doc= Jsoup.parse(uri.toURL(),10000);
sb.append(doc.select("style").outerHtml());
Elements elements=doc.select("link[rel=\"stylesheet\"]");
String url=null;
for(Element element:elements){
url=element.attr("href");
sb.append("<style type=\"text/css\" url='");
sb.append(url);
sb.append("'>");
sb.append(IOUtils.toString(new URI("https://www.cnblogs.com"+url),"UTF-8"));
sb.append("</style>");
sb.append("\r\n");
}
sb.append("<style type=\"text/css\">");
sb.append("#article_content,.fontclass{font-family:\"Microsoft YaHei\"}");
sb.append("</style>");
sb.append("</head><body style=\"padding: 100px\">");
sb.append("<h1 class='title-article'>下载地址:<b>").append(uriStr).append("</b></h1>");
doc.select("#article_content p").toggleClass("fontclass");
sb.append(doc.select("#topics").outerHtml());
sb.append("</body></html>");
} catch (Exception e) {
e.printStackTrace();
}
return sb.toString();
}
public static boolean writeDocFile( File file, String html,String encoding) {
boolean w = false;
File fileDir=file.getParentFile();
if (!fileDir.exists()) {
fileDir.mkdirs();
}
try {
byte b[] = html.getBytes(encoding);
ByteArrayInputStream bais = new ByteArrayInputStream(b);
POIFSFileSystem poifs = new POIFSFileSystem();
DirectoryEntry directory = poifs.getRoot();
DocumentEntry documentEntry = directory.createDocument(
"WordDocument", bais);
FileOutputStream ostream = new FileOutputStream(file);
poifs.writeFilesystem(ostream);
bais.close();
ostream.close();
}catch(IOException e){
e.printStackTrace();
}
return w;
}
}
CSDN下载效果
博客园下载效果