一、使用jsoup-1.8.1.jar(链接:http://jsoup.org/packages/jsoup-1.8.1.jar)
二、代码如下:
1、方法一:只使用jsoup,不使用正则过滤
package xiaoshuo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
/**
 * Downloads a fixed run of consecutive chapter pages and saves each chapter as a file.
 *
 * <p>For every page that answers with HTTP 200, the text of the {@code h1} elements is used as
 * the chapter title and the content of the elements with class {@code neirong} is reduced to
 * plain text with {@link Jsoup#clean}. The title and text are then handed to
 * {@code wenjian.createDir}.
 */
public class XiaoShuoZhaQu {

    /** Common prefix of every chapter URL; the page number and ".html" are appended to it. */
    private static final String CHAPTER_URL_PREFIX = "http://www.51shucheng.net/kehuan/santi/santi1/";

    /** Page number immediately before the first page that is fetched. */
    private static final int PAGE_BEFORE_FIRST = 174;

    /** Number of consecutive pages to fetch. */
    private static final int PAGE_COUNT = 36;

    /**
     * Fetches pages {@code PAGE_BEFORE_FIRST + 1} through {@code PAGE_BEFORE_FIRST + PAGE_COUNT}.
     * Pages that do not answer with HTTP 200 are skipped silently.
     *
     * @param args unused
     * @throws IOException if a connection cannot be opened or a page cannot be read
     */
    public static void main(String[] args) throws IOException {
        for (int i = 1; i <= PAGE_COUNT; i++) {
            int num = PAGE_BEFORE_FIRST + i;
            System.out.println(num);

            URL url = new URL(CHAPTER_URL_PREFIX + num + ".html");
            HttpURLConnection httpcon = (HttpURLConnection) url.openConnection();
            if (httpcon.getResponseCode() != HttpURLConnection.HTTP_OK) {
                continue;
            }

            // Let jsoup read the raw bytes itself. Joining readLine() results without a
            // separator can fuse tokens that the HTML splits across lines, and decoding with
            // the platform default charset garbles the text when it differs from the page's.
            // A null charset asks jsoup to detect the encoding from the document.
            Document page;
            try (InputStream inputStream = httpcon.getInputStream()) {
                page = Jsoup.parse(inputStream, null, url.toString());
            }

            String title = page.getElementsByTag("h1").text();
            String contentHtml = page.getElementsByClass("neirong").toString();
            // Whitelist.none() drops every tag; prettyPrint(false) keeps jsoup from re-indenting.
            String textw = Jsoup.clean(contentHtml, "", Whitelist.none(),
                    new Document.OutputSettings().prettyPrint(false));
            System.out.println(textw);

            wenjian.createDir(title, textw);
        }
    }
}
2、方法二:使用jsoup和正则过滤
package xiaoshuo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
public class XiaoShuoZhaQu {
public static void main(String\[\] args) throws IOException {
String strurl;
int num =173;
for (int i = 0; i < 36; i++) {
num+=1;
System.out.println(num);
URL url = new URL("http://www.51shucheng.net/kehuan/santi/santi1/"+num+".html");
URLConnection openConnection = url.openConnection();//返回一个URLConnection实例,表示与URL引用的远程对象的URL 。
HttpURLConnection httpcon = (HttpURLConnection)openConnection;
int responseCode = httpcon.getResponseCode();//从HTTP响应消息获取状态代码。
if(responseCode==httpcon.HTTP_OK){//如果连接成功,响应200
InputStream inputStream = httpcon.getInputStream();//返回从此打开的连接读取的输入流。
InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
BufferedReader in = new BufferedReader(inputStreamReader); //从字符输入流读取文本,缓冲字符,以提供字符,数组和行的高效读取。
String abc;
StringBuilder str = new StringBuilder();
while((abc=in.readLine()) != null){
str.append(abc);
/\* abc = abc.replaceAll("\\\<p>|</p>","");*/
}
String sss = str.toString();
Document parse = Jsoup.parse(sss);
Elements eles=parse.getElementsByTag("h1");
String title = eles.text();
Elements elementsByClass = parse.getElementsByClass("neirong");
String ddsa = elementsByClass.toString();
ddsa = ddsa.replaceAll("\\\<p>|</p>","");
ddsa = ddsa.replaceAll("<script\[^>\]*?>\[\\\s\\\S\]*?<\\\/script>", "");
ddsa = ddsa.replaceAll("<ins\[^>\]*?>\[\\\s\\\S\]*?<\\\/ins>", "");
StringBuffer sub = new StringBuffer(ddsa);
int lastIndexOf = sub.indexOf(">");
String ssdsd = ddsa.substring(lastIndexOf+1);
int sada = ssdsd.indexOf("<");
String wedwerw = ssdsd.substring(0, sada);
/\* String textw =Jsoup.clean(ddsa, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
System.out.println(textw); */
wenjian wenjian = new wenjian();
wenjian.createDir(title, wedwerw );
in.close();
}
}
}
3、方法一与方法二共用:输出doc格式的word文件到桌面
package xiaoshuo;
import java.io.File;
import java.io.FileOutputStream;
/**
 * Writes a string to a {@code .doc} file in a fixed desktop directory.
 */
public class wenjian {

    /**
     * Writes {@code sourceString} to {@code C:\Users\Anny\Desktop\<path>.doc}, creating missing
     * parent directories and overwriting any existing file.
     *
     * <p>The text is encoded with the platform default charset. I/O failures are reported on
     * standard error and are not propagated to the caller.
     *
     * @param path         file name without extension, appended to the desktop directory
     * @param sourceString text to write; when {@code null} nothing is written
     */
    public static void createDir(String path, String sourceString) {
        if (sourceString == null) {
            return;
        }
        String paths = "C:\\Users\\Anny\\Desktop\\" + path + ".doc";
        byte[] sourceByte = sourceString.getBytes();

        File file = new File(paths);
        // getParentFile() is null when the path has no directory part on this platform.
        File dir = file.getParentFile();
        if (dir != null) {
            dir.mkdirs();
        }

        // FileOutputStream creates the file if needed; try-with-resources closes it even when
        // the write fails.
        try (FileOutputStream outStream = new FileOutputStream(file)) {
            outStream.write(sourceByte);
        } catch (java.io.IOException | SecurityException e) {
            System.err.println("Failed to write " + paths);
            e.printStackTrace();
        }
    }
}