使用jsoup爬html

最新推荐文章于 2023-09-12 17:05:11 发布

chouke4165

最新推荐文章于 2023-09-12 17:05:11 发布

阅读量134

点赞数

文章标签： java

原文链接：https://my.oschina.net/jhy168/blog/1928465

版权

一、使用jsoup-1.8.1.jar（链接：http://jsoup.org/packages/jsoup-1.8.1.jar）

二、代码如下：

1、方法一：只使用jsoup，不使用正则过滤

package xiaoshuo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

/**
 * Downloads a run of consecutively numbered chapter pages and saves the text of each page to a
 * file named after the page's {@code <h1>} heading.
 *
 * <p>Variant 1: all markup is removed by jsoup itself ({@code Jsoup.clean} with an empty
 * whitelist); no regular expressions are used.
 */
public class XiaoShuoZhaQu {

    /** Common URL prefix; the page number and ".html" are appended to it. */
    private static final String BASE_URL = "http://www.51shucheng.net/kehuan/santi/santi1/";

    /** Number of the first page that is fetched. */
    private static final int FIRST_PAGE = 175;

    /** How many consecutive pages are fetched. */
    private static final int PAGE_COUNT = 36;

    /**
     * Fetches every page in the configured range and writes one output file per page.
     *
     * @param args unused
     * @throws IOException if a connection cannot be opened or a response cannot be read
     */
    public static void main(String[] args) throws IOException {
        for (int num = FIRST_PAGE; num < FIRST_PAGE + PAGE_COUNT; num++) {
            System.out.println(num);
            URL url = new URL(BASE_URL + num + ".html");
            HttpURLConnection httpcon = (HttpURLConnection) url.openConnection();
            try {
                int responseCode = httpcon.getResponseCode();
                if (responseCode != HttpURLConnection.HTTP_OK) {
                    // Report the skipped page instead of dropping it silently.
                    System.err.println("Skipping " + url + ": HTTP " + responseCode);
                    continue;
                }

                Document page;
                try (InputStream body = httpcon.getInputStream()) {
                    // A null charset lets jsoup pick the encoding from the page itself rather
                    // than decoding with the platform default. Parsing the raw stream also
                    // keeps the line breaks that line-by-line concatenation would discard.
                    page = Jsoup.parse(body, null, url.toString());
                }

                String title = page.getElementsByTag("h1").text();
                if (title.trim().isEmpty()) {
                    // No heading found: fall back to the page number so the file has a name.
                    title = String.valueOf(num);
                }

                String contentHtml = page.getElementsByClass("neirong").toString();
                // An empty whitelist removes every tag and leaves only the text content.
                String text = Jsoup.clean(contentHtml, "", Whitelist.none(),
                        new Document.OutputSettings().prettyPrint(false));
                System.out.println(text);

                wenjian.createDir(title, text);
            } finally {
                httpcon.disconnect();
            }
        }
    }
}

2、方法二：使用jsoup和正则过滤

package xiaoshuo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

public class XiaoShuoZhaQu {

public static void main(String\[\] args) throws IOException {   

         String strurl;  
         int num =173;  
         for (int i = 0; i < 36; i++) {   
         num+=1;   
         System.out.println(num);   
         URL url = new URL("http://www.51shucheng.net/kehuan/santi/santi1/"+num+".html");   
         URLConnection openConnection = url.openConnection();//返回一个URLConnection实例，表示与URL引用的远程对象的URL 。  

        HttpURLConnection httpcon = (HttpURLConnection)openConnection;  
        int responseCode = httpcon.getResponseCode();//从HTTP响应消息获取状态代码。   
        if(responseCode==httpcon.HTTP_OK){//如果连接成功，响应200  
            InputStream inputStream = httpcon.getInputStream();//返回从此打开的连接读取的输入流。   
            InputStreamReader inputStreamReader = new InputStreamReader(inputStream);  
             BufferedReader in = new BufferedReader(inputStreamReader); //从字符输入流读取文本，缓冲字符，以提供字符，数组和行的高效读取。  
             String abc;   

             StringBuilder str = new StringBuilder();  
             while((abc=in.readLine()) != null){  
                 str.append(abc);  

            /\* abc =  abc.replaceAll("\\\<p>|</p>","");*/      
             }  
             String sss = str.toString();  
             Document parse = Jsoup.parse(sss);   


             Elements eles=parse.getElementsByTag("h1");  
             String title = eles.text();    
             Elements elementsByClass = parse.getElementsByClass("neirong");  
             String ddsa = elementsByClass.toString();   

             ddsa =  ddsa.replaceAll("\\\<p>|</p>","");  
             ddsa = ddsa.replaceAll("<script\[^>\]*?>\[\\\s\\\S\]*?<\\\/script>", "");  
             ddsa = ddsa.replaceAll("<ins\[^>\]*?>\[\\\s\\\S\]*?<\\\/ins>", "");  


             StringBuffer sub =     new StringBuffer(ddsa);  
             int lastIndexOf = sub.indexOf(">");  
             String ssdsd = ddsa.substring(lastIndexOf+1);  
             int sada = ssdsd.indexOf("<");  

             String wedwerw = ssdsd.substring(0, sada);  
            /\* String textw =Jsoup.clean(ddsa, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));   
             System.out.println(textw); */  

             wenjian wenjian = new wenjian();  
             wenjian.createDir(title, wedwerw );  


             in.close();   

        }  

         }  

}

3、方法一与方法二共用：将文本写入桌面上扩展名为 .doc 的文件（内容为纯文本，并非真正的 Word 格式）

package xiaoshuo;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

public class wenjian {
/**

* @param paths //文件路径（路径+文件名）
* @param sourceString //待写入字符串
*/

public static void createDir(String path,String sourceString ) {
```
 String paths = "C:\\\Users\\\Anny\\\Desktop\\\"+path+".doc";  
```
/*String sourceString = "sourceString";*/ //待写入字符串
byte[] sourceByte = sourceString.getBytes();
if(null != sourceByte){
try {
File file = new File(paths); //文件路径（路径+文件名）
if (!file.exists()) { //文件不存在则创建文件，先创建目录
String parent = file.getParent();
File dir = new File(parent);
dir.mkdirs();
file.createNewFile();
}
FileOutputStream outStream = new FileOutputStream(file); //文件输出流用于将数据写入文件
outStream.write(sourceByte);
outStream.close(); //关闭文件输出流
} catch (Exception e) {
e.printStackTrace();
}
}
}