package grabtest;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Scanner;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
/**
 * Utility class for fetching a web page and extracting its plain text.
 *
 * @author asus-pc
 */
public class GrabUrl {

    /**
     * Downloads the resource at the given URL and returns its body as text.
     *
     * <p>The response bytes are decoded as UTF-8. Lines are read one at a time and
     * each is followed by a single {@code '\n'} in the result, whatever line
     * terminator the server used.
     *
     * @param url the URL to fetch; the opened connection is cast to
     *            {@link HttpURLConnection}
     * @return the response body with {@code '\n'} appended after every line
     * @throws Exception if the URL cannot be parsed, or connecting or reading fails
     */
    public static String getUrlText(String url) throws Exception {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.connect();
            StringBuilder text = new StringBuilder();
            // try-with-resources closes the reader (and the underlying stream) even when a read fails.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // The reader has already decoded the bytes as UTF-8. Do not round-trip the
                    // line through getBytes(): that uses the platform default charset and
                    // garbles non-ASCII characters when the default is not UTF-8.
                    text.append(line).append('\n');
                }
            }
            return text.toString();
        } finally {
            // Always release the connection, including on failure paths.
            connection.disconnect();
        }
    }

    /**
     * Recursively concatenates the text of every {@link TextNode} at or below the
     * given node, in child order.
     *
     * @param node the root of the subtree to walk
     * @return the text of {@code node} itself if it is a {@link TextNode}, otherwise
     *         the concatenation of the text extracted from each of its children
     */
    private static String extractText(Node node) {
        if (node instanceof TextNode) {
            return ((TextNode) node).text();
        }
        List<Node> children = node.childNodes();
        StringBuilder text = new StringBuilder();
        for (Node child : children) {
            text.append(extractText(child));
        }
        return text.toString();
    }

    /**
     * Parses an HTML string with jsoup and returns the text content of the
     * resulting document.
     *
     * @param html the HTML markup to parse
     * @return the concatenated text of all text nodes in the parsed document
     */
    public static String html2Str(String html) {
        Document doc = Jsoup.parse(html);
        return extractText(doc);
    }

    /**
     * Manual test entry point: prompts for a URL on standard input, fetches the
     * page and prints its extracted text to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The scanner is closed automatically once the URL has been processed.
        try (Scanner scanner = new Scanner(System.in)) {
            System.out.println("请输入网址:");
            String urlString = scanner.next();
            String html = getUrlText(urlString);
            String text = html2Str(html);
            System.out.println(text);
        } catch (Exception e) {
            // Top-level boundary of a manual test driver: report the failure and exit.
            e.printStackTrace();
        }
    }
}
需要用到 jsoup 的 jar 包，对应的 Maven 依赖如下：
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>