Java 采集网页数据:获取 HTML 文本节点
第一种:采用 HttpURLConnection
packagecom.yjf.util;
importjava.io.BufferedReader;
importjava.io.IOException;
importjava.io.InputStream;
importjava.io.InputStreamReader;
importjava.net.HttpURLConnection;
importjava.net.URL;
publicclassHttpWebUtil {
publicstaticString GetWebContent(String urlString,finalString charset,inttimeout)throwsIOException {
if(urlString ==null|| urlString.length() ==0) {
return"";
}
urlString = (urlString.startsWith("http://") || urlString.startsWith("https://")) ? urlString : ("http://"+ urlString).intern();
URL url =newURL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setDoOutput(true);
conn.setRequestProperty("Pragma","no-cache");
conn.setRequestProperty("Cache-Control","no-cache");
inttemp = Integer.parseInt(Math.round(Math.random()*(UserAgent.length-1))+"");
conn.setRequestProperty(
"User-Agent",
UserAgent[temp]);// 模拟手机系统
conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,**任意,就是tomcat/conf/web里面定义那些
conn.setConnectTimeout(timeout);
try{
if(conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
return"";
}
}catch(Exception e) {
try{
System.out.println(e.get