package cn.gurong.gurongproduction.util;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers for probing whether a URL exists and for stripping unwanted
 * whitespace and markup from HTML text.
 */
public class BaseHtmlParser {

    /**
     * Everything {@link #replace(String)} deletes: runs of two or more whitespace
     * characters, every tab / CR / LF, the literal tags {@code </br>}, {@code </BR>}
     * and {@code <BR>}, a same-line {@code <iframe ...>...</iframe>} element together
     * with its content, and opening/closing anchor tags (the text between the anchor
     * tags is kept). Note that {@code <a.*?>} / {@code <A.*?>} also match any other
     * tag whose name starts with "a"/"A". Compiled once because the expression never
     * changes.
     */
    private static final Pattern STRIP_PATTERN = Pattern.compile(
            "\\s{2,}|\t|\r|\n|</br>|</BR>|<BR>|</br>|<iframe.*?>(.*?)</iframe>|<A.*?>|</A>|<a.*?>|</a>");

    /**
     * Checks whether the given URL is valid by sending an HTTP {@code HEAD} request.
     *
     * @param urlstr the URL to probe
     * @return {@code false} if the URL cannot be parsed, is not an HTTP(S) URL, the
     *         request fails, the server sends no status message, or the server
     *         answers "404 / Not Found"; {@code true} otherwise
     */
    public static boolean isExit(String urlstr) {
        HttpURLConnection c = null;
        try {
            URL url = new URL(urlstr);
            c = (HttpURLConnection) url.openConnection();
            c.setRequestMethod("HEAD");
            String message = c.getResponseMessage();
            if (message == null) {
                // No parsable status line; previously this surfaced as a caught
                // NullPointerException, so the answer stays "false".
                return false;
            }
            // Servers may send a 404 with a custom reason phrase, so check the
            // numeric status as well as the textual message.
            boolean notFound = c.getResponseCode() == HttpURLConnection.HTTP_NOT_FOUND
                    || "Not Found".equals(message);
            return !notFound;
        } catch (Exception e) {
            // Any failure (malformed URL, non-HTTP protocol, I/O error) means the
            // URL is treated as not valid.
            return false;
        } finally {
            // Always release the connection, including on the early-return paths.
            if (c != null) {
                c.disconnect();
            }
        }
    }

    /**
     * Removes redundant whitespace, line breaks, {@code <iframe>} elements and anchor
     * tags from the given text.
     *
     * @param str the text to clean; may be {@code null}
     * @return the cleaned text, or {@code null} when {@code str} is {@code null}
     */
    public static String replace(String str) {
        if (str == null) {
            return null;
        }
        Matcher m = STRIP_PATTERN.matcher(str);
        return m.replaceAll("");
    }
}
// HtmlParser crawler
/**
 * Retrieves the content of a research-report style article from the Tencent
 * website (per the original author's note).
 *
 * <p>Opens a connection to {@code url} with a two-minute connect timeout, parses
 * the response as GBK, and takes the first node selected by
 * {@code HasAttributeFilter("class", "ArticleCnt")}.
 *
 * @param url address of the article page
 * @return the HTML of the first matching node, or {@code null} when nothing
 *         matches or any exception occurs (the stack trace is printed)
 */
public static String getContent(String url){
    try {
        URLConnection connection = new URL(url).openConnection();
        connection.setConnectTimeout(1000 * 60 * 2);

        Parser pageParser = new Parser(connection);
        pageParser.setEncoding("gbk");

        NodeList matches = pageParser.extractAllNodesThatMatch(
                new HasAttributeFilter("class", "ArticleCnt"));
        if (matches == null) {
            return null;
        }

        Node firstMatch = matches.elementAt(0);
        if (firstMatch == null) {
            return null;
        }
        return firstMatch.toHtml();
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}