packagecatchhtml;importjava.io.BufferedReader;importjava.io.IOException;importjava.io.InputStreamReader;importjava.net.MalformedURLException;importjava.net.URL;importjava.util.ArrayList;importjava.util.List;importjava.util.regex.Matcher;importjava.util.regex.Pattern;public classGetHtmlTitle {publicGetHtmlTitle(String htmlUrl){
System.out.println("/n------------开始读取网页(" + htmlUrl + ")-----------");
String htmlSource= "";
htmlSource= getHtmlSource(htmlUrl);//获取htmlUrl网址网页的源码
System.out.println("------------读取网页(" + htmlUrl + ")结束-----------/n");
System.out.println("------------分析(" + htmlUrl + ")结果如下-----------/n");
String title=getTitle(htmlSource);
System.out.println("网站标题: " +title);
}/*** 根据网址返回网页的源码
*@paramhtmlUrl
*@return
*/
publicString getHtmlSource(String htmlUrl){
URL url;
StringBuffer sb= newStringBuffer();try{
url= newURL(htmlUrl);
BufferedReader in= new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));//读取网页全部内容
String temp;while ((temp = in.readLine()) != null)
{
sb.append(temp);
}
in.close();
}catch(MalformedURLException e) {
System.out.println("你输入的URL格式有问题!请仔细输入");
}catch(IOException e) {
e.printStackTrace();
}returnsb.toString();
}/*** 从html源码(字符串)中去掉标题
*@paramhtmlSource
*@return
*/
publicString getTitle(String htmlSource){
List list = new ArrayList();
String title= "";//Pattern pa = Pattern.compile("
.*?", Pattern.CANON_EQ);也可以Pattern pa = Pattern.compile("
.*?");//源码中标题正则表达式Matcher ma =pa.matcher(htmlSource);while (ma.find())//寻找符合el的字串
{
list.add(ma.group());//将符合el的字串加入到list中
}for (int i = 0; i < list.size(); i++)
{
title= title +list.get(i);
}returnoutTag(title);
}/*** 去掉html源码中的标签
*@params
*@return
*/
publicString outTag(String s)
{return s.replaceAll("<.>", "");
}public static voidmain(String[] args) {
String htmlUrl= "http://www.157buy.com";newGetHtmlTitle(htmlUrl);
}
}