/**
 * Downloads an HTML page and reduces it to its main body text.
 *
 * <p>The page is fetched with {@link #getOneHtml(String)}. Style blocks, script blocks,
 * anchor elements and all remaining tags are then stripped, a leading newline is
 * prepended, and the result is filtered by {@link #getText(String)}.
 *
 * @param url address of the page to fetch; when {@code null} or empty, the Baidu
 *            civil-news RSS feed that this method previously always used is fetched
 * @return the filtered text, or {@code null} if the page could not be read
 */
public String parser(String url)
{
    // The argument used to be overwritten unconditionally with this address, which made
    // the parameter useless. It is now only the fallback for a missing URL.
    if (url == null || url.length() == 0)
    {
        url = "http://news.baidu.com/n?cmd=1&class=civilnews&tn=rss&sub=0";
    }
    String parse = null;
    try
    {
        final String content = getOneHtml(url);
        parse = getText("\n" + getTab(getLink(getScript(getCSS(content)))));
    }
    catch (IOException e)
    {
        // Best effort: report the failure and signal it to the caller with null.
        e.printStackTrace();
    }
    return parse;
}
/* *//**
* @param args
*//*
public static void main(String[] args)
{
ParserHtml ph = new ParserHtml();
* String content = ""; String title = ""; String parse=""; try {
* content = parser(); title = getTitle(content);
* parse=getTab(getLink(getScript(getCSS(content))));
* System.out.println(parse); parserText(parse); //
* System.out.println(getCSS(content)); //System.out.println("title:" +
* title); } catch (IOException e) { // TODO Auto-generated catch block
* e.printStackTrace(); }
ph.parser("D:\\aa.html");
}*/
/** Matches a whole title element; group 1 is the content between the two tags. */
private static final Pattern TITLE_ELEMENT =
        Pattern.compile("<title\\b[^>]*>(.*?)</title\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

/**
 * Extracts the text of every {@code <title>} element in the given HTML.
 *
 * <p>Matching ignores case, allows attributes on the opening tag and lets the title span
 * several lines. The contents of all matches are concatenated in document order, and any
 * markup nested inside them is stripped with {@link #outTag(String)}.
 *
 * @param s HTML source to search; must not be {@code null}
 * @return the concatenated title text, or an empty string if no title element is found
 */
public static String getTitle(final String s)
{
    final StringBuilder titles = new StringBuilder();
    final Matcher matcher = TITLE_ELEMENT.matcher(s);
    while (matcher.find())
    {
        // Only the content is collected, so the title tags themselves never reach outTag.
        titles.append(matcher.group(1));
    }
    return outTag(titles.toString());
}
/** Matches one tag, from {@code <} to the nearest {@code >}, even across line breaks. */
private static final Pattern OUT_TAG_MARKUP = Pattern.compile("<.*?>", Pattern.DOTALL);

/**
 * Removes every HTML tag from the given text, leaving the text between tags in place.
 *
 * <p>A tag is anything from a {@code <} up to the nearest following {@code >}. Tags whose
 * attributes wrap onto several lines are removed as well.
 *
 * @param s text to clean; must not be {@code null}
 * @return the text with all tags removed
 */
public static String outTag(final String s)
{
    return OUT_TAG_MARKUP.matcher(s).replaceAll("");
}
/** Matches a complete style element, tags included, in any letter case. */
private static final Pattern STYLE_ELEMENT =
        Pattern.compile("<style\\b.*?</style\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

/**
 * Removes every {@code <style>} element, including its CSS content, from the given HTML.
 *
 * <p>Tag names are matched case-insensitively and an element may span several lines.
 *
 * @param s HTML source to clean; must not be {@code null}
 * @return the HTML with all style elements removed
 */
public static String getCSS(final String s)
{
    // One regex pass replaces the former find-then-literal-replace loop. That loop also
    // expected "<STYLE>" as the upper-case closing tag, so it never removed such blocks.
    return STYLE_ELEMENT.matcher(s).replaceAll("");
}
/** Matches a complete script element, tags included, in any letter case. */
private static final Pattern SCRIPT_ELEMENT =
        Pattern.compile("<script\\b.*?</script\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

/**
 * Removes every {@code <script>} element, including its code, from the given HTML.
 *
 * <p>Tag names are matched case-insensitively and an element may span several lines.
 *
 * @param s HTML source to clean; must not be {@code null}
 * @return the HTML with all script elements removed
 */
public static String getScript(final String s)
{
    // One regex pass replaces the former find-then-literal-replace loop, which rescanned
    // the whole text for every match and missed mixed-case tags such as <Script>.
    return SCRIPT_ELEMENT.matcher(s).replaceAll("");
}
/** Matches one tag, from {@code <} to the nearest {@code >}, even across line breaks. */
private static final Pattern TAB_MARKUP = Pattern.compile("<.*?>", Pattern.DOTALL);

/**
 * Removes every HTML tag from the given text, leaving the text between tags in place.
 *
 * <p>A tag is anything from a {@code <} up to the nearest following {@code >}, including
 * tags that span several lines.
 *
 * @param s HTML source to clean; must not be {@code null}
 * @return the text with all tags removed
 */
public static String getTab(final String s)
{
    // Each match is removed where it was found. The former loop deleted the matched
    // text as a literal everywhere, which could break up a later match and leave part
    // of it behind.
    return TAB_MARKUP.matcher(s).replaceAll("");
}
/** Matches a complete anchor element; the word boundary keeps abbr, article etc. out. */
private static final Pattern ANCHOR_ELEMENT =
        Pattern.compile("<a\\b.*?</a\\s*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

/**
 * Removes every {@code <a>} element, including its link text, from the given HTML.
 *
 * <p>Tag names are matched case-insensitively and an element may span several lines.
 * Other elements whose names merely start with "a" are left untouched.
 *
 * @param s HTML source to clean; must not be {@code null}
 * @return the HTML with all anchor elements removed
 */
public static String getLink(final String s)
{
    // One regex pass replaces the former find-then-literal-replace loop. Its "<a" prefix
    // also matched tags such as <abbr>, deleting everything up to the next </a>.
    return ANCHOR_ELEMENT.matcher(s).replaceAll("");
}
/**
 * Reads the entire content behind a URL as UTF-8 text.
 *
 * <p>The content is read line by line and the lines are concatenated without their
 * line terminators, so the result is a single line of text.
 *
 * @param htmlurl address of the resource to read
 * @return the content with all line terminators removed
 * @throws MalformedURLException if {@code htmlurl} is not a valid URL; a notice is
 *         printed to standard output before the exception is rethrown
 * @throws IOException if the resource cannot be opened or read; the stack trace is
 *         printed before the exception is rethrown
 */
public String getOneHtml(final String htmlurl) throws IOException
{
    final StringBuilder sb = new StringBuilder();
    try
    {
        final URL url = new URL(htmlurl);
        final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));
        try
        {
            String line;
            while ((line = in.readLine()) != null)
            {
                sb.append(line);
            }
        }
        finally
        {
            // Close the connection even when reading fails part-way through.
            in.close();
        }
    }
    catch (final MalformedURLException me)
    {
        System.out.println("你输入的URL格式有问题!请仔细输入");
        throw me;
    }
    catch (final IOException e)
    {
        e.printStackTrace();
        throw e;
    }
    return sb.toString();
}
/**
 * Filters out non-body text using spaces and chunk length.
 *
 * <p>The input is split on single space characters. A chunk is kept when its trimmed
 * length exceeds {@code Com.NUM_40}. Kept chunks are concatenated in their original
 * order, untrimmed and without any separator.
 *
 * @param s text to filter
 * @return the concatenation of the kept chunks; an empty string if none qualifies
 */
public String getText(String s)
{
    final StringBuilder kept = new StringBuilder();
    for (final String chunk : s.split(" "))
    {
        // The length test uses the trimmed chunk, but the chunk is appended as found.
        if (chunk.trim().length() <= Com.NUM_40)
        {
            continue;
        }
        kept.append(chunk);
    }
    return kept.toString();
}