import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small helper for downloading a web page and pulling text out of it with
 * regular expressions.
 */
public class TestHtml {

    /**
     * Pattern used by {@link #getTitle(String)}.
     *
     * NOTE(review): the regex literal in the original source was broken
     * (unterminated string, markup apparently stripped). It has been
     * reconstructed as a title-element match based on the method's purpose;
     * confirm against the intended original.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);

    /** Matches a single markup tag such as {@code <b>} or {@code </title>}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /**
     * Reads the entire content of a web page.
     *
     * <p>The response is decoded as GBK and read line by line; the lines are
     * concatenated WITHOUT line separators.
     *
     * @param htmlurl the URL of the page to read
     * @return the page content with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     *         (a hint is also printed to standard output)
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }

        final StringBuilder sb = new StringBuilder();
        // try-with-resources guarantees the stream is closed even when
        // readLine() fails part-way through the page.
        try (BufferedReader in =
                new BufferedReader(new InputStreamReader(url.openStream(), "gbk"))) {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the page title from HTML text.
     *
     * <p>Every match of the title pattern is concatenated and the markup
     * tags are then removed from the result.
     *
     * @param s the HTML text to search
     * @return the title text, or an empty string when nothing matches
     */
    public String getTitle(final String s) {
        return joinMatches(TITLE_PATTERN, s);
    }

    /**
     * Extracts every substring of {@code s} that matches {@code regexarg}.
     *
     * <p>The matches are concatenated in order of occurrence and markup tags
     * are then removed from the result.
     *
     * @param s the text to search
     * @param regexarg the regular expression to look for
     * @return the concatenated, tag-free matches; empty when nothing matches
     * @throws java.util.regex.PatternSyntaxException if {@code regexarg} is
     *         not a valid regular expression
     */
    public String getByRegex(final String s, String regexarg) {
        Objects.requireNonNull(regexarg, "regexarg");
        // CANON_EQ is kept from the original so match semantics do not change.
        return joinMatches(Pattern.compile(regexarg, Pattern.CANON_EQ), s);
    }

    /** Concatenates all matches of {@code pattern} in {@code s} and strips tags. */
    private String joinMatches(final Pattern pattern, final String s) {
        Objects.requireNonNull(s, "s");
        final Matcher ma = pattern.matcher(s);
        final StringBuilder joined = new StringBuilder();
        while (ma.find()) {
            joined.append(ma.group());
        }
        return outTag(joined.toString());
    }

    /**
     * Removes markup tags from {@code s}.
     *
     * NOTE(review): the original source calls {@code outTag} but does not
     * define it; this implementation (delete everything between {@code <}
     * and {@code >}) is a reconstruction inferred from the name - confirm.
     */
    private String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Demo entry point: downloads a fixed page and prints what the regex
     * extracts from it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(final String[] args) {
        final String url = "http://detail.1688.com/offer/41797007099.html?tracelog=p4p";
        try {
            final TestHtml testHtml = new TestHtml();
            final String html = testHtml.getOneHtml(url);
            // NOTE(review): a reluctant ".*?" only ever matches the empty
            // string, so content is always empty; the intended expression
            // presumably contained markup that was lost - confirm.
            final String regex = ".*?";
            final String content = testHtml.getByRegex(html, regex);
            System.out.println("contet is :" + content);
        } catch (final Exception e) {
            // The original discarded the failure silently; report it instead.
            e.printStackTrace();
        }
    }
}