public static void main(String[] args)
{
//找出网页中所有超连接的href内容
InputStream is = CommonUtil.getStream("http://cto.csdn.net/");
String regex = "(?<=href=[/"|/'])[^/"|/']*";
byte[] bytes = new byte[1024];
FileWriter fw = null;
FileReader fr = null;
String str;
try
{
fw = new FileWriter(new File("d:/Temp.txt"), false);
while(is.read(bytes)>0)
{
str = new String(bytes, "UTF-8");
if(CommonUtil.patternText(str,regex))
{
// System.out.println(str);
fw.write(CommonUtil.getPatternStr(str, regex));
}
}
is.close();
fw.flush();
}
catch (IOException e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
finally
{
if (fw != null)
{
try
{
fw.close();
}
catch (IOException e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
/**
 * Opens a stream for the given address, trying up to five times.
 * An attempt that throws an exception is silently discarded and the next
 * attempt is made; the first non-null stream obtained is returned.
 *
 * @param path the address handed to {@code getURLContent}
 * @return the stream from the first successful attempt, or {@code null}
 *         if none of the five attempts produced one
 */
public static InputStream getStream(String path)
{
    InputStream stream = null;
    int attempt = 0;
    while (stream == null && attempt < 5)
    {
        attempt++;
        try
        {
            stream = getURLContent(path);
        }
        catch (Exception ignored)
        {
            // Best-effort retry: fall through to the next attempt.
        }
    }
    return stream;
}
/**
 * Tests whether the given regular expression matches anywhere in the text.
 *
 * @param text  the text to search
 * @param regex the regular expression to look for
 * @return {@code true} if at least one match is found, {@code false} otherwise
 */
public static boolean patternText(String text,String regex)
{
    return Pattern.compile(regex).matcher(text).find();
}
/**
 * Collects every match of the regular expression in the text.
 * Each match is followed by a newline character, so the result holds one
 * match per line.
 *
 * @param text  the text to search
 * @param regex the regular expression to look for
 * @return all matches in order of occurrence, each terminated by a newline;
 *         an empty string if nothing matches
 */
public static String getPatternStr(String text ,String regex)
{
    Pattern pattern = Pattern.compile(regex);
    Matcher matcher = pattern.matcher(text);
    // Local, single-threaded accumulation: StringBuilder is sufficient.
    StringBuilder sb = new StringBuilder();
    while (matcher.find())
    {
        sb.append(matcher.group());
        // Newline separator (previously the two literal characters '/' and 'n').
        sb.append("\n");
    }
    return sb.toString();
}