import
java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net. * ;
import java.io. * ;
import java.util.regex. * ;
/**
 * Experimental link collector: downloads a start page, cuts out the region
 * between two marker strings and prints every substring of that region that
 * matches the collection rule.
 *
 * <p>Call order used by {@link #main}: construct with the two area markers,
 * then {@link #getStartUrl}, {@link #getUrlContent}, {@link #getContentArea}
 * and finally {@link #Urls()}.
 *
 * <p>The {@code getXxx(String)} methods that take an argument only store that
 * argument (they act as setters); their names are kept so existing callers
 * keep compiling.
 */
public class Urls
{
    /** Collection rule applied by {@link #Urls()}; used exactly as written, spaces included. */
    private static final String LINK_REGEX = " <a.*?/a> ";

    /** Compiled once instead of on every {@link #Urls()} call. */
    private static final Pattern LINK_PATTERN = Pattern.compile(LINK_REGEX);

    private String startUrl; // address of the page to start collecting from
    String urlContent; // downloaded page text, lines concatenated; stays null until a fetch succeeds
    String ContentArea; // part of urlContent between the two area markers
    private String strAreaBegin, strAreaEnd; // marker strings that open and close the collection area
    private String stringInUrl, stringNotInUrl; // keyword filters; stored but not applied anywhere yet
    String strContent; // collected content (never assigned in this class)
    String[] allUrls; // all collected addresses (never assigned in this class)
    // Collection rule. Initialised here because the original only ever set a
    // local variable of the same name, which left getRegex() returning null.
    private String regex = LINK_REGEX;

    /**
     * Demo entry point: fetches the start page, extracts the marked area and
     * prints every match of the collection rule.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        Urls myurl = new Urls( " <body " , " /body> " );
        myurl.getStartUrl( " http://www.baidu.com/ " );
        myurl.getUrlContent();
        myurl.getContentArea();
        myurl.getStringInUrl( " http://www.baidu.com/ " );
        myurl.getStringNotInUrl( " google " );
        myurl.Urls();
    }

    /**
     * Creates a collector for the region delimited by the two markers.
     *
     * @param strAreaBegin text that precedes the collection area
     * @param strAreaEnd   text that follows the collection area
     */
    public Urls (String strAreaBegin, String strAreaEnd)
    {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Prints every match of the collection rule found in {@code ContentArea},
     * one per line, followed by a summary line with the number of matches.
     *
     * <p>This is an ordinary method that happens to share the class name; it
     * is not a constructor. When {@code ContentArea} has not been set, only
     * the summary line (with a count of 0) is printed.
     */
    public void Urls()
    {
        int matches = 0;
        // Guard: Pattern.matcher(null) would throw a NullPointerException.
        if (ContentArea != null)
        {
            Matcher mt = LINK_PATTERN.matcher(ContentArea);
            while (mt.find())
            {
                System.out.println(mt.group());
                matches++;
            }
        }
        System.out.println( " 共有 " + matches + " 个符合结果 " );
    }

    /**
     * Stores the address of the page to collect from.
     *
     * @param startUrl page address, later passed to {@link URL#URL(String)}
     */
    public void getStartUrl(String startUrl)
    {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the start page into {@code urlContent}.
     *
     * <p>Lines are appended without their line terminators, so the result is
     * one long line. On any failure a message and the stack trace are printed
     * and {@code urlContent} keeps its previous value.
     */
    public void getUrlContent()
    {
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the stream under it) on
        // both the success and the failure path; the original never closed it.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new URL(startUrl).openStream())))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                page.append(line);
            }
            urlContent = page.toString();
        }
        catch (Exception e)
        {
            System.out.println( " 网址文件未能输出 " );
            e.printStackTrace();
        }
    }

    /**
     * Extracts the text between the begin marker and the next end marker of
     * {@code urlContent} into {@code ContentArea}.
     *
     * <p>{@code ContentArea} becomes the empty string when there is no page
     * content, a marker is {@code null}, or either marker cannot be found.
     */
    public void getContentArea()
    {
        ContentArea = "";
        if (urlContent == null || strAreaBegin == null || strAreaEnd == null)
        {
            return;
        }
        int beginAt = urlContent.indexOf(strAreaBegin);
        if (beginAt < 0)
        {
            // Without this check, -1 + length would be used as a bogus start index.
            return;
        }
        int from = beginAt + strAreaBegin.length();
        int to = urlContent.indexOf(strAreaEnd, from);
        if (to < 0)
        {
            // substring(from, -1) would throw StringIndexOutOfBoundsException.
            return;
        }
        ContentArea = urlContent.substring(from, to);
    }

    // The next two methods store a keyword the address must contain and one it
    // must not contain. This is only a first experiment; later there should be
    // more than one keyword of each kind.

    /**
     * Stores the keyword a collected address must contain.
     *
     * @param stringInUrl required keyword
     */
    public void getStringInUrl(String stringInUrl)
    {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Stores the keyword a collected address must not contain.
     *
     * @param stringNotInUrl forbidden keyword
     */
    public void getStringNotInUrl(String stringNotInUrl)
    {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder for collecting the addresses; currently does nothing. */
    public void getUrl()
    {
    }

    /**
     * Returns the collection rule.
     *
     * @return the regular expression applied by {@link #Urls()}
     */
    public String getRegex()
    {
        return regex;
    }
}
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net. * ;
import java.io. * ;
import java.util.regex. * ;
/**
 * Experimental link collector: downloads a start page, cuts out the region
 * between two marker strings and prints every substring of that region that
 * matches the collection rule.
 *
 * <p>Call order used by {@link #main}: construct with the two area markers,
 * then {@link #getStartUrl}, {@link #getUrlContent}, {@link #getContentArea}
 * and finally {@link #Urls()}.
 *
 * <p>The {@code getXxx(String)} methods that take an argument only store that
 * argument (they act as setters); their names are kept so existing callers
 * keep compiling.
 */
public class Urls
{
    /** Collection rule applied by {@link #Urls()}; used exactly as written, spaces included. */
    private static final String LINK_REGEX = " <a.*?/a> ";

    /** Compiled once instead of on every {@link #Urls()} call. */
    private static final Pattern LINK_PATTERN = Pattern.compile(LINK_REGEX);

    private String startUrl; // address of the page to start collecting from
    String urlContent; // downloaded page text, lines concatenated; stays null until a fetch succeeds
    String ContentArea; // part of urlContent between the two area markers
    private String strAreaBegin, strAreaEnd; // marker strings that open and close the collection area
    private String stringInUrl, stringNotInUrl; // keyword filters; stored but not applied anywhere yet
    String strContent; // collected content (never assigned in this class)
    String[] allUrls; // all collected addresses (never assigned in this class)
    // Collection rule. Initialised here because the original only ever set a
    // local variable of the same name, which left getRegex() returning null.
    private String regex = LINK_REGEX;

    /**
     * Demo entry point: fetches the start page, extracts the marked area and
     * prints every match of the collection rule.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        Urls myurl = new Urls( " <body " , " /body> " );
        myurl.getStartUrl( " http://www.baidu.com/ " );
        myurl.getUrlContent();
        myurl.getContentArea();
        myurl.getStringInUrl( " http://www.baidu.com/ " );
        myurl.getStringNotInUrl( " google " );
        myurl.Urls();
    }

    /**
     * Creates a collector for the region delimited by the two markers.
     *
     * @param strAreaBegin text that precedes the collection area
     * @param strAreaEnd   text that follows the collection area
     */
    public Urls (String strAreaBegin, String strAreaEnd)
    {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Prints every match of the collection rule found in {@code ContentArea},
     * one per line, followed by a summary line with the number of matches.
     *
     * <p>This is an ordinary method that happens to share the class name; it
     * is not a constructor. When {@code ContentArea} has not been set, only
     * the summary line (with a count of 0) is printed.
     */
    public void Urls()
    {
        int matches = 0;
        // Guard: Pattern.matcher(null) would throw a NullPointerException.
        if (ContentArea != null)
        {
            Matcher mt = LINK_PATTERN.matcher(ContentArea);
            while (mt.find())
            {
                System.out.println(mt.group());
                matches++;
            }
        }
        System.out.println( " 共有 " + matches + " 个符合结果 " );
    }

    /**
     * Stores the address of the page to collect from.
     *
     * @param startUrl page address, later passed to {@link URL#URL(String)}
     */
    public void getStartUrl(String startUrl)
    {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the start page into {@code urlContent}.
     *
     * <p>Lines are appended without their line terminators, so the result is
     * one long line. On any failure a message and the stack trace are printed
     * and {@code urlContent} keeps its previous value.
     */
    public void getUrlContent()
    {
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the stream under it) on
        // both the success and the failure path; the original never closed it.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new URL(startUrl).openStream())))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                page.append(line);
            }
            urlContent = page.toString();
        }
        catch (Exception e)
        {
            System.out.println( " 网址文件未能输出 " );
            e.printStackTrace();
        }
    }

    /**
     * Extracts the text between the begin marker and the next end marker of
     * {@code urlContent} into {@code ContentArea}.
     *
     * <p>{@code ContentArea} becomes the empty string when there is no page
     * content, a marker is {@code null}, or either marker cannot be found.
     */
    public void getContentArea()
    {
        ContentArea = "";
        if (urlContent == null || strAreaBegin == null || strAreaEnd == null)
        {
            return;
        }
        int beginAt = urlContent.indexOf(strAreaBegin);
        if (beginAt < 0)
        {
            // Without this check, -1 + length would be used as a bogus start index.
            return;
        }
        int from = beginAt + strAreaBegin.length();
        int to = urlContent.indexOf(strAreaEnd, from);
        if (to < 0)
        {
            // substring(from, -1) would throw StringIndexOutOfBoundsException.
            return;
        }
        ContentArea = urlContent.substring(from, to);
    }

    // The next two methods store a keyword the address must contain and one it
    // must not contain. This is only a first experiment; later there should be
    // more than one keyword of each kind.

    /**
     * Stores the keyword a collected address must contain.
     *
     * @param stringInUrl required keyword
     */
    public void getStringInUrl(String stringInUrl)
    {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Stores the keyword a collected address must not contain.
     *
     * @param stringNotInUrl forbidden keyword
     */
    public void getStringNotInUrl(String stringNotInUrl)
    {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder for collecting the addresses; currently does nothing. */
    public void getUrl()
    {
    }

    /**
     * Returns the collection rule.
     *
     * @return the regular expression applied by {@link #Urls()}
     */
    public String getRegex()
    {
        return regex;
    }
}