遍历网站的所有Url

最新推荐文章于 2024-05-02 04:48:44 发布

iteye_10647

最新推荐文章于 2024-05-02 04:48:44 发布

阅读量1.3k

点赞数

文章标签： WebForm J# Web

网站的url分为很多种类：<a href="" />; <form action="" method="Get"/>;<link href=""/>;<img src=""/>;<script src=""/> ;<frame src=""/> 等等

难点：
递归遍历
获得页面每个url
同时请求（每种类型的请求方式都不同）
有些链接是重复的，需要去重

使用 HTMLParser 工具：先下载 htmlparser.jar 并加入工程
遍历 + 通过htmlparser 解析页面元素

public class Urll {

	// 定义的全局变量
	public static Vector<String> svecOutUrl = new Vector<String>();
	public static Vector<String> svecBadUrl = new Vector<String>();
	public static Vector<String> svecAUrl = new Vector<String>();
	public static final int DEEP=3; //遍历的深度
	public static boolean bl; //判断标志
	private static String loc;
	private static Parser parser; //对超文本进行分析


	private static String hostName = "sina.com";





	// 由于网站中URL之间的连接构成了图，所以对图的遍历这里采用深度优先的方法。
	public static void extractLinks(String loc) throws Exception {

		String str1;
		URL wwwurl;
		boolean byes;

		Vector<String> vecUrl=new Vector<String>();

		// 解析 <a>
		try {
			parser = new Parser(loc); //原理见HTMLParser
			bl=true;
		}
		catch (Exception e) {
			bl=false;
			e.printStackTrace();
		}

		filterStr = "a";
        filter = new TagNameFilter(filterStr);
        links = parser.extractAllNodesThatMatch(filter); 
		for (int i = 0;i < links.size();i++) {
			if(bl)
			{
				byes=true;
				LinkTag LinkTag = (LinkTag)links.elementAt(i);
				str1= LinkTag.getLink();
				System.out.println(""+i);
                                str1 = Patter (str1)
				if(str1.equals("")) continue;
				if(!svecAUrl.contains(str1))
				{
					try
					{
						//　判断是否可连接
						wwwurl=new URL(str1);
						URLConnection con = wwwurl.openConnection();
						con.setConnectTimeout(1000);
						con.getInputStream();
					}
					catch(SocketTimeoutException e)
					{
						byes=false;
						svecBadUrl.add(str1);
						continue;
					}
					catch(Exception e)
					{
						byes=false;
						continue;
					}
					if(GetHostName(str1).equals(hostName))
					{
						svecAUrl.add(str1);
						vecUrl.add(str1);
					}
					else
					{
						svecOutUrl.add(str1);
					}
				}
			}
		}



		//	递归调用
		String strNew;
		int b = 1;
		if(b<=DEEP)
		{

			for(int i=0;i<vecUrl.size();i++)
			{
				strNew=(String)vecUrl.get(i);
				extractLinks(strNew); 
			}
		}

	}


	// 通过该函数来判断所得URL是否是本网站的URL
	public static String GetHostName(String host)
	{
		URL aurl;
		String ss=" ";
		try
		{
			aurl=new URL(host);
			ss=aurl.getHost();
			ss = ss.substring(ss.length()-10, ss.length());
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		return ss;
	}

｝

去重需要使用正则表达式

	/** Numbered video pages of the site; compiled once, with the literal dots escaped. */
	private static final Pattern VIDEO_PAGE_URL =
			Pattern.compile("http://www\\.sina\\.com/\\d+/v/\\d+\\.html");

	/**
	 * Normalises a link before it is de-duplicated.
	 *
	 * <p>Static so that the static crawl method can call it.
	 *
	 * @param str raw link text taken from the page
	 * @return {@code ""} when {@code str} is null or does not contain {@code "http:"}
	 *         (this also drops https links); one fixed representative URL for every
	 *         numbered video page, so those pages count as a single URL; otherwise
	 *         {@code str} unchanged
	 */
	private static String Patter (String str) {
		if (str == null || str.indexOf("http:") == -1) {
			return "";
		}
		if (VIDEO_PAGE_URL.matcher(str).matches()) {
			return "http://www.sina.com/0/v/0.html";
		}
		// Every other http link is kept as it is.
		return str;
	}

二、用 HTMLParser 可以获取 a、link、script、img 等元素，但无法解决 form 的递归提交问题：form 的提交方式分为 get 和 post 两种，post 方式的参数列表无法获取，也无法动态设置 post 提交方式。
HttpUnit 测试工具可以很好地模拟浏览器，功能强大：既可以任意提交表单，也可以获得页面元素。
下载引入 httpunit.rar

	
	// Shared HttpUnit conversation through which the start page is requested.
	private static WebConversation wc = new WebConversation();	
	// Not read or written by any code visible in this listing.
	private static WebForm w;

// 由于网站中URL之间的连接构成了图，所以对图的遍历这里采用深度优先的方法。
	public static void extractLinks(WebRequestSource webT,String method,boolean start) throws Exception {

		Vector<WebForm> vecForm=new Vector<WebForm>();
		Vector<WebLink> vecLink=new Vector<WebLink>();
		WebResponse resp = null;
		WebForm[] webForm = new WebForm[0];
		WebLink[] webLink = new WebLink[0];
		try {
			HttpUnitOptions.setExceptionsThrownOnScriptError(false);
			// 按照 Get Post link 类型打开web

			if (start) {
				// 首页
				WebRequest req = new PostMethodWebRequest("http://www.sina.com/");
				resp = wc.getResponse(req);
			} else if ("post".equals(method) || "get".equals(method)) {
                               //获得form 并提交
				WebForm w = (WebForm) webT;
				[color=red]resp = w.submit();[/color]
			} else {
				WebLink l = (WebLink) webT;
				[color=red]resp = l.click();[/color]
			}
			webForm = resp.getForms();
			webLink = resp.getLinks();

			bl=true;
		} catch (Exception e) {
			bl=false;
			e.printStackTrace();
		}

		String ss,str1;
		URL wwwurl;
		boolean byes;
		StringBuffer strUrl;
		int a=0,b=0,tID=0;
		b++;




		// 获取一个页面中所有的FORM中URL
		for (int i = 0;i < webForm.length;i++) {
			if(bl) {
				byes=true;
				// 按照 Get Post 类型 
				strUrl = new StringBuffer(resp.getURL().toString());

				if (!"./".equals(webForm[i].getAction()) && "post".equals(webForm[i].getMethod())) {
					strUrl.append(webForm[i].getAction().substring(1, webForm[i].getAction().length()));
					strUrl.append("?");
					String[] para = webForm[i].getParameterNames();
					for (int p = 0;p< para.length;p++) {
						strUrl.append(para[p]);
						strUrl.append("=&");
					}
				} else if (!"./".equals(webForm[i].getAction())) {
					strUrl.append(webForm[i].getAction().substring(1, webForm[i].getAction().length()));
				}

				if(strUrl.equals("")) continue;

				if(!svecLink.contains(strUrl.toString())) {
					try {
						// 按照 Get Post 类型 
						if (!"./".equals(webForm[i].getAction())) {
							webForm[i].submit();
						}
					} catch(Exception e) {
						byes=false;
					}
					if(GetHostName(strUrl.toString()).equals(hostName) && byes){
						a++;
						tID++;
						svecLink.add(strUrl.toString());
						// 按照 Get Post 类型 
						vecForm.add(webForm[i]);
					} else {
						svecOutlink.add(strUrl.toString());
					}

					if (svecLink.size() >= 1000) {
						svecLink.clear();
					}
				}
			}
		}

		// 获取一个页面中所有的LINK中URL
		for (int i = 0;i < webLink.length;i++) {
			if(bl) {
				byes=true;
				// 按照 Link 类型 
				strUrl = new StringBuffer(webLink[i].getURLString());

				if (strUrl.indexOf("http") == -1) {
					strUrl = new StringBuffer();
				}
				if(strUrl == null || "".equals(strUrl.toString())) continue;

				if(!svecLink.contains(strUrl.toString())) {
					try {
							webLink[i].newScriptable();
							HttpUnitOptions.clearScriptErrorMessages();
							HttpUnitOptions.setExceptionsThrownOnScriptError(false);
							HttpUnitOptions.setScriptingEnabled(false);
							HttpUnitOptions.setJavaScriptOptimizationLevel(0);
							WebRequest re = webLink[i].getRequest();
							URL u = re.getURL();
							u.getContent();
							// 按照 Link 类型 
					} catch(Exception e) {
						byes=false;
						System.out.print(e.getMessage());
					}
					if(GetHostName(strUrl.toString()).equals(hostName) && byes){
						a++;
						tID++;
						svecLink.add(strUrl.toString());
						// 按照 Link 类型 
						vecLink.add(webLink[i]);
					} else {
						svecOutlink.add(strUrl.toString());
					}

					if (svecLink.size() >= 1000) {
						svecLink.clear();
					}
				}
			}
		}


		WebForm webFNew;
		WebLink webLNew;
		if(a>0&&b<=DEEP) {

			//	递归调用
			for(int i=0,j=0;i<vecForm.size()||j<vecLink.size();i++,j++) {
				webFNew = (WebForm)vecForm.get(i);
				extractLinks(webFNew,webFNew.getMethod().toString(),false); 

				webLNew = (WebLink)vecLink.get(j);
				extractLinks(webLNew,"link".toString(),false); 

			}
		}

	}





	/**
	 * Reduces a URL to the last two labels of its host so callers can compare
	 * the result with the site domain and tell on-site URLs from off-site ones,
	 * e.g. {@code http://www.sina.com/a} gives {@code sina.com}. Hosts with
	 * fewer than two dots are returned unchanged.
	 *
	 * <p>Limitation: multi-part suffixes such as {@code sina.com.cn} reduce to
	 * {@code com.cn}, so the site domain compared against must be a two-label domain.
	 *
	 * @param host absolute URL to inspect
	 * @return the trailing two host labels, or {@code " "} when {@code host} is not a valid URL
	 */
	public static String GetHostName(String host) {
		String name;
		try {
			name = new URL(host).getHost();
		} catch (java.net.MalformedURLException ignored) {
			// Callers only compare the result with the site domain, so a
			// placeholder that never matches is enough.
			return " ";
		}
		int lastDot = name.lastIndexOf('.');
		int prevDot = lastDot > 0 ? name.lastIndexOf('.', lastDot - 1) : -1;
		return prevDot < 0 ? name : name.substring(prevDot + 1);
	}

｝

不符合链接格式的 URL 都无法请求，也就是坏链接。

iteye_10647

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
遍历网站的所有Url

网站的url分为很多种类：; ;;; ; 等等难点：递归遍历获得页面每个url同时请求（每种类型的请求方式都不同）有些链接是重复的，需要去重使用 Htmlparse 工具下载htmlparser.jar 遍历 + 通过htmlparser 解析页面元素[code="java"]public class Urll { // 定义的全局变量 pub...
复制链接

扫一扫