Java 网络爬虫——获取页面中所有超链接的内容

最新推荐文章于 2021-08-20 22:44:31 发布

ld191474639

最新推荐文章于 2021-08-20 22:44:31 发布

阅读量1.3w

点赞数

本文链接：https://blog.csdn.net/ld191474639/article/details/7989772

版权

package com.http3;

import java.util.ArrayList;
import java.util.List;


import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;

import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Extracts the list of tags of a given type from an HTML string, optionally
 * keeping only those tags that carry a specific attribute value.
 *
 * @author Administrator
 */
public class Attrbuite {

	/**
	 * Parses {@code html} and collects every node whose runtime class is exactly
	 * {@code tagType}. When {@code attrName} is non-null, a node is kept only if it
	 * has that attribute and the attribute's value equals {@code attrValue}.
	 *
	 * @param html      HTML text to parse; {@code null} yields an empty list
	 * @param tagType   node class to keep; compared with {@code ==}, so subclasses do not match
	 * @param attrName  name of the attribute to test, or {@code null} to accept every node of the type
	 * @param attrValue value the attribute must equal; only consulted when {@code attrName} is non-null
	 * @return the matching nodes in the order the parser returned them; never {@code null},
	 *         and empty when nothing matches or the parser reports an error
	 */
	public static <T extends TagNode> List<T> getText(String html, final Class<?> tagType, final String attrName, final String attrValue) {
		List<T> tags = new ArrayList<T>();
		if (html == null) {
			return tags;
		}
		try {
			Parser parser = new Parser();
			parser.setInputHTML(html);
			NodeList matches = parser.parse(new NodeFilter() {

				public boolean accept(Node node) {
					if (node.getClass() != tagType) {
						return false;
					}
					if (attrName == null) {
						return true;
					}
					// Only tag nodes have attributes; anything else cannot satisfy the attribute test.
					if (!(node instanceof TagNode)) {
						return false;
					}
					String actual = ((TagNode) node).getAttribute(attrName);
					// Compare the tag's value against the requested one (not against itself).
					return actual != null && actual.equals(attrValue);
				}

			});
			for (int i = 0; i < matches.size(); i++) {
				// The filter admits only nodes whose class is tagType, which the caller
				// pairs with T; the cast is unchecked because of erasure.
				@SuppressWarnings("unchecked")
				T tag = (T) matches.elementAt(i);
				tags.add(tag);
			}
		} catch (ParserException e) {
			// Best effort: report the parse failure and hand back an empty result.
			e.printStackTrace();
		}
		return tags;
	}

	/**
	 * Parses {@code html} and collects every node whose runtime class is exactly
	 * {@code tagType}, without any attribute restriction.
	 *
	 * @param html    HTML text to parse
	 * @param tagType node class to keep
	 * @return the result of the four-argument overload called with a {@code null}
	 *         attribute name and value
	 */
	public static <T extends TagNode> List<T> getText(String html, final Class<?> tagType) {
		return getText(html, tagType, null, null);
	}
}

Main 类（程序入口）

package com.http4;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.tags.LinkTag;

import com.http3.*;
public class Url {

	/**
	 * Fetches http://www.sohu.com/ and prints, one per line, the result of
	 * {@code getStringText()} for every {@code LinkTag} found in the page.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		HttpClient http = new DefaultHttpClient();
		// To go through an HTTP proxy, set ConnRoutePNames.DEFAULT_PROXY on
		// http.getParams() before executing the request.
		try {
			HttpGet request = new HttpGet("http://www.sohu.com/");
			HttpResponse response = http.execute(request);
			HttpEntity entity = response.getEntity();
			if (entity != null) {
				// Decode with the charset declared by the response entity, falling back
				// to GBK when none is declared. Reading the entity this way also
				// consumes its content stream.
				String html = EntityUtils.toString(entity, "GBK");
				// Parse the document in one pass: feeding the parser one line at a time
				// loses any link whose markup spans a line break.
				List<LinkTag> links = Attrbuite.getText(html, LinkTag.class);
				if (links != null) {
					for (LinkTag link : links) {
						System.out.println(link.getStringText());
					}
				}
			}
		} catch (Exception e) {
			// Top-level boundary of a command-line program: report and exit normally.
			e.printStackTrace();
		} finally {
			// Release the connection resources even when the request or parsing fails.
			http.getConnectionManager().shutdown();
		}

	}

}

ld191474639

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
3
评论
java网络爬虫——获取页面的所有超链接的内容

package com.http3;import java.util.ArrayList;import java.util.List;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.no
复制链接

扫一扫