package com.http3;
import java.util.ArrayList;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Extracts the list of tags that carry a given attribute value.
 *
 * @author Administrator
 */
public class Attrbuite {
    /**
     * Parses {@code html} and collects every node whose runtime class is exactly
     * {@code tagType}, optionally restricted to nodes carrying a given attribute.
     *
     * @param html      HTML text to parse; {@code null} yields an empty list
     * @param tagType   exact node class to match (compared with {@code ==}, so
     *                  subclasses of {@code tagType} are not matched)
     * @param attrName  attribute to inspect, or {@code null} to accept every
     *                  node of {@code tagType}
     * @param attrValue value the attribute must equal; when {@code null}, any
     *                  node on which the attribute is present is accepted
     * @return the matching nodes in the order the parser reports them; an empty
     *         list (never {@code null}) when nothing matches or parsing fails
     */
    public static <T extends TagNode> List<T> getText(String html, final Class<?> tagType,
            final String attrName, final String attrValue) {
        List<T> tags = new ArrayList<T>();
        if (html == null) {
            return tags;
        }
        try {
            Parser parser = new Parser();
            parser.setInputHTML(html);
            NodeList matches = parser.parse(new NodeFilter() {
                public boolean accept(Node node) {
                    // The instanceof guard keeps a non-TagNode tagType from
                    // triggering a ClassCastException on the cast below.
                    if (node.getClass() != tagType || !(node instanceof TagNode)) {
                        return false;
                    }
                    if (attrName == null) {
                        return true;
                    }
                    // NOTE(review): the "luanma:" prefix looks like a paste artifact
                    // and may have been meant to be just attrName -- confirm before
                    // relying on attribute filtering.
                    String actual = ((TagNode) node).getAttribute("luanma:" + attrName);
                    if (actual == null) {
                        return false;
                    }
                    // Compare against the requested value; a distinct local name
                    // avoids shadowing the attrValue parameter.
                    return attrValue == null || attrValue.equals(actual);
                }
            });
            for (int i = 0; i < matches.size(); i++) {
                // Safe as long as the caller's T agrees with tagType: the filter
                // only admits nodes whose class is exactly tagType.
                @SuppressWarnings("unchecked")
                T tag = (T) matches.elementAt(i);
                tags.add(tag);
            }
            return tags;
        } catch (Exception e) {
            // Deliberately best-effort: report the failure and hand back an empty
            // result so callers can iterate without a null check.
            e.printStackTrace();
            return new ArrayList<T>();
        }
    }

    /**
     * Collects every node in {@code html} whose runtime class is exactly
     * {@code tagType}, without any attribute restriction.
     *
     * @param html    HTML text to parse
     * @param tagType exact node class to match
     * @return the matching nodes; empty (never {@code null}) when none are found
     */
    public static <T extends TagNode> List<T> getText(String html, final Class<?> tagType) {
        return getText(html, tagType, null, null);
    }
}
// Main class
package com.http4;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.tags.LinkTag;
import com.http3.*;
public class Url {
    /** Page fetched when no URL is supplied on the command line. */
    private static final String DEFAULT_URL = "http://www.sohu.com/";

    /** Charset used to decode the body when the response does not declare one. */
    private static final String FALLBACK_CHARSET = "GBK";

    /**
     * Downloads a page and prints the inner text of every link tag found in it,
     * one per line, to standard output.
     *
     * @param args optional; {@code args[0]} is the URL to fetch, defaulting to
     *             {@link #DEFAULT_URL} when absent
     */
    public static void main(String[] args) {
        String url = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        HttpClient http = new DefaultHttpClient();
        try {
            HttpResponse response = http.execute(new HttpGet(url));
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Decode the whole body in one go: the charset declared by the
                // response is honoured, with GBK only as a fallback, and tags
                // that span several physical lines are parsed intact.
                String html = EntityUtils.toString(entity, FALLBACK_CHARSET);
                List<LinkTag> links = Attrbuite.getText(html, LinkTag.class);
                // Guard against a null result from the helper.
                if (links != null) {
                    for (LinkTag link : links) {
                        System.out.println(link.getStringText());
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the connection even when the request or parsing fails.
            http.getConnectionManager().shutdown();
        }
    }
}