import java.io.IOException; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.filters.OrFilter; import org.htmlparser.tags.Div; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class TT { /** * @param args * @throws IOException * @throws HttpException * @throws ParserException */ public static void main(String[] args) throws HttpException, IOException, ParserException { String resource = getContent("http://www.dianping.com/shop/1968937"); getReview(resource); } public static String getContent(String url) throws HttpException, IOException { HttpClient hc=new HttpClient(); GetMethod gm=new GetMethod(url); hc.getParams().setParameter(HttpMethodParams.USER_AGENT,"Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2");//设置信息 hc.executeMethod(gm); return gm.getResponseBodyAsString(); } public static void getReview(String resource) throws ParserException { Parser myParser = new Parser(resource); NodeList nodeList = null; //myParser.setEncoding("gb2312"); NodeFilter divFilter = new NodeClassFilter(Div.class); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates(new NodeFilter[] { divFilter }); try { int count = 0; nodeList = myParser.parse(lastFilter); for (int i = 0; i <= nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof Div) { Div div = (Div) nodeList.elementAt(i); String id = div.getAttribute("id"); if (id != null && id.startsWith("review_")) { System.out.println("--------------------------------" + ++count); String content = div.getChildrenHTML(); content = content.replaceAll("//<p>.*</p>", "") .replaceAll("<span.*</span>", "") .replaceAll("<br/>", "/n") .replaceAll(" ", " "); System.out.println(content); } } } } catch (ParserException e) { e.printStackTrace(); } } }