`package jsoup;
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;
/**
 * Small collection of jsoup scraping demos. Each demo method fetches one page
 * with {@code Jsoup.connect(...).get()} and prints selected content to
 * standard output; {@code main} chooses which demo runs.
 */
public class test { static String URL = "https://www.dianping.com/shop"; // page fetched by test01()
public static void main(String[] args) throws Exception { // TODO Auto-generated method stub // test01(); // tiaozhuanlianjie(); // bokeneirong(); dazhong(); }
public static void test01() throws Exception { Document doc = Jsoup.connect(URL).get(); // Elements ListDiv = doc.select(".expand-info address"); Elements ListDiv = doc.getElementsByAttributeValue("class", "expand-info address"); for (Element element : ListDiv) { Elements dizhi = element.getElementsByTag("span"); for (Element dizhi2 : dizhi) { System.out.println(dizhi2.html()); } } Elements dianhua = doc.getElementsByAttributeValue("class", "expand-info tel"); for (Element element : dianhua) { Elements dianhua2 = element.getElementsByTag("span"); for (Element element2 : dianhua2) { System.out.println(element2.html()); } } }
/**
 * Fetches a Dianping keyword-search results page and prints, for every element
 * matched by {@code #shop-all-list>ul li}, the absolute URL of its title link
 * ({@code div.tit>a} inside {@code .txt}).
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void tiaozhuanlianjie() throws Exception {
    String searchUrl = "https://www.dianping.com/search/keyword/5/0_%E7%82%B8%E9%B8%A1";
    Document page = Jsoup.connect(searchUrl).get();
    Elements shopItems = page.select("#shop-all-list>ul li");
    for (int i = 0; i < shopItems.size(); i++) {
        Elements textBlocks = shopItems.get(i).select(".txt");
        Elements titleLinks = textBlocks.select("div.tit>a");
        System.out.println(titleLinks.attr("abs:href"));
    }
}
public static void zhihu() throws Exception { Document doc = Jsoup.connect("https://www.zhihu.com/explore").get(); // Elements ListDiv = doc.getElementsByAttributeValue("class", "zu-main-content"); // for (Element element : ListDiv) { // Elements dizhi = element.getElementsByTag("a"); // for (Element element2 : dizhi) { // System.out.println(element2.attr("abs:href")); // } // } Elements elements = doc.select(".tab-panel").get(0).select("div div.explore-feed.feed-item"); for (Element element : elements) { String url = element.select("h2>a").attr("abs:href"); System.out.println(url); } }
/**
 * Fetches the Zhihu topic page and prints, for each folded feed item in the
 * general list, the absolute URL of the {@code h2>a} link found under
 * {@code div.feed-main} / {@code div.feed-content}.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void zhihu2() throws Exception {
    Document page = Jsoup.connect("https://www.zhihu.com/topic").get();
    Elements feedItems = page.select(".zh-general-list.clearfix div.feed-item.feed-item-hook.folding");
    for (int i = 0; i < feedItems.size(); i++) {
        Elements mainArea = feedItems.get(i).select("div.feed-main");
        Elements content = mainArea.select("div.feed-content");
        Elements titleLinks = content.select("h2>a");
        System.out.println(titleLinks.attr("abs:href"));
    }
}
/**
 * Fetches the cnblogs home page and prints the absolute URL of the
 * {@code h3>a} link of every {@code .post_item_body} found under
 * {@code #post_list} / {@code .post_item}.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void boke() throws Exception {
    Document page = Jsoup.connect("https://www.cnblogs.com/").get();
    Elements postList = page.select("#post_list");
    Elements postItems = postList.select(".post_item");
    Elements postBodies = postItems.select(".post_item_body");
    for (int i = 0; i < postBodies.size(); i++) {
        Elements titleLinks = postBodies.get(i).select("h3>a");
        System.out.println(titleLinks.attr("abs:href"));
    }
}
/**
 * Fetches one cnblogs post, prints its page title, then prints the inner HTML
 * of up to the first eight {@code p} elements of the post body
 * ({@code #cnblogs_post_body}).
 *
 * Posts with fewer than eight paragraphs are printed in full instead of
 * failing with an {@code IndexOutOfBoundsException}.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void bokeneirong() throws Exception {
    final int maxParagraphs = 8;
    Document doc = Jsoup.connect("http://www.cnblogs.com/pjsweb/p/7122545.html").get();
    System.out.println(doc.title());
    Elements postBodies = doc.select("#cnblogs_post_body");
    // Paragraph index is the outer loop so the output order stays the same as
    // before when more than one element matches the selector.
    for (int i = 0; i < maxParagraphs; i++) {
        for (Element postBody : postBodies) {
            Elements paragraphs = postBody.select("p");
            if (i < paragraphs.size()) {
                System.out.println(paragraphs.get(i).html());
            }
        }
    }
}
/**
 * Fetches one Dianping shop page and prints the inner HTML of the {@code h1}
 * under each {@code #basic-info} match, followed by the inner HTML of the
 * {@code span} elements under each {@code .expand-info.tel} match inside it.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void dazhong() throws Exception {
    Document page = Jsoup.connect("https://www.dianping.com/shop/74474151").get();

    Elements basicInfo = page.select("#basic-info");
    for (int i = 0; i < basicInfo.size(); i++) {
        Elements headings = basicInfo.get(i).select("h1");
        System.out.println(headings.html());
    }

    Elements telBlocks = basicInfo.select(".expand-info.tel");
    for (int i = 0; i < telBlocks.size(); i++) {
        Elements spans = telBlocks.get(i).select("span");
        System.out.println(spans.html());
    }
}
} ` 利用jsoup这款xml解析器,基于dom树解析原理来实现html数据抓取