jsoup网络爬虫学习

`package jsoup;

import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;

/**
 * A collection of small jsoup scraping demos.
 *
 * <p>Each static method downloads one page with {@code Jsoup.connect(...).get()},
 * selects a handful of elements and prints their HTML or link targets to
 * standard output. When a selector matches nothing, the demo prints nothing.
 */
public class test {
    /** Page fetched by {@link #test01()}. */
    static String URL = "https://www.dianping.com/shop";

    /**
     * Runs the {@link #dazhong()} demo.
     *
     * <p>Replace the call with {@link #test01()}, {@link #tiaozhuanlianjie()},
     * {@link #bokeneirong()} or any other demo in this class to try it instead.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        dazhong();
    }

    /**
     * Fetches {@link #URL} and prints the inner HTML of every {@code <span>}
     * found in the address containers, followed by those in the phone containers.
     *
     * @throws Exception if the page cannot be fetched
     */
    public static void test01() throws Exception {
        Document doc = Jsoup.connect(URL).get();
        printSpanHtml(doc, "expand-info address");
        printSpanHtml(doc, "expand-info tel");
    }

    /**
     * Prints the inner HTML of each {@code <span>} inside the elements whose
     * {@code class} attribute equals {@code classValue}.
     *
     * <p>The lookup compares the whole attribute value rather than applying a
     * CSS class selector, so {@code classValue} may contain spaces.
     */
    private static void printSpanHtml(Document doc, String classValue) {
        for (Element container : doc.getElementsByAttributeValue("class", classValue)) {
            for (Element span : container.getElementsByTag("span")) {
                System.out.println(span.html());
            }
        }
    }

    /**
     * Fetches a search result page and prints the absolute link of each shop
     * entry in the {@code #shop-all-list} list.
     *
     * @throws Exception if the page cannot be fetched
     */
    public static void tiaozhuanlianjie() throws Exception {
        Document doc = Jsoup.connect("https://www.dianping.com/search/keyword/5/0_%E7%82%B8%E9%B8%A1").get();

        for (Element shop : doc.select("#shop-all-list>ul li")) {
            // The "abs:" prefix asks jsoup for the href resolved to an absolute URL.
            String link = shop.select(".txt").select("div.tit>a").attr("abs:href");
            System.out.println(link);
        }
    }

    /**
     * Fetches the explore page and prints the absolute link of every feed item
     * heading inside the first {@code .tab-panel}.
     *
     * <p>Prints nothing when the page contains no {@code .tab-panel}.
     *
     * @throws Exception if the page cannot be fetched
     */
    public static void zhihu() throws Exception {
        Document doc = Jsoup.connect("https://www.zhihu.com/explore").get();

        Elements panels = doc.select(".tab-panel");
        if (panels.isEmpty()) {
            // Nothing to list; get(0) would throw IndexOutOfBoundsException here.
            return;
        }

        Elements feedItems = panels.get(0).select("div div.explore-feed.feed-item");
        for (Element feedItem : feedItems) {
            String url = feedItem.select("h2>a").attr("abs:href");
            System.out.println(url);
        }
    }

    /**
     * Fetches the topic page and prints the absolute link of every folded feed
     * item heading in the general list.
     *
     * @throws Exception if the page cannot be fetched
     */
    public static void zhihu2() throws Exception {
        Document doc = Jsoup.connect("https://www.zhihu.com/topic").get();

        Elements feedItems = doc.select(".zh-general-list.clearfix div.feed-item.feed-item-hook.folding");
        for (Element feedItem : feedItems) {
            String url = feedItem.select("div.feed-main")
                    .select("div.feed-content")
                    .select("h2>a")
                    .attr("abs:href");
            System.out.println(url);
        }
    }

    /**
     * Fetches the blog home page and prints the absolute link of each post
     * title in {@code #post_list}.
     *
     * @throws Exception if the page cannot be fetched
     */
    public static void boke() throws Exception {
        Document doc = Jsoup.connect("https://www.cnblogs.com/").get();

        Elements postBodies = doc.select("#post_list").select(".post_item").select(".post_item_body");
        for (Element postBody : postBodies) {
            String url = postBody.select("h3>a").attr("abs:href");
            System.out.println(url);
        }
    }

    /**
     * Fetches a single blog post, prints its title and then the inner HTML of
     * up to the first eight {@code <p>} elements of the post body.
     *
     * <p>Posts with fewer than eight paragraphs print only the paragraphs that
     * exist instead of failing with {@code IndexOutOfBoundsException}.
     *
     * @throws Exception if the page cannot be fetched
     */
    public static void bokeneirong() throws Exception {
        final int maxParagraphs = 8;

        Document doc = Jsoup.connect("http://www.cnblogs.com/pjsweb/p/7122545.html").get();
        System.out.println(doc.title());

        // Select the paragraphs once per matched body rather than once per index.
        Elements bodies = doc.select("#cnblogs_post_body");
        Elements[] paragraphsByBody = new Elements[bodies.size()];
        for (int b = 0; b < paragraphsByBody.length; b++) {
            paragraphsByBody[b] = bodies.get(b).select("p");
        }

        // Index-major order: paragraph i of every body is printed before paragraph i + 1.
        for (int i = 0; i < maxParagraphs; i++) {
            for (Elements paragraphs : paragraphsByBody) {
                if (i < paragraphs.size()) {
                    System.out.println(paragraphs.get(i).html());
                }
            }
        }
    }

    /**
     * Fetches one shop page and prints the HTML of the {@code <h1>} inside
     * {@code #basic-info}, followed by the HTML of the {@code <span>} inside
     * each {@code .expand-info.tel} element found under it.
     *
     * @throws Exception if the page cannot be fetched
     */
    public static void dazhong() throws Exception {
        Document doc = Jsoup.connect("https://www.dianping.com/shop/74474151").get();

        Elements basicInfo = doc.select("#basic-info");
        for (Element info : basicInfo) {
            System.out.println(info.select("h1").html());
        }

        for (Element phone : basicInfo.select(".expand-info.tel")) {
            System.out.println(phone.select("span").html());
        }
    }
}
` 利用jsoup这款xml解析器,基于dom树解析原理来实现html数据抓取

转载于:https://my.oschina.net/u/3465258/blog/1186421

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值