1.Jsoup概念
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
2.应用场景
适用于数据抓取与解析工作，尤其是其类似于 jQuery 的选择器，可以快速定位要获取的数据片段。
3.例子
如下，实现一个解析一周天气数据的例子：数据由网页抓取得到，需要从中提取气温、风速、风向等信息。
Maven 依赖：
<!-- jsoup HTML parser dependency -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.9.2</version>
</dependency>
// Download the weather forecast page and parse it into a DOM.
String url = "http://www.weather.com.cn/weather/101210101.shtml";
Document doc = Jsoup.connect(url)
        .userAgent("Mozilla")
        .timeout(3000) // milliseconds
        .get();

// Every <li> directly under the <ul> inside the element with id "7d" is one forecast entry.
Elements elems = doc.select("#7d>ul>li");

// Result: one map per entry, holding "date" and a "parts" list (day + night).
List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
for (Element el : elems) {
    // text() is used instead of html() so that entities are decoded
    // (e.g. "&lt;3" becomes "<3") and no nested markup leaks into the values.
    String date = el.select("h1").text();
    String procewind = el.select(".win span").attr("title");
    String wind = el.select(".win>i").text();
    String temp = el.select(".tem span").text() + "/" + el.select(".tem>i").text();
    String weather = el.select(".wea").text();
    String procedure = "";

    // Each part is a map of the form:
    //   "procewind": wind direction (e.g. east wind), "wind": wind force (e.g. level 3-4),
    //   "when": day or night, "temp": the two temperatures joined by "/",
    //   "weather": description (e.g. light rain), "procedure": ""
    // The day and night parts carry the same scraped values; only "when" differs.
    List<Object> parts = new ArrayList<Object>();
    for (String when : new String[] {"白天", "夜晚"}) {
        Map<String, Object> part = new HashMap<String, Object>();
        part.put("procewind", procewind);
        part.put("wind", wind);
        part.put("when", when);
        part.put("temp", temp);
        part.put("weather", weather);
        part.put("procedure", procedure);
        parts.add(part);
    }

    Map<String, Object> map = new HashMap<String, Object>();
    map.put("date", date);
    map.put("parts", parts);
    dataList.add(map);
}
或者使用正则表达式：
// Prefix pattern: the <div id="7d"> container up to and including its <ul class="t clearfix">.
String reg = "<div id=\"7d\".*?<ul class=\"t clearfix\">.*?";

// Alternative per-entry pattern using named groups, kept for reference only:
// String each ="<li.*?sky.*?>.*?<h1>(?<pdate>.*?)</h1>.*?wea\">(?<pweather>.*?)</p>.*?tem\">\\s+(?<ptem>.*?(?<ptem1>\\d+).*?(?<ptem2>\\d+)+.*?)</p>.*?win\">.*?title=\"(?<pwindspeed>.*?)\".*?<i>(?<pwinddir>.*?)</i>.*?</li>.*?";

// Per-entry pattern matching one <li> of the forecast list. Capture groups:
//   1 = content of <h1> (date)
//   2 = content of the "wea" paragraph (weather)
//   3 = content of the "tem" paragraph (temperature)
//   4 = title attribute inside the "win" paragraph
//   5 = content of the <i> inside the "win" paragraph
// NOTE(review): "." does not match line terminators by default, so these patterns
// presumably need Pattern.DOTALL when applied to multi-line HTML; the compile call
// is not shown here, so confirm at the point of use.
String each = "<li.*?sky.*?>.*?<h1>(.*?)</h1>.*?wea\">(.*?)</p>.*?tem\">\\s+(.*?)</p>.*?win\">.*?title=\"(.*?)\".*?<i>(.*?)</i>.*?</li>.*?";
4.参考链接