爬取网页数据的demo
1.引入依赖jsoup
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
2.代码测试
/**
 * Scrapes the JD search-results page for the keyword "java" and returns one
 * map per product listing with keys "price", "title" and "image".
 * Requires network access; parsing happens via jsoup's DOM-like Document.
 *
 * @return list of {price, title, image} maps; empty if the goods list
 *         element is not found (page blocked or layout changed)
 * @throws Exception if the page cannot be fetched or parsed
 */
public List<Map<String,String>> getData() throws Exception {
    // Target page to scrape.
    String url = "https://search.jd.com/Search?keyword=java";
    // Fetch and parse the page (3000 ms timeout); Document is queried like a browser DOM.
    Document document = Jsoup.parse(new URL(url), 3000);
    Element goodsList = document.getElementById("J_goodsList");
    List<Map<String,String>> list = new ArrayList<>();
    // Guard against NPE: getElementById returns null when the element is absent
    // (e.g. anti-bot page or site layout change) — return an empty list instead.
    if (goodsList == null) {
        return list;
    }
    // Each product listing is an <li> inside the goods list container.
    for (Element item : goodsList.getElementsByTag("li")) {
        String price = item.getElementsByClass("p-price").eq(0).text();
        String title = item.getElementsByClass("p-name").eq(0).text();
        // Images are lazy-loaded, so the real URL lives in the
        // data-lazy-img attribute rather than src.
        String image = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
        Map<String,String> map = new HashMap<>();
        map.put("price", price);
        map.put("title", title);
        map.put("image", image);
        list.add(map);
    }
    return list;
}