案例demo
使用的是 jsoup 库，Maven 依赖如下：
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Demo showing how to scrape a JD.com search-result page with jsoup and print
 * a few fields (image, shop title, product name, price) for every list item.
 */
public class HtmlParseUtil {

    /**
     * Search page to fetch. A different product can be crawled by building the
     * URL through concatenation: "https://search.jd.com/Search?keyword=" + product.
     */
    private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=java";

    /** Timeout value handed to {@code Jsoup.parse(URL, int)}. */
    private static final int TIMEOUT = 30000;

    /** Id of the element that wraps the product list on the result page. */
    private static final String GOODS_LIST_ID = "J_goodsList";

    /**
     * Fetches the search page, locates the product-list container and prints
     * image source, shop title, product name (with the promotional words
     * stripped) and price for every {@code <li>} found inside it.
     *
     * @throws IOException if the page cannot be fetched or parsed, or if the
     *                     product-list container is not present in the document
     */
    public void test1() throws IOException {
        Document document = Jsoup.parse(new URL(SEARCH_URL), TIMEOUT);

        Element goodsList = document.getElementById(GOODS_LIST_ID);
        if (goodsList == null) {
            // Without this guard a missing container surfaces as a bare NullPointerException.
            throw new IOException(
                    "Element with id '" + GOODS_LIST_ID + "' not found in " + SEARCH_URL);
        }

        Elements items = goodsList.getElementsByTag("li");
        for (Element item : items) {
            String img = item.getElementsByTag("img").eq(0).attr("src");
            String price = item.getElementsByClass("p-price").eq(0).text().replace("¥", "");
            // NOTE(review): the argument holds two class names separated by a space;
            // confirm getElementsByClass matches as intended here.
            String title = item.getElementsByClass("curr-shop hd-shopname").eq(0).attr("title");
            String promoWords = item.getElementsByClass("promo-words").eq(0).text();
            String name = item.getElementsByClass("p-name").eq(0).text();

            System.out.println(img);
            System.out.println(title);
            // Literal replace: the promo text is scraped content and must not be
            // interpreted as a regular expression (replaceAll would do that).
            System.out.println(name.replace(promoWords, ""));
            System.out.println(price);
            System.out.println("========================");
        }
    }

    /**
     * Runs the demo crawl.
     *
     * @param args unused
     * @throws IOException propagated from {@link #test1()}
     */
    public static void main(String[] args) throws IOException {
        // test1() is an instance method, so an instance is required to call it.
        new HtmlParseUtil().test1();
    }
}
爬取数据的过程就是先解析出 HTML，再根据 HTML 中的 id、class 以及标签（如 div、li）进行筛选，提取所需内容。