Jsoup:使用Java将爬虫得到的数据写入Excel,Jsoup得到的数据进行持久化,爬虫数据保存到本地Excel中
一、资源
二、代码
-
xml依赖
<!-- Excel read/write via annotations (@ExcelProperty on the entity) -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>easyexcel</artifactId>
    <version>3.0.5</version>
</dependency>
<!-- JSON support. NOTE: versions below 1.2.83 (including 1.2.41) carry
     publicly known deserialization RCE vulnerabilities; 1.2.83 is the
     patched 1.2.x release and is drop-in compatible. -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.83</version>
</dependency>
<!-- Jsoup: fetches and parses the crawled HTML pages -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
<!-- Lombok: compile-time code generation only; version comes from the parent/BOM -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <optional>true</optional>
</dependency>
-
实体类
// Excel row model for one scraped JD product.
// Field declaration order defines the Excel column order; @ExcelProperty
// values are the literal (Chinese) column headers written to the sheet.
// Lombok @Data generates getters/setters/equals/hashCode/toString; the
// no-args constructor is required by EasyExcel, the all-args one is a
// convenience for the scraper.
@Data @NoArgsConstructor @AllArgsConstructor public class Content {
    // Column "商品名称" — product name text
    @ExcelProperty("商品名称") private String name;
    // Column "商品价格" — price as displayed on the page (kept as raw text)
    @ExcelProperty("商品价格") private String price;
    // Column "商品图片路径" — lazy-loaded image URL taken from data-lazy-img
    @ExcelProperty("商品图片路径") private String img;
}
-
写表工具类
@Component public class HtmlParseUtil { public static void main(String[] args) throws Exception { String fileName = "D:\\IDEA\\Jsoup\\parseJD.xlsx"; EasyExcel.write(fileName, Content.class) .sheet("Jsoup") .doWrite(new HtmlParseUtil().parseJD("java")); } public List<Content> parseJD(String keyword) throws Exception { //获取请求 https://search.jd.com/Search?keyword=java String url = "https://search.jd.com/Search?keyword=" + keyword; ArrayList<Content> contents = new ArrayList<>(); //解析网页 Document document = Jsoup.parse(new URL(url), 300000); //获取产品列表 Element element = document.getElementById("J_goodsList"); //获取产品列表中的li元素 Elements li = element.getElementsByTag("li"); //获取li里面的具体内容 for (Element el : li) { String name = el.getElementsByClass("p-name").eq(0).text(); String price = el.getElementsByClass("p-price").eq(0).text(); String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img"); Content content = new Content(); content.setName(name); content.setPrice(price); content.setImg(img); contents.add(content); } return contents; } }