1. 抓取数据


1.1 导入依赖

<dependency>

<groupId>org.jsoup</groupId>

<artifactId>jsoup</artifactId>

<version>1.10.2</version>

</dependency>


1.2 分析京东

进入京东,搜索java

7. 实战_java



京东搜索结果页中的商品列表,都位于 id 为 J_goodsList 的 div 中:

7. 实战_html_02



1.3 测试

public static void main(String[] args) throws IOException {

String url = "https://search.jd.com/Search?keyword=java&enc=utf-8&wq=java&pvid=b3fb38cde42140b491ea1d8fcb7e1c94";

Connection connection = Jsoup.connect(url);

伪造请求头

connection.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");

connection.header("accept-encoding", "gzip, deflate, br");

connection.header("accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6");

connection.header("referer", "https://search.jd.com/Search?keyword=linux&enc=utf-8&wq=linux&pvid=ba772e43adc143a29f4874c5237d4f1d");

connection.header("sec-ch-ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"90\", \"Google Chrome\";v=\"90\"");

connection.header("sec-ch-ua-mobile", "?0");

connection.header("sec-fetch-dest", "document");

connection.header("sec-fetch-mode", "navigate");

connection.header("sec-fetch-site", "same-origin");

connection.header("sec-fetch-user", "?1");

connection.header("upgrade-insecure-requests", "1");

connection.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");


发起请求

Connection.Response response = connection.ignoreContentType(true).method(Connection.Method.GET).execute();

String body = response.body();

解析网页

Document document = Jsoup.parse(body);


System.out.println(document);

Element elt = document.getElementById("J_goodsList");


System.out.println(elt.html());


}


7. 实战_System_03



1.4 抓取数据

public static void main(String[] args) throws IOException {

String url = "https://search.jd.com/Search?keyword=java&enc=utf-8&wq=java&pvid=b3fb38cde42140b491ea1d8fcb7e1c94";

Connection connection = Jsoup.connect(url);

伪造请求头

connection.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");

connection.header("accept-encoding", "gzip, deflate, br");

connection.header("accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6");

connection.header("referer", "https://search.jd.com/Search?keyword=linux&enc=utf-8&wq=linux&pvid=ba772e43adc143a29f4874c5237d4f1d");

connection.header("sec-ch-ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"90\", \"Google Chrome\";v=\"90\"");

connection.header("sec-ch-ua-mobile", "?0");

connection.header("sec-fetch-dest", "document");

connection.header("sec-fetch-mode", "navigate");

connection.header("sec-fetch-site", "same-origin");

connection.header("sec-fetch-user", "?1");

connection.header("upgrade-insecure-requests", "1");

connection.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");


发起请求

Connection.Response response = connection.ignoreContentType(true).method(Connection.Method.GET).execute();

String body = response.body();

解析网页

Document document = Jsoup.parse(body);

Element elt = document.getElementById("J_goodsList");

Elements lis = elt.getElementsByTag("li");

lis.forEach(li -> {

String price = li.getElementsByClass("p-price").get(0).getElementsByTag("i").text();

String name = li.getElementsByClass("p-name").get(0).text();

String img = li.getElementsByTag("img").attr("src");


System.out.println("================================================================");

System.out.println(name);

System.out.println(price);

System.out.println(img);

});

}


注意:结果中图片的 src 为空,因为京东的商品图片采用了“延迟加载”,真实的图片地址保存在 img 标签的 data-lazy-img 属性中。

7. 实战_System_04



改进:

public static void main(String[] args) throws IOException {

String url = "https://search.jd.com/Search?keyword=java&enc=utf-8&wq=java&pvid=b3fb38cde42140b491ea1d8fcb7e1c94";

Connection connection = Jsoup.connect(url);

伪造请求头

connection.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");

connection.header("accept-encoding", "gzip, deflate, br");

connection.header("accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6");

connection.header("referer", "https://search.jd.com/Search?keyword=linux&enc=utf-8&wq=linux&pvid=ba772e43adc143a29f4874c5237d4f1d");

connection.header("sec-ch-ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"90\", \"Google Chrome\";v=\"90\"");

connection.header("sec-ch-ua-mobile", "?0");

connection.header("sec-fetch-dest", "document");

connection.header("sec-fetch-mode", "navigate");

connection.header("sec-fetch-site", "same-origin");

connection.header("sec-fetch-user", "?1");

connection.header("upgrade-insecure-requests", "1");

connection.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");


发起请求

Connection.Response response = connection.ignoreContentType(true).method(Connection.Method.GET).execute();

String body = response.body();

解析网页

Document document = Jsoup.parse(body);

Element elt = document.getElementById("J_goodsList");

Elements lis = elt.getElementsByTag("li");

lis.forEach(li -> {

String price = li.getElementsByClass("p-price").get(0).getElementsByTag("i").text();

String name = li.getElementsByClass("p-name").get(0).text();

String img = li.getElementsByTag("img").attr("src");


if (img == null || img.equals("")) {

img = li.getElementsByTag("img").attr("data-lazy-img");

}


System.out.println("================================================================");

System.out.println(name);

System.out.println(price);

System.out.println(img);

});

}


7. 实战_html_05



1.5 封装成工具类

创建实体类

/**
 * Plain data holder for one scraped product: its name, image URL and price.
 *
 * <p>No accessors are declared here although other code calls
 * {@code setName}/{@code setImg}/{@code setPrice}; they are presumably
 * generated by Lombok's {@code @Data} (confirm the Lombok dependency is present).
 */
@Data

public class ProductDTO {

// Product title text.
private String name;

// Image URL.
private String img;

// Price kept as text.
private String price;

}

username password

将抓取数据的逻辑,封装到一个工具类中,并让spring管理这个工具类

@Component

public class HtmlUtils {


public static void main(String[] args) throws IOException {

List<ProductDTO> list = new HtmlUtils().fetchFromJD("java");

list.forEach(System.out::println);

}


private String req(String url) throws IOException {

Connection connection = Jsoup.connect(url);

伪造请求头

connection.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");

connection.header("accept-encoding", "gzip, deflate, br");

connection.header("accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6");

connection.header("referer", "https://search.jd.com/Search?keyword=linux&enc=utf-8&wq=linux&pvid=ba772e43adc143a29f4874c5237d4f1d");

connection.header("sec-ch-ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"90\", \"Google Chrome\";v=\"90\"");

connection.header("sec-ch-ua-mobile", "?0");

connection.header("sec-fetch-dest", "document");

connection.header("sec-fetch-mode", "navigate");

connection.header("sec-fetch-site", "same-origin");

connection.header("sec-fetch-user", "?1");

connection.header("upgrade-insecure-requests", "1");

connection.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");


发起请求

Connection.Response response = connection.ignoreContentType(true).method(Connection.Method.GET).execute();

String body = response.body();

return body;

}


public List<ProductDTO> fetchFromJD(String keyword) throws IOException {

List<ProductDTO> list = new ArrayList<>();

String url = "https://search.jd.com/Search?keyword=" + keyword + "&enc=utf-8&wq=&pvid=0d06d5f1a3d74686b524156e571a740a";

String body = req(url);

Document document = Jsoup.parse(body);

Element elt = document.getElementById("J_goodsList");

Elements lis = elt.getElementsByTag("li");

lis.forEach(li -> {

String price = li.getElementsByClass("p-price").get(0).getElementsByTag("i").text();

String name = li.getElementsByClass("p-name").get(0).text();

String img = li.getElementsByTag("img").attr("src");


if (img == null || img.equals("")) {

img = li.getElementsByTag("img").attr("data-lazy-img");

}

ProductDTO dto = new ProductDTO();

dto.setName(name);

dto.setImg(img);

dto.setPrice(price);

list.add(dto);

});

return list;

}


}



2. 数据入ES库

将从京东抓取的数据,索引到索引库中:

@Service

public class ProductService {


@Autowired

private RestHighLevelClient client;


@Autowired

private HtmlUtils htmlUtils;


public boolean indexProductFromJD(String keywords) throws IOException {

从京东抓取数据

List<ProductDTO> productDTOList = htmlUtils.fetchFromJD(keywords);

将数据索引到索引中

BulkRequest bulkRequest = new BulkRequest();

bulkRequest.timeout("2m");

productDTOList.forEach(productDTO -> {

bulkRequest.add(new IndexRequest("jd_products").source(JSON.toJSONString(productDTO),

XContentType.JSON));

});

发送请求

BulkResponse response = client.bulk(bulkRequest, RequestOptions.DEFAULT);


return !response.hasFailures();

}

}


编写IndexController

/**
 * REST controller exposing the scrape-and-index operation.
 */
@RestController
public class IndexController {

    @Autowired
    private ProductService productService;

    /**
     * Triggers scraping and indexing for the keyword taken from the path.
     *
     * @param keywords search term from the {@code {keywords}} path segment
     * @return the text {@code "true"} or {@code "false"}, mirroring the service result
     * @throws IOException if the service call fails
     */
    @GetMapping("index/{keywords}")
    public String index(@PathVariable String keywords) throws IOException {
        return String.valueOf(productService.indexProductFromJD(keywords));
    }
}


启动SpringBoot应用

/**
 * Spring Boot entry point of the application.
 */
@SpringBootApplication
public class App {

    /**
     * Starts the application with {@code App} as the primary source.
     *
     * @param args command-line arguments handed to Spring Boot
     */
    public static void main(String[] args) {
        // Instance form of the SpringApplication.run(App.class, args) shortcut.
        SpringApplication application = new SpringApplication(App.class);
        application.run(args);
    }
}


测试:

7. 实战_java_06



在kibana中,也可以看到抓取到的数据:

7. 实战_System_07




3. 获取数据


在ProductService中,再添加一个方法,从索引库中获取数据,返回给前端

public List<ProductDTO> searchPage(String keyword, int start, int size) throws IOException {

if (start < 1) {

start = 1;

}

/*

GET jd_products/_search

{

"query":{

"term": {

“name”: "java"

}

},

"from": 0,

"size": 2

}

*/

条件搜索

SearchRequest searchRequest = new SearchRequest("jd_products");

SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();


分页

sourceBuilder.from(start);

sourceBuilder.size(size);


精准匹配

sourceBuilder.query(QueryBuilders.termQuery("name", keyword));

sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));


把条件传入给本次请求

searchRequest.source(sourceBuilder);


SearchResponse response = client.search(searchRequest, RequestOptions.DEFAULT);


获取结果

List<ProductDTO> list = new ArrayList<>();

SearchHits hits = response.getHits();

hits.forEach(hit -> {

Map<String, Object> map = hit.getSourceAsMap();

ProductDTO dto = new ProductDTO();

dto.setPrice((String) map.get("price"));

dto.setName((String) map.get("name"));

dto.setImg((String) map.get("img"));

list.add(dto);

});

return list;

}


在IndexController再暴露一个端点

/**
 * HTTP endpoint that forwards the three path variables unchanged to
 * {@code productService.searchPage} and returns its result.
 *
 * @param keyword value of the {@code {keyword}} path segment
 * @param start value of the {@code {start}} path segment
 * @param size value of the {@code {size}} path segment
 * @return the products returned by the service
 * @throws IOException if the service call fails
 */
@GetMapping("/search/{keyword}/{start}/{size}")
public List<ProductDTO> search(@PathVariable String keyword,
                               @PathVariable int start,
                               @PathVariable int size) throws IOException {
    List<ProductDTO> products = productService.searchPage(keyword, start, size);
    return products;
}


测试:

7. 实战_java_08



4. 前端