1. 抓取数据
1.1 导入依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
1.2 分析京东
进入京东,搜索java
京东搜索结果页的商品列表,都在 id 为 J_goodsList 的 div 中:
1.3 测试
public static void main(String[] args) throws IOException {
String url = "https://search.jd.com/Search?keyword=java&enc=utf-8&wq=java&pvid=b3fb38cde42140b491ea1d8fcb7e1c94";
Connection connection = Jsoup.connect(url);
伪造请求头
connection.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
connection.header("accept-encoding", "gzip, deflate, br");
connection.header("accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6");
connection.header("referer", "https://search.jd.com/Search?keyword=linux&enc=utf-8&wq=linux&pvid=ba772e43adc143a29f4874c5237d4f1d");
connection.header("sec-ch-ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"90\", \"Google Chrome\";v=\"90\"");
connection.header("sec-ch-ua-mobile", "?0");
connection.header("sec-fetch-dest", "document");
connection.header("sec-fetch-mode", "navigate");
connection.header("sec-fetch-site", "same-origin");
connection.header("sec-fetch-user", "?1");
connection.header("upgrade-insecure-requests", "1");
connection.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");
发起请求
Connection.Response response = connection.ignoreContentType(true).method(Connection.Method.GET).execute();
String body = response.body();
解析网页
Document document = Jsoup.parse(body);
System.out.println(document);
Element elt = document.getElementById("J_goodsList");
System.out.println(elt.html());
}
1.4 抓取数据
public static void main(String[] args) throws IOException {
String url = "https://search.jd.com/Search?keyword=java&enc=utf-8&wq=java&pvid=b3fb38cde42140b491ea1d8fcb7e1c94";
Connection connection = Jsoup.connect(url);
伪造请求头
connection.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
connection.header("accept-encoding", "gzip, deflate, br");
connection.header("accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6");
connection.header("referer", "https://search.jd.com/Search?keyword=linux&enc=utf-8&wq=linux&pvid=ba772e43adc143a29f4874c5237d4f1d");
connection.header("sec-ch-ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"90\", \"Google Chrome\";v=\"90\"");
connection.header("sec-ch-ua-mobile", "?0");
connection.header("sec-fetch-dest", "document");
connection.header("sec-fetch-mode", "navigate");
connection.header("sec-fetch-site", "same-origin");
connection.header("sec-fetch-user", "?1");
connection.header("upgrade-insecure-requests", "1");
connection.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");
发起请求
Connection.Response response = connection.ignoreContentType(true).method(Connection.Method.GET).execute();
String body = response.body();
解析网页
Document document = Jsoup.parse(body);
Element elt = document.getElementById("J_goodsList");
Elements lis = elt.getElementsByTag("li");
lis.forEach(li -> {
String price = li.getElementsByClass("p-price").get(0).getElementsByTag("i").text();
String name = li.getElementsByClass("p-name").get(0).text();
String img = li.getElementsByTag("img").attr("src");
System.out.println("================================================================");
System.out.println(name);
System.out.println(price);
System.out.println(img);
});
}
注意,结果中没有图片的src,因为图片采用了“延迟加载”,真实的图片地址保存在 img 标签的 data-lazy-img 属性中
改进:
public static void main(String[] args) throws IOException {
String url = "https://search.jd.com/Search?keyword=java&enc=utf-8&wq=java&pvid=b3fb38cde42140b491ea1d8fcb7e1c94";
Connection connection = Jsoup.connect(url);
伪造请求头
connection.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
connection.header("accept-encoding", "gzip, deflate, br");
connection.header("accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6");
connection.header("referer", "https://search.jd.com/Search?keyword=linux&enc=utf-8&wq=linux&pvid=ba772e43adc143a29f4874c5237d4f1d");
connection.header("sec-ch-ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"90\", \"Google Chrome\";v=\"90\"");
connection.header("sec-ch-ua-mobile", "?0");
connection.header("sec-fetch-dest", "document");
connection.header("sec-fetch-mode", "navigate");
connection.header("sec-fetch-site", "same-origin");
connection.header("sec-fetch-user", "?1");
connection.header("upgrade-insecure-requests", "1");
connection.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");
发起请求
Connection.Response response = connection.ignoreContentType(true).method(Connection.Method.GET).execute();
String body = response.body();
解析网页
Document document = Jsoup.parse(body);
Element elt = document.getElementById("J_goodsList");
Elements lis = elt.getElementsByTag("li");
lis.forEach(li -> {
String price = li.getElementsByClass("p-price").get(0).getElementsByTag("i").text();
String name = li.getElementsByClass("p-name").get(0).text();
String img = li.getElementsByTag("img").attr("src");
if (img == null || img.equals("")) {
img = li.getElementsByTag("img").attr("data-lazy-img");
}
System.out.println("================================================================");
System.out.println(name);
System.out.println(price);
System.out.println(img);
});
}
1.5 封装成工具类
创建实体类
/**
 * Data holder for one product scraped from a search result page.
 * All three values are kept as the text that was read from the page.
 */
@Data
public class ProductDTO {
// Product title text.
private String name;
// Image address of the product.
private String img;
// Price text; stored as a String, not as a number.
private String price;
}
username password
将抓取数据的逻辑,封装到一个工具类中,并让spring管理这个工具类
@Component
public class HtmlUtils {
public static void main(String[] args) throws IOException {
List<ProductDTO> list = new HtmlUtils().fetchFromJD("java");
list.forEach(System.out::println);
}
private String req(String url) throws IOException {
Connection connection = Jsoup.connect(url);
伪造请求头
connection.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
connection.header("accept-encoding", "gzip, deflate, br");
connection.header("accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6");
connection.header("referer", "https://search.jd.com/Search?keyword=linux&enc=utf-8&wq=linux&pvid=ba772e43adc143a29f4874c5237d4f1d");
connection.header("sec-ch-ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"90\", \"Google Chrome\";v=\"90\"");
connection.header("sec-ch-ua-mobile", "?0");
connection.header("sec-fetch-dest", "document");
connection.header("sec-fetch-mode", "navigate");
connection.header("sec-fetch-site", "same-origin");
connection.header("sec-fetch-user", "?1");
connection.header("upgrade-insecure-requests", "1");
connection.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");
发起请求
Connection.Response response = connection.ignoreContentType(true).method(Connection.Method.GET).execute();
String body = response.body();
return body;
}
public List<ProductDTO> fetchFromJD(String keyword) throws IOException {
List<ProductDTO> list = new ArrayList<>();
String url = "https://search.jd.com/Search?keyword=" + keyword + "&enc=utf-8&wq=&pvid=0d06d5f1a3d74686b524156e571a740a";
String body = req(url);
Document document = Jsoup.parse(body);
Element elt = document.getElementById("J_goodsList");
Elements lis = elt.getElementsByTag("li");
lis.forEach(li -> {
String price = li.getElementsByClass("p-price").get(0).getElementsByTag("i").text();
String name = li.getElementsByClass("p-name").get(0).text();
String img = li.getElementsByTag("img").attr("src");
if (img == null || img.equals("")) {
img = li.getElementsByTag("img").attr("data-lazy-img");
}
ProductDTO dto = new ProductDTO();
dto.setName(name);
dto.setImg(img);
dto.setPrice(price);
list.add(dto);
});
return list;
}
}
2. 数据入ES库
将从京东抓取的数据,索引到索引库中:
@Service
public class ProductService {
@Autowired
private RestHighLevelClient client;
@Autowired
private HtmlUtils htmlUtils;
public boolean indexProductFromJD(String keywords) throws IOException {
从京东抓取数据
List<ProductDTO> productDTOList = htmlUtils.fetchFromJD(keywords);
将数据索引到索引中
BulkRequest bulkRequest = new BulkRequest();
bulkRequest.timeout("2m");
productDTOList.forEach(productDTO -> {
bulkRequest.add(new IndexRequest("jd_products").source(JSON.toJSONString(productDTO),
XContentType.JSON));
});
发送请求
BulkResponse response = client.bulk(bulkRequest, RequestOptions.DEFAULT);
return !response.hasFailures();
}
}
编写IndexController
/**
 * REST controller that exposes the product indexing operation over HTTP.
 */
@RestController
public class IndexController {
    @Autowired
    private ProductService productService;

    /**
     * Triggers scraping and indexing for the keyword taken from the request path.
     *
     * @param keywords keyword path variable
     * @return "true" or "false", the textual form of the service result
     * @throws IOException propagated from the service call
     */
    @GetMapping("index/{keywords}")
    public String index(@PathVariable String keywords) throws IOException {
        return String.valueOf(productService.indexProductFromJD(keywords));
    }
}
启动SpringBoot应用
/**
 * Entry point of the Spring Boot application.
 */
@SpringBootApplication
public class App {
    /**
     * Starts the Spring application with {@code App} as its primary source.
     *
     * @param args command-line arguments handed on to Spring
     */
    public static void main(String[] args) {
        new SpringApplication(App.class).run(args);
    }
}
测试:
在kibana中,也可以看到抓取到的数据:
3. 获取数据
在ProductService中,再添加一个方法,从索引库中获取数据,返回给前端
public List<ProductDTO> searchPage(String keyword, int start, int size) throws IOException {
if (start < 1) {
start = 1;
}
/*
GET jd_products/_search
{
"query":{
"term": {
“name”: "java"
}
},
"from": 0,
"size": 2
}
*/
条件搜索
SearchRequest searchRequest = new SearchRequest("jd_products");
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
分页
sourceBuilder.from(start);
sourceBuilder.size(size);
精准匹配
sourceBuilder.query(QueryBuilders.termQuery("name", keyword));
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
把条件传入给本次请求
searchRequest.source(sourceBuilder);
SearchResponse response = client.search(searchRequest, RequestOptions.DEFAULT);
获取结果
List<ProductDTO> list = new ArrayList<>();
SearchHits hits = response.getHits();
hits.forEach(hit -> {
Map<String, Object> map = hit.getSourceAsMap();
ProductDTO dto = new ProductDTO();
dto.setPrice((String) map.get("price"));
dto.setName((String) map.get("name"));
dto.setImg((String) map.get("img"));
list.add(dto);
});
return list;
}
在IndexController再暴露一个端点
/**
 * Search endpoint: hands the three path variables to the product service and returns
 * its result to the client.
 *
 * @param keyword search keyword path variable
 * @param start   paging position, forwarded unchanged to the service
 * @param size    page size, forwarded unchanged to the service
 * @return the products returned by the service
 * @throws IOException propagated from the service call
 */
@GetMapping("/search/{keyword}/{start}/{size}")
public List<ProductDTO> search(@PathVariable String keyword, @PathVariable int start, @PathVariable int size) throws IOException {
    List<ProductDTO> page = productService.searchPage(keyword, start, size);
    return page;
}
测试:
4. 前端
略