京东案例
1、导入页面
2、创建jd_goods索引
3、编写业务逻辑
爬取数据
/**
 * Fetches the JD search result page for the given keywords and extracts one
 * {@code Content} (image URL, price text, title text) for every {@code <li>}
 * found under the element whose id is {@code J_goodsList}.
 *
 * @param keywords search terms; URL-encoded as UTF-8 before being appended to the query string
 * @return the extracted items in document order; an empty list when the page
 *         contains no {@code J_goodsList} element
 * @throws IOException if the page cannot be fetched or parsed
 */
public static List<Content> parseJD(String keywords) throws IOException {
    // Encode the keywords so characters such as '&', '#', '+' or '%' cannot alter
    // the query string. String.valueOf keeps the previous rendering of a null argument.
    String url = "https://search.jd.com/Search?keyword="
            + java.net.URLEncoder.encode(String.valueOf(keywords), "UTF-8");
    // Fetch and parse the page; the second argument is the timeout in milliseconds.
    Document document = Jsoup.parse(new URL(url), 30000);

    List<Content> goodsList = new ArrayList<>();
    Element element = document.getElementById("J_goodsList");
    if (element == null) {
        // The result container is absent, so there is nothing to extract.
        return goodsList;
    }

    // Each <li> under the container is treated as one product entry.
    Elements elements = element.getElementsByTag("li");
    for (Element el : elements) {
        // NOTE(review): if the page lazy-loads images, "src" may be empty here — confirm.
        String img = el.getElementsByTag("img").eq(0).attr("src");
        String price = el.getElementsByClass("p-price").eq(0).text();
        String title = el.getElementsByClass("p-name").eq(0).text();

        Content content = new Content();
        content.setImg(img);
        content.setPrice(price);
        content.setTitle(title);
        goodsList.add(content);
    }
    return goodsList;
}
/**
 * Scrapes JD for the given keywords via {@code HtmlParseUtil.parseJD} and
 * bulk-indexes the results into the {@code jd_goods} index as JSON documents.
 * Items whose image URL is null or empty are skipped.
 *
 * @param keywords search terms forwarded to the scraper
 * @return {@code true} when at least one document was submitted and the bulk
 *         response reports no failures; {@code false} when nothing was
 *         eligible for indexing or any bulk item failed
 * @throws IOException if scraping, JSON serialization, or the bulk call fails
 */
@Override
public Boolean parseContent(String keywords) throws IOException {
    List<Content> contentList = HtmlParseUtil.parseJD(keywords);

    // Queue every usable item into a single bulk request.
    BulkRequest bulkRequest = new BulkRequest();
    bulkRequest.timeout("2m");
    // One mapper is enough for the whole batch; no need to rebuild it per item.
    ObjectMapper mapper = new ObjectMapper();
    int queued = 0;
    for (Content item : contentList) {
        String img = item.getImg();
        // Compare string contents, not references, when checking for an empty image URL.
        if (img == null || img.isEmpty()) {
            continue;
        }
        String content = mapper.writeValueAsString(item);
        bulkRequest.add(
                new IndexRequest("jd_goods")
                        .source(content, XContentType.JSON));
        queued++;
    }

    if (queued == 0) {
        // Nothing to index: skip the round trip instead of submitting an empty bulk request.
        return false;
    }

    BulkResponse bulkResponse = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
    System.out.println(bulkResponse.status());
    return !bulkResponse.hasFailures();
}
查询的接口
/**
 * Runs a paged term query on the {@code title} field of the {@code jd_goods}
 * index and returns the source of every hit on the requested page.
 *
 * @param keyword  term to match against {@code title}
 * @param pageNo   1-based page number; values below 1 are treated as 1
 * @param pageSize number of hits per page
 * @return the {@code _source} map of each hit, in the order returned by the search
 * @throws IOException if the search request fails
 */
@Override
public List<Map<String, Object>> searchPage(String keyword, int pageNo, int pageSize) throws IOException {
    if (pageNo <= 1) {
        pageNo = 1;
    }

    SearchRequest searchRequest = new SearchRequest("jd_goods");
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();

    // Paging: "from" is a hit offset, not a page number, so convert the
    // 1-based page into the index of its first hit.
    sourceBuilder.from((pageNo - 1) * pageSize);
    sourceBuilder.size(pageSize);

    // Exact term match on the title field.
    TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
    sourceBuilder.query(termQueryBuilder);
    sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

    // Execute the search.
    searchRequest.source(sourceBuilder);
    SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

    // Collect the source document of each hit.
    List<Map<String, Object>> list = new ArrayList<>();
    for (SearchHit hit : searchResponse.getHits().getHits()) {
        list.add(hit.getSourceAsMap());
    }
    return list;
}
高亮查询接口
/**
 * Runs a paged term query on the {@code title} field of the {@code jd_goods}
 * index with highlighting enabled. For every hit that carries a highlighted
 * {@code title}, the {@code title} entry in the returned source map is
 * replaced with the concatenated highlight fragments.
 *
 * @param keyword  term to match against {@code title}
 * @param pageNo   1-based page number; values below 1 are treated as 1
 * @param pageSize number of hits per page
 * @return the {@code _source} map of each hit, with {@code title} swapped for
 *         its highlighted form where one is available
 * @throws IOException if the search request fails
 */
@Override
public List<Map<String, Object>> highLightSearch(String keyword, int pageNo, int pageSize) throws IOException {
    if (pageNo <= 1) {
        pageNo = 1;
    }

    SearchRequest searchRequest = new SearchRequest("jd_goods");
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();

    // Paging: "from" is a hit offset, not a page number, so convert the
    // 1-based page into the index of its first hit.
    sourceBuilder.from((pageNo - 1) * pageSize);
    sourceBuilder.size(pageSize);

    // Exact term match on the title field.
    TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
    sourceBuilder.query(termQueryBuilder);
    sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

    // Highlighting: wrap matches in the title field with a red <span>.
    HighlightBuilder highlightBuilder = new HighlightBuilder();
    highlightBuilder.field("title");
    highlightBuilder.requireFieldMatch(false);
    highlightBuilder.preTags("<span style='color:red'>");
    highlightBuilder.postTags("</span>");
    sourceBuilder.highlighter(highlightBuilder);

    // Execute the search.
    searchRequest.source(sourceBuilder);
    SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

    // Replace each hit's title with its highlighted version when present.
    List<Map<String, Object>> list = new ArrayList<>();
    for (SearchHit hit : searchResponse.getHits().getHits()) {
        Map<String, HighlightField> highlightFields = hit.getHighlightFields();
        HighlightField title = highlightFields.get("title");
        Map<String, Object> sourceAsMap = hit.getSourceAsMap();
        if (title != null) {
            // Join all highlight fragments into a single string.
            StringBuilder highlightedTitle = new StringBuilder();
            for (Text fragment : title.fragments()) {
                highlightedTitle.append(fragment);
            }
            sourceAsMap.put("title", highlightedTitle.toString());
        }
        list.add(sourceAsMap);
    }
    return list;
}