Enough talk, folks. Buckle up, we're diving right in!
Crawling and Parsing the Data
1. Add the jsoup dependency to the project
<!-- HTML parsing -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.4</version>
</dependency>
2. Write a utility class to parse the page
package com.guohui.util;

import com.guohui.po.Content;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

@Component
public class HtmlParseUtil {

    public static void main(String[] args) throws Exception {
        new HtmlParseUtil().parseJD("java").forEach(System.out::println);
    }

    // Extracted into a reusable method
    public List<Content> parseJD(String keywords) throws Exception {
        // Build the request URL. Note: content rendered by async Ajax requests cannot be fetched this way
        String url = "https://search.jd.com/Search?keyword=" + keywords;
        // Parse the page (the Document Jsoup returns works like the browser's Document object)
        Document document = Jsoup.parse(new URL(url), 30000);
        // All the methods you can call on document in JS are available here
        Element element = document.getElementById("J_goodsList");
        // Guard: if the request was blocked or the page structure changed, return an empty list instead of throwing an NPE
        if (element == null) {
            return new ArrayList<>();
        }
        // Get all the li elements
        Elements elements = element.getElementsByTag("li");
        ArrayList<Content> goodsList = new ArrayList<>();
        // Extract the content; el here is one li tag per iteration
        for (Element el : elements) {
            // On image-heavy sites like this, all images are lazy-loaded,
            // so the real image URL lives in the data-lazy-img attribute
            String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = el.getElementsByClass("p-price").eq(0).text();
            String title = el.getElementsByClass("p-name").eq(0).text();
            Content content = new Content();
            content.setTitle(title);
            content.setImg(img);
            content.setPrice(price);
            goodsList.add(content);
        }
        return goodsList;
    }
}
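One note: parseJD depends on the Content PO, whose definition isn't shown in this post. As a reference, here is a minimal sketch inferred from the setters called above (the Lombok annotation is an assumption; plain getters and setters work just as well):

package com.guohui.po;

import lombok.Data;

// Sketch of the PO; fields inferred from what HtmlParseUtil sets (assumption: Lombok generates the accessors)
@Data
public class Content {
    private String title;
    private String img;
    private String price;
}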
Writing the Service Layer
The service layer is where the real logic lives, so please read this code carefully!
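The implementation below is written against the ContentService interface, which isn't shown in the post. Judging from the @Override methods, it should look roughly like this:

package com.guohui.service;

import java.io.IOException;
import java.util.List;
import java.util.Map;

// Inferred from the methods ContentServiceImpl overrides below
public interface ContentService {
    boolean parseContent(String keyWord) throws Exception;
    List<Map<String, Object>> searchData(String keyWord, int pageNo, int pageSize) throws IOException;
    List<Map<String, Object>> searchDataHighLight(String keyWord, int pageNo, int pageSize) throws IOException;
}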
import com.alibaba.fastjson2.JSON;
import com.guohui.po.Content;
import com.guohui.service.ContentService;
import com.guohui.util.HtmlParseUtil;
import lombok.extern.slf4j.Slf4j;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

@Service
@Slf4j
public class ContentServiceImpl implements ContentService {

    @Autowired
    private HtmlParseUtil htmlParseUtil;

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /**
     * @Author xgh
     * @Description Parse the page data and index it into ES
     * @Date 2023/7/23 16:09
     * @Return
     **/
    @Override
    public boolean parseContent(String keyWord) throws Exception {
        // Call the page-parsing utility to get the list built from the li elements
        List<Content> contents = htmlParseUtil.parseJD(keyWord);
        log.info("Parsed data: {}", contents);
        // Batch-add the documents
        BulkRequest bulkRequest = new BulkRequest();
        for (int i = 0; i < contents.size(); i++) {
            log.info("Document: {}", contents.get(i));
            bulkRequest.add(new IndexRequest("jd_goods")
                    .source(JSON.toJSONString(contents.get(i)), XContentType.JSON));
        }
        // Execute the bulk request
        BulkResponse responses = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !responses.hasFailures();
    }

    /**
     * @Author xgh
     * @Description Search the indexed data
     * @Date 2023/7/23 16:39
     * @Return
     **/
    @Override
    public List<Map<String, Object>> searchData(String keyWord, int pageNo, int pageSize) throws IOException {
        // Guard against invalid page numbers
        if (pageNo < 1) {
            pageNo = 1;
        }
        // Create the search request
        SearchRequest searchRequest = new SearchRequest("jd_goods");
        // Build the query conditions
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        // Use the QueryBuilders utility to build a term query (exact match against the indexed title tokens)
        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyWord);
        searchSourceBuilder.query(termQueryBuilder);
        // Pagination: from() is a document offset, so convert the page number to an offset
        searchSourceBuilder.from((pageNo - 1) * pageSize);
        searchSourceBuilder.size(pageSize);
        // Set the query timeout
        searchSourceBuilder.timeout(new TimeValue(66, TimeUnit.SECONDS));
        // Attach the query to the request
        searchRequest.source(searchSourceBuilder);
        // Execute the search via the client
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        // Parse the results
        List<Map<String, Object>> mapList = new ArrayList<>();
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            // getSourceAsMap converts each hit into a Map, which we collect into the list
            mapList.add(hit.getSourceAsMap());
        }
        return mapList;
    }

    /**
     * @Author xgh
     * @Description Search with highlighting
     * @Date 2023/7/23 17:13
     * @Return
     **/
    @Override
    public List<Map<String, Object>> searchDataHighLight(String keyWord, int pageNo, int pageSize) throws IOException {
        // Guard against invalid page numbers
        if (pageNo < 1) {
            pageNo = 1;
        }
        // Create the search request
        SearchRequest searchRequest = new SearchRequest("jd_goods");
        // Build the query conditions
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        // Use the QueryBuilders utility to build a term query (exact match against the indexed title tokens)
        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyWord);
        searchSourceBuilder.query(termQueryBuilder);
        // Configure highlighting!
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        // Don't require the highlighted field to match the query field; one highlighted field is enough
        highlightBuilder.requireFieldMatch(false);
        // Field to highlight
        highlightBuilder.field("title");
        // Highlight color (note: inline CSS uses a colon, so it's 'color:red', not 'color=red')
        highlightBuilder.preTags("<span style='color:red'>");
        highlightBuilder.postTags("</span>");
        searchSourceBuilder.highlighter(highlightBuilder);
        // Pagination: from() is a document offset, so convert the page number to an offset
        searchSourceBuilder.from((pageNo - 1) * pageSize);
        searchSourceBuilder.size(pageSize);
        // Set the query timeout
        searchSourceBuilder.timeout(new TimeValue(66, TimeUnit.SECONDS));
        // Attach the query to the request
        searchRequest.source(searchSourceBuilder);
        // Execute the search via the client
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        // Parse the results
        List<Map<String, Object>> mapList = new ArrayList<>();
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            // Extract the highlighted fragments
            Map<String, HighlightField> highlightFields = hit.getHighlightFields();
            HighlightField title = highlightFields.get("title");
            Map<String, Object> sourceAsMap = hit.getSourceAsMap(); // the original query result
            // Swap the highlighted fragment into the original result!
            if (title != null) {
                Text[] fragments = title.fragments();
                StringBuilder newTitle = new StringBuilder();
                for (Text text : fragments) {
                    newTitle.append(text);
                }
                // Replace the plain field with the highlighted one
                sourceAsMap.put("title", newTitle.toString());
            }
            mapList.add(sourceAsMap);
        }
        return mapList;
    }
}
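The service autowires a RestHighLevelClient, so a client bean has to be registered somewhere in the project. If you don't have one yet, a minimal config class for the 7.x high-level client could look like the sketch below (the host and port are assumptions for a local single-node setup):

package com.guohui.config;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class ElasticSearchClientConfig {

    // Assumption: Elasticsearch runs locally on the default port 9200
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        return new RestHighLevelClient(
                RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")));
    }
}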
Writing the API Layer
import com.guohui.service.ContentService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;
import java.util.List;
import java.util.Map;

@RestController
public class ContentApi {

    @Autowired
    private ContentService contentService;

    /**
     * @Author xgh
     * @Description Parse the page data and store it in ES
     * @Date 2023/7/23 16:58
     * @Return
     **/
    @GetMapping("/parse/{keyWord}")
    public Boolean parse(@PathVariable("keyWord") String keyWord) throws Exception {
        return contentService.parseContent(keyWord);
    }

    /**
     * @Author xgh
     * @Description Query the ES data
     * @Date 2023/7/23 16:58
     * @Return
     **/
    @GetMapping("/search/{keyWord}/{pageNo}/{pageSize}")
    public List<Map<String, Object>> searchData(@PathVariable("keyWord") String keyWord,
                                                @PathVariable("pageNo") int pageNo,
                                                @PathVariable("pageSize") int pageSize) throws IOException {
        return contentService.searchData(keyWord, pageNo, pageSize);
    }

    /**
     * @Author xgh
     * @Description Query the ES data with highlighting
     * @Date 2023/7/23 16:58
     * @Return
     **/
    @GetMapping("/searchHigh/{keyWord}/{pageNo}/{pageSize}")
    public List<Map<String, Object>> searchDataHighLight(@PathVariable("keyWord") String keyWord,
                                                         @PathVariable("pageNo") int pageNo,
                                                         @PathVariable("pageSize") int pageSize) throws IOException {
        return contentService.searchDataHighLight(keyWord, pageNo, pageSize);
    }
}
Calling the APIs and Checking the Results
1. Call the endpoint that crawls the data and stores it in ES, e.g. GET /parse/java.
Then check in the elasticsearch-head visualization tool whether the crawled documents actually landed in the jd_goods index.
2. Call the plain search endpoint, e.g. GET /search/java/1/10.
3. Call the highlighted search endpoint, e.g. GET /searchHigh/java/1/10. (A runnable smoke test for all three follows below.)
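As a quick way to exercise all three endpoints, here is a minimal smoke test using the JDK's built-in HTTP client. The host, port, and keyword are assumptions for a default local Spring Boot run; adjust them to your environment.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class ContentApiSmokeTest {

    public static void main(String[] args) throws Exception {
        // Assumption: the application runs locally on Spring Boot's default port 8080
        String base = "http://localhost:8080";
        HttpClient client = HttpClient.newHttpClient();
        // 1. Crawl the data and index it into ES
        send(client, base + "/parse/java");
        // 2. Plain paginated search
        send(client, base + "/search/java/1/10");
        // 3. Highlighted search
        send(client, base + "/searchHigh/java/1/10");
    }

    private static void send(HttpClient client, String url) throws Exception {
        HttpRequest request = HttpRequest.newBuilder(URI.create(url)).GET().build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(url + " -> " + response.body());
    }
}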
Finally, the front end just needs to render the returned title as raw HTML (for example, with Vue's v-html directive) so that the highlight <span> tags actually take effect!
That wraps up crawling page data into ES and building the search feature on top of it. For the details, please read through the code in each layer carefully; the comments are quite thorough!
More updates are on the way, so I hope you'll stay tuned~