目录
一、项目所需
1.1导入依赖
<properties>
<java.version>1.8</java.version>
<!-- 统一版本 -->
<elasticsearch.version>7.6.1</elasticsearch.version>
</properties>
导入elasticsearch
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
提前导入fastjson、lombok
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.70</version>
</dependency>
<!-- lombok需要安装插件 -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
1.2创建并编写配置类–>连上ES
/**
 * Spring configuration that exposes the Elasticsearch high-level REST client as a bean,
 * which is how the application connects to ES.
 */
@Configuration
public class ElasticSearchConfig {

    /**
     * Registers the high-level REST client.
     *
     * @return a client built against the node at 127.0.0.1:9200 using plain HTTP
     */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost node = new HttpHost("127.0.0.1", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(node));
    }
}
1.3测试索引的操作
1、索引的创建
/** Creates the "jd_goods" index and prints the response returned by the cluster. */
@Test
void testCreateIndex() throws IOException {
    // Build the create-index request inline and send it through the indices API.
    CreateIndexResponse response = restHighLevelClient
            .indices()
            .create(new CreateIndexRequest("jd_goods"), RequestOptions.DEFAULT);
    System.out.println("执行创建请求===>" + response);
}
2、测试获取索引
/** Checks whether the "kuang_index" index exists and prints the boolean result. */
@Test
void testExistIndex() throws IOException {
    boolean found = restHighLevelClient
            .indices()
            .exists(new GetIndexRequest("kuang_index"), RequestOptions.DEFAULT);
    System.out.println("测试获取索引===>" + found);
}
3、测试删除索引
/** Deletes the "kuang_index" index and prints the acknowledgement response object. */
@Test
void testDeleteIndex() throws IOException {
    AcknowledgedResponse ack = restHighLevelClient
            .indices()
            .delete(new DeleteIndexRequest("kuang_index"), RequestOptions.DEFAULT);
    System.out.println("是否删除成功===>" + ack);
}
4、测试添加文档
/**
 * Indexes one {@code User} document with id "1" into "kuang_index" and prints
 * the response and its status.
 */
@Test
void testAddDocument() throws IOException {
    // Serialize the object to JSON up front; the request carries it as its source.
    String payload = JSON.toJSONString(new User("狂神说", 3));

    IndexRequest indexRequest = new IndexRequest("kuang_index")
            .id("1")
            .source(payload, XContentType.JSON);

    // Two ways of setting the same one-second timeout (typed value and string form).
    indexRequest.timeout(TimeValue.timeValueSeconds(1));
    indexRequest.timeout("1s");

    // Send the request and inspect the result.
    IndexResponse response = restHighLevelClient.index(indexRequest, RequestOptions.DEFAULT);
    System.out.println(response.toString());
    // Status corresponding to the result of the equivalent REST command, e.g. CREATED.
    System.out.println(response.status());
}
5、获取文档 判断是否存在
/** Checks whether document "1" exists in "kuang_index" without fetching its content. */
@Test
void testIsExists() throws IOException {
    GetRequest probe = new GetRequest("kuang_index", "1");
    // Only existence matters here, so skip stored fields and the _source payload.
    probe.storedFields("_none_");
    probe.fetchSourceContext(new FetchSourceContext(false));

    System.out.println(restHighLevelClient.exists(probe, RequestOptions.DEFAULT));
}
6、获取文档的信息
/** Fetches document "1" from "kuang_index" and prints both the request and the response. */
@Test
void testGetDocument() throws IOException {
    GetRequest lookup = new GetRequest("kuang_index", "1");
    GetResponse fetched = restHighLevelClient.get(lookup, RequestOptions.DEFAULT);

    // Print the request first, then whatever the cluster returned for it.
    System.out.println(lookup);
    System.out.println(fetched);
}
7、更新文档的信息
/**
 * Partially updates document "1" with a new {@code User} payload and prints the status.
 *
 * <p>The request targets "kuang_index", the index the other document tests in this file
 * write to; the previous target ("test") is not populated by any of them.
 */
@Test
void testUpdateRequest() throws IOException {
    UpdateRequest updateRequest = new UpdateRequest("kuang_index", "1");
    updateRequest.timeout("1s");

    // The new field values are sent as a JSON partial document.
    User user = new User("狂神说java", 18);
    updateRequest.doc(JSON.toJSONString(user), XContentType.JSON);

    UpdateResponse updateResponse = restHighLevelClient.update(updateRequest, RequestOptions.DEFAULT);
    System.out.println(updateResponse.status());
}
8、删除文档记录
/** Deletes document "2" from "kuang_index" (one-second timeout) and prints the status. */
@Test
void testDeleteRequest() throws IOException {
    DeleteRequest removal = new DeleteRequest("kuang_index", "2");
    removal.timeout("1s");

    DeleteResponse outcome = restHighLevelClient.delete(removal, RequestOptions.DEFAULT);
    System.out.println(outcome.status());
}
9、批量插入数据
/**
 * Bulk-indexes seven {@code User} documents into "kuang_index" with ids "1" through "7"
 * and prints whether any item failed.
 */
@Test
void testBulkRequest() throws IOException {
    ArrayList<User> users = new ArrayList<>();
    users.add(new User("kuangshen1", 3));
    users.add(new User("kuangshen2", 4));
    users.add(new User("kuangshen3", 5));
    users.add(new User("kuangshen4", 6));
    users.add(new User("kuangshen5", 13));
    users.add(new User("kuangshen6", 23));
    users.add(new User("kuangshen7", 33));

    BulkRequest batch = new BulkRequest();
    batch.timeout("10s");

    // One index action per user; document ids are the 1-based position in the list.
    int docId = 1;
    for (User user : users) {
        IndexRequest action = new IndexRequest("kuang_index")
                .id(String.valueOf(docId))
                .source(JSON.toJSONString(user), XContentType.JSON);
        batch.add(action);
        docId++;
    }

    BulkResponse result = restHighLevelClient.bulk(batch, RequestOptions.DEFAULT);
    // false means every action succeeded.
    System.out.println(result.hasFailures());
}
10、查询
/**
 * Runs a term query on the "name" field against the index named by {@code ESconst.ES_INDEX}
 * and prints the hits, first as JSON and then one source map per hit.
 */
@Test
void testSearch() throws IOException {
    // Assemble the search conditions.
    SearchSourceBuilder searchSource = new SearchSourceBuilder();
    // NOTE(review): the no-arg highlighter() looks like a plain getter, so this call
    // probably configures nothing - confirm against the client API.
    searchSource.highlighter();

    // QueryBuilders offers factory methods for each query type:
    //   termQuery()     - exact match
    //   matchAllQuery() - match every document
    TermQueryBuilder exactName = QueryBuilders.termQuery("name", "qinjiang1");
    MatchAllQueryBuilder matchAll = QueryBuilders.matchAllQuery();
    System.out.println("allQueryBuilder===>>" + matchAll);

    searchSource.query(exactName);
    searchSource.timeout(new TimeValue(60, TimeUnit.SECONDS));

    // Attach the conditions to the request and execute it.
    SearchRequest searchRequest = new SearchRequest(ESconst.ES_INDEX);
    searchRequest.source(searchSource);
    SearchResponse response = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

    System.out.println(JSON.toJSONString(response.getHits()));
    System.out.println("=====================================");
    for (SearchHit hit : response.getHits().getHits()) {
        System.out.println(hit.getSourceAsMap());
    }
}
二、ElasticSearch实战
2.1导入依赖
<properties>
<java.version>1.8</java.version>
<elasticsearch.version>7.6.1</elasticsearch.version>
</properties>
<dependencies>
<!-- jsoup解析页面 -->
<!-- 解析网页;若需爬取视频等其他内容,可研究 Apache Tika -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.70</version>
</dependency>
<!-- ElasticSearch -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<!-- thymeleaf -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-thymeleaf</artifactId>
</dependency>
<!-- web -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- devtools热部署 -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<!-- 配置处理器(configuration-processor) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<!-- lombok 需要安装插件 -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<!-- test -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
2.2编写 application.properties 配置文件
# 更改端口,防止冲突
server.port=9999
# 关闭thymeleaf缓存
spring.thymeleaf.cache=false
2.3测试controller和view
/** MVC controller that serves the landing page. */
@Controller
public class IndexController {

    /** Logical name of the view returned for the landing page. */
    private static final String INDEX_VIEW = "index";

    /**
     * Handles GET requests for "/" and "index".
     *
     * @return the view name "index"
     */
    @GetMapping({"/","index"})
    public String index() {
        return INDEX_VIEW;
    }
}
三、爬取京东的数据到ES
3.1编写Config
/** Spring configuration providing the Elasticsearch high-level REST client bean. */
@Configuration
public class ElasticSearchClientConfig {

    /**
     * Builds the client used by the services in this project.
     *
     * @return a client pointed at 127.0.0.1:9200 over HTTP
     */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost localNode = new HttpHost("127.0.0.1", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(localNode));
    }
}
3.2编写service
因为数据是爬取来的,所以不经过 Dao 层;为简化示例,以下代码都不再编写接口,但在实际开发中必须严格按规范编写接口。
ContentService
/**
 * 1. Crawls the goods matching {@code keyword} and bulk-indexes them into "jd_goods".
 *
 * <p>The shared {@code restHighLevelClient} is deliberately NOT closed here: it is reused
 * by every later call on this service, so closing it after the first import would make
 * all subsequent requests fail.
 *
 * <p>NOTE(review): document ids are the 1-based position in the crawl result, so a later
 * import appears to overwrite documents written by an earlier one - confirm this is intended.
 *
 * @param keyword search keyword passed to the crawler
 * @return {@code true} when every item was indexed without failure; {@code false} when the
 *         crawl returned nothing or at least one bulk item failed
 * @throws Exception if crawling or the bulk request fails
 */
public Boolean parseContents(String keyword) throws Exception {
    // Fetch the content to index.
    List<Content> contents = new HtmlParseUtil().params(keyword);
    if (contents == null || contents.isEmpty()) {
        // Nothing to index; avoid sending a bulk request that contains no actions.
        return false;
    }

    // Put the content into ES with a single bulk request.
    BulkRequest bulkRequest = new BulkRequest();
    bulkRequest.timeout("2m"); // adjust to the actual workload
    for (int i = 0; i < contents.size(); i++) {
        bulkRequest.add(
                new IndexRequest("jd_goods")
                        .id("" + (i + 1))
                        .source(JSON.toJSONString(contents.get(i)), XContentType.JSON)
        );
    }

    BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
    return !bulk.hasFailures();
}
/**
 * 2. Searches "jd_goods" by title, one page at a time, with the matched text highlighted.
 *
 * @param keywords phrase to match against the "title" field
 * @param pageNo   1-based page number; values below 1 are treated as 1
 * @param pageSize number of hits per page
 * @return one source map per hit, where "title" is replaced by its highlighted form
 *         whenever the hit carries a highlight for that field
 * @throws IOException if the search request fails
 */
public List<Map<String,Object>> highlightBuilder(String keywords, int pageNo, int pageSize) throws IOException {
    if (pageNo < 1) {
        pageNo = 1;
    }

    // Conditional search against the goods index.
    SearchRequest searchRequest = new SearchRequest("jd_goods");
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();

    // Paging: "from" is a document offset, not a page number, so convert the 1-based page.
    sourceBuilder.from((pageNo - 1) * pageSize);
    sourceBuilder.size(pageSize);

    // A term query is not analyzed, but the IK analyzer tokenizes the indexed title by
    // default, so an exact term query on the whole keyword no longer matches.
    // Phrase matching is used instead.
    MatchPhraseQueryBuilder phraseQuery = QueryBuilders.matchPhraseQuery("title", keywords);
    sourceBuilder.query(phraseQuery);
    sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

    // Highlighting: wrap matches in the title with a red <span>.
    HighlightBuilder highlightBuilder = new HighlightBuilder();
    highlightBuilder.requireFieldMatch(false);
    highlightBuilder.field("title");
    highlightBuilder.preTags("<span style='color:red'>");
    highlightBuilder.postTags("</span>");
    sourceBuilder.highlighter(highlightBuilder);

    // Execute the search.
    searchRequest.source(sourceBuilder);
    SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

    // Convert the hits into plain maps.
    List<Map<String,Object>> list = new ArrayList<>();
    for (SearchHit hit : searchResponse.getHits().getHits()) {
        Map<String, Object> sourceAsMap = hit.getSourceAsMap(); // the original document
        HighlightField title = hit.getHighlightFields().get("title");
        if (title != null) {
            // Join the highlighted fragments and use them in place of the plain title.
            StringBuilder newTitle = new StringBuilder();
            for (Text fragment : title.fragments()) {
                newTitle.append(fragment);
            }
            sourceAsMap.put("title", newTitle.toString());
        }
        list.add(sourceAsMap);
    }
    return list;
}
编写controller
@Autowired
private ContentService contentService;
/**
 * Triggers a crawl for the given keyword and indexes the result.
 *
 * @param keywords keyword taken from the request path
 * @return the value returned by {@code contentService.parseContents}
 * @throws Exception if the service call fails
 */
@GetMapping("/parse/{keywords}")
@ResponseBody
public Boolean parses(@PathVariable("keywords") String keywords) throws Exception {
    Boolean imported = contentService.parseContents(keywords);
    return imported;
}
/**
 * Searches the indexed goods and returns one page of highlighted results.
 *
 * <p>The paging values from the request path are forwarded to the service; they were
 * previously ignored in favour of a fixed page 1 of size 10.
 *
 * @param keywords keyword taken from the request path
 * @param pageNo   page number taken from the request path
 * @param pageSize page size taken from the request path
 * @return the hits returned by {@code contentService.highlightBuilder}
 * @throws IOException if the search fails
 */
@ResponseBody
@GetMapping("/search/{keywords}/{pageNo}/{pageSize}")
public List<Map<String,Object>> search(@PathVariable("keywords") String keywords,
        @PathVariable("pageNo") int pageNo,
        @PathVariable("pageSize") int pageSize) throws IOException {
    return contentService.highlightBuilder(keywords, pageNo, pageSize);
}
3.2爬虫工具类
/** Crawler helper that scrapes goods (image, price, title) from the JD search page. */
public class HtmlParseUtil {

    /** Manual smoke test: crawls one keyword and prints every parsed item. */
    public static void main(String[] args) throws Exception {
        new HtmlParseUtil().params("码出高效").forEach(System.out::println);
    }

    /**
     * Fetches the JD search result page for {@code keywords} and extracts its goods.
     * Requires network access.
     *
     * @param keywords search keyword; it is URL-encoded as UTF-8 before being put in the query
     *                 string, so non-ASCII keywords and reserved characters are sent correctly
     * @return the parsed goods, or an empty list when the page has no goods-list container
     * @throws Exception if the page cannot be fetched or parsed
     */
    public List<Content> params(String keywords) throws Exception {
        // Request URL, e.g. https://search.jd.com/Search?keyword=java
        String encodedKeyword = java.net.URLEncoder.encode(keywords, "UTF-8");
        String url = "https://search.jd.com/Search?keyword=" + encodedKeyword + "&enc=utf-8";

        // Fetch and parse the page; the second argument is the timeout in milliseconds.
        Document document = Jsoup.parse(new URL(url), 30000);

        // Document offers the same lookup methods that are available from JavaScript.
        Element goodsContainer = document.getElementById("J_goodsList");
        List<Content> goodList = new ArrayList<>();
        if (goodsContainer == null) {
            // The container is absent when the site serves a different page than expected;
            // report "no goods" instead of failing with a NullPointerException.
            return goodList;
        }

        // Every <li> under the container is one goods entry.
        for (Element el : goodsContainer.getElementsByTag("li")) {
            // Image-heavy sites lazy-load pictures, so the address is in data-lazy-img.
            String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = el.getElementsByClass("p-price").eq(0).text();
            String title = el.getElementsByClass("p-name").eq(0).text();

            Content content = new Content();
            content.setImg(img);
            content.setTitle(title);
            content.setPrice(price);
            goodList.add(content);
        }
        return goodList;
    }
}
3.3测试效果
待解决问题:
1、分词时 自定义分词 需手动建.dic 添加;
2、轮询时间如何设置为秒;
ElasticSearch入门学习笔记(一)概念篇
ElasticSearch入门学习笔记(二)软件安装篇
SpringBoot集成BBOSS-ElasticSearch实现ElasticSearch客户端
阿里云Docker安装ES\ES_Head\安装部署logstash导mysql数据入ElasticSearch