Spring Boot 整合 Elasticsearch 进行基本操作,并将爬取的京东搜索结果批量插入到 Elasticsearch 中,再进行普通查询与高亮查询。
1. 创建项目并引入 Elasticsearch 依赖(注意:客户端依赖的版本要与 Elasticsearch 服务端的版本保持一致)
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.70</version>
</dependency>
2. 编写elasticsearch连接配置文件
/**
 * Spring configuration that exposes the Elasticsearch high-level REST client as a bean.
 */
@Configuration
public class elasticsearchConfig {

    /**
     * Builds a client pointed at a single node on 127.0.0.1, port 9200, over http.
     *
     * @return the client instance to be registered as a bean
     */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost localNode = new HttpHost("127.0.0.1", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(localNode));
    }
}
3. 在测试类中进行测试
- 创建索引
/**
 * Integration-style tests that exercise the injected Elasticsearch client.
 */
@SpringBootTest
class EsApiApplicationTests {

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /**
     * Creates the "jd_db" index and prints the response returned by the client.
     *
     * @throws IOException if the request cannot be sent or the response cannot be read
     */
    @Test
    void createTest() throws IOException {
        // Describe the index to create.
        CreateIndexRequest request = new CreateIndexRequest("jd_db");
        // Execute through the indices client and show what came back.
        CreateIndexResponse response =
                restHighLevelClient.indices().create(request, RequestOptions.DEFAULT);
        System.out.println(response);
    }
}
- 判断索引是否存在
/**
 * Checks whether the "db_test" index exists and prints the boolean result.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void getTest() throws IOException {
    GetIndexRequest request = new GetIndexRequest("db_test");
    boolean indexExists = restHighLevelClient.indices().exists(request, RequestOptions.DEFAULT);
    System.out.println(indexExists);
}
- 删除索引
/**
 * Deletes the "db_test" index and prints the acknowledgement returned by the client.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void deleteTest() throws IOException {
    DeleteIndexRequest request = new DeleteIndexRequest("db_test");
    AcknowledgedResponse response =
            restHighLevelClient.indices().delete(request, RequestOptions.DEFAULT);
    System.out.println(response);
}
- 向索引中添加文档
/**
 * Indexes one sample book into "db_test" under id "1", then prints the response
 * and its status.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void addDocument() throws IOException {
    // Serialise the sample object to JSON up front.
    String bookJson = JSON.toJSONString(new Book("java", 12));

    // Target index, document id and JSON body.
    IndexRequest request = new IndexRequest("db_test")
            .id("1")
            .source(bookJson, XContentType.JSON);
    // Timeout applied to this request.
    request.timeout("2s");

    // Send it and report the outcome.
    IndexResponse response = restHighLevelClient.index(request, RequestOptions.DEFAULT);
    String responseText = response.toString();
    System.out.println(responseText);
    System.out.println(response.status());
}
- 判断文档是否存在
/**
 * Checks whether document "1" exists in "db_test" and prints the boolean result.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void exitsDcoument() throws IOException {
    // Only existence matters here, so ask for neither _source nor stored fields.
    GetRequest request = new GetRequest("db_test", "1")
            .fetchSourceContext(new FetchSourceContext(false))
            .storedFields("_none_");
    boolean documentExists = restHighLevelClient.exists(request, RequestOptions.DEFAULT);
    System.out.println(documentExists);
}
- 获取文档信息
/**
 * Fetches document "1" from "db_test" and prints first its source as a string,
 * then the whole response object.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void getDocument() throws IOException {
    GetRequest request = new GetRequest("db_test", "1");
    GetResponse response = restHighLevelClient.get(request, RequestOptions.DEFAULT);
    // The document body.
    String documentSource = response.getSourceAsString();
    System.out.println(documentSource);
    // The full response.
    System.out.println(response);
}
- 更新文档信息
/**
 * Updates document "1" in "db_test" with a new sample book and prints the
 * status of the update response.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void updateDocument() throws IOException {
    String replacementJson = JSON.toJSONString(new Book("C++", 13));
    UpdateRequest request = new UpdateRequest("db_test", "1");
    // The JSON document is supplied as the partial document for the update.
    request.doc(replacementJson, XContentType.JSON);
    UpdateResponse response = restHighLevelClient.update(request, RequestOptions.DEFAULT);
    System.out.println(response.status());
}
- 删除文档信息
/**
 * Deletes document "1" from "db_test" and prints the status of the response.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void deleteDocument() throws IOException {
    DeleteRequest request = new DeleteRequest("db_test", "1");
    DeleteResponse response = restHighLevelClient.delete(request, RequestOptions.DEFAULT);
    System.out.println(response.status());
}
- 批量插入数据
/**
 * Indexes three sample books into "db_test" with a single bulk request, using the
 * ids "1", "2" and "3", and prints whether any item of the bulk request failed.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void bulkDocument() throws IOException {
    ArrayList<Book> books = new ArrayList<>();
    books.add(new Book("python", 55));
    books.add(new Book("php", 53));
    books.add(new Book("Go", 34));

    BulkRequest bulkRequest = new BulkRequest();
    for (int i = 0; i < books.size(); i++) {
        // Do the addition numerically before converting to a string:
        // "" + i + 1 concatenates left to right and would give "01", "11", "21".
        String documentId = String.valueOf(i + 1);
        bulkRequest.add(
                new IndexRequest("db_test")
                        .id(documentId)
                        .source(JSON.toJSONString(books.get(i)), XContentType.JSON));
    }

    // Send all index operations in one round trip.
    BulkResponse bulkResponse = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
    System.out.println(bulkResponse.hasFailures());
}
- 查询文档信息
/**
 * Runs a term query for name = "php" and prints the hits: first the whole hits
 * object serialised as JSON, then, after a separator line, each hit's source map.
 * No index name is set on the search request.
 *
 * @throws IOException if the request cannot be sent or the response cannot be read
 */
@Test
void searchDocument() throws IOException {
    // Query definition: exact term match on the "name" field.
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
    sourceBuilder.query(QueryBuilders.termQuery("name", "php"));

    // Attach the query to a search request.
    SearchRequest request = new SearchRequest();
    request.source(sourceBuilder);

    SearchResponse response = restHighLevelClient.search(request, RequestOptions.DEFAULT);
    System.out.println(JSON.toJSONString(response.getHits()));
    System.out.println("------------------------------------");
    SearchHit[] hits = response.getHits().getHits();
    for (SearchHit hit : hits) {
        System.out.println(hit.getSourceAsMap());
    }
}
4. 将爬取的京东搜索信息批量插入到elasticsearch中并进行查询与高亮查询
- 引入依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.70</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
- 编写实体类
/**
 * One product entry scraped from a search result page. All three values are kept
 * as plain strings. Accessors and constructors are generated by the Lombok
 * annotations below.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Content {
// Image reference of the product.
private String img;
// Price text of the product (kept as a string, not a number).
private String price;
// Product name text.
private String pname;
}
- 编写html解析工具类
/**
 * Fetches a JD.com search result page and turns each listed product into a
 * {@link Content}.
 */
@Component
public class HtmlParseUtil {

    /**
     * Manual smoke test: prints every product parsed for the keyword "java".
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched or parsed
     */
    public static void main(String[] args) throws IOException {
        new HtmlParseUtil().getHtmlByJd("java").forEach(System.out::println);
    }

    /**
     * Fetches the search page for {@code keyword} and extracts one {@link Content}
     * per {@code <li>} found inside the element with id {@code J_goodsList}.
     *
     * @param keyword search term; it is URL-encoded as UTF-8 before being placed in
     *                the query string
     * @return the parsed products, or an empty list when the page contains no
     *         {@code J_goodsList} element
     * @throws IOException if the page cannot be fetched or parsed
     */
    public ArrayList<Content> getHtmlByJd(String keyword) throws IOException {
        ArrayList<Content> contents = new ArrayList<>();

        // Example: https://search.jd.com/Search?keyword=java
        // Encode the keyword so that spaces, '&', '#' or non-ASCII characters
        // cannot corrupt the query string.
        String url = "https://search.jd.com/Search?keyword="
                + java.net.URLEncoder.encode(keyword, "UTF-8");

        // Download and parse the page (30 second timeout).
        Document document = Jsoup.parse(new URL(url), 30000);
        Element goodsList = document.getElementById("J_goodsList");
        if (goodsList == null) {
            // getElementById returns null when the id is absent, e.g. if the site
            // served a different page than the expected result list.
            return contents;
        }

        for (Element item : goodsList.getElementsByTag("li")) {
            Content content = new Content();
            // The image address is read from the "data-lazy-img" attribute of the first <img>.
            content.setImg(item.getElementsByTag("img").eq(0).attr("data-lazy-img"));
            content.setPrice(item.getElementsByClass("p-price").eq(0).text());
            content.setPname(item.getElementsByClass("p-name").eq(0).text());
            contents.add(content);
        }
        return contents;
    }
}
- 编写controller
/**
 * REST endpoints that delegate to {@link ContentService}.
 */
@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;

    /**
     * Passes {@code keyword} to {@code ContentService.parseHtml} and returns its result.
     *
     * @param keyword path variable holding the search keyword
     * @return the boolean reported by the service
     * @throws Exception propagated from the service
     */
    @GetMapping("/parseHtml/{keyword}")
    public boolean parseHtml(@PathVariable String keyword) throws Exception {
        return contentService.parseHtml(keyword);
    }

    /**
     * Passes the keyword and paging values to {@code ContentService.searchByJd}
     * and returns the resulting list unchanged.
     *
     * @param keyword  path variable holding the search keyword
     * @param pageNum  path variable forwarded as the service's {@code pageNum}
     * @param pageSize path variable forwarded as the service's {@code pageSize}
     * @return the list of result maps produced by the service
     * @throws IOException propagated from the service
     */
    @GetMapping("/searchByJd/{keyword}/{pageNum}/{pageSize}")
    public List<Map<String, Object>> searchByJd(@PathVariable String keyword,
                                                @PathVariable int pageNum,
                                                @PathVariable int pageSize) throws IOException {
        return contentService.searchByJd(keyword, pageNum, pageSize);
    }
}
- 编写service 将数据批量插入到elasticsearch
/**
 * {@link ContentService} implementation that stores scraped products in the
 * "jd_db" Elasticsearch index and queries them back, optionally with highlighting
 * of the "pname" field.
 */
@Service
public class ContentServiceimpl implements ContentService {

    @Autowired
    private HtmlParseUtil htmlParseUtil;

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /**
     * Scrapes the products for {@code keyword} and indexes them into "jd_db" with
     * one bulk request.
     *
     * @param keyword search keyword handed to the HTML parser
     * @return {@code true} if the bulk request reported no failures; {@code false}
     *         if it reported failures or if nothing was scraped
     * @throws IOException if scraping fails or the bulk request cannot be executed
     */
    @Override
    public boolean parseHtml(String keyword) throws IOException {
        ArrayList<Content> htmlByJds = htmlParseUtil.getHtmlByJd(keyword);
        if (htmlByJds.isEmpty()) {
            // A bulk request without any action is rejected by request validation,
            // so report "nothing indexed" instead of sending it.
            return false;
        }

        BulkRequest bulkRequest = new BulkRequest();
        for (Content content : htmlByJds) {
            // No id is set on the index request.
            bulkRequest.add(
                    new IndexRequest("jd_db").source(JSON.toJSONString(content), XContentType.JSON));
        }
        BulkResponse bulkResponse = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulkResponse.hasFailures();
    }

    /**
     * Runs a term query on "pname" against "jd_db" and returns the source map of
     * every hit.
     *
     * @param keyword  value matched against the "pname" field
     * @param pageNum  value passed to the search's {@code from} setting
     * @param pageSize maximum number of hits to return
     * @return one source map per hit, in the order returned by the search
     * @throws IOException if the search request cannot be executed
     */
    @Override
    public List<Map<String, Object>> searchByJd(String keyword, int pageNum, int pageSize) throws IOException {
        SearchRequest searchRequest = new SearchRequest("jd_db");
        searchRequest.source(buildPnameTermSearch(keyword, pageNum, pageSize));

        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        List<Map<String, Object>> hitList = new ArrayList<>();
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            hitList.add(hit.getSourceAsMap());
        }
        return hitList;
    }

    /**
     * Same search as {@link #searchByJd(String, int, int)}, but with highlighting:
     * when a hit carries highlight fragments for "pname", the "pname" entry of its
     * source map is replaced by the concatenated fragments.
     *
     * @param keyword  value matched against the "pname" field
     * @param pageNum  value passed to the search's {@code from} setting
     * @param pageSize maximum number of hits to return
     * @return one source map per hit, with "pname" highlighted where available
     * @throws IOException if the search request cannot be executed
     */
    public List<Map<String, Object>> searchByJdHighlighter(String keyword, int pageNum, int pageSize) throws IOException {
        SearchSourceBuilder searchSourceBuilder = buildPnameTermSearch(keyword, pageNum, pageSize);

        // Highlight settings: wrap matches in "pname" in a red <span>.
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field("pname");
        // false: do not restrict highlighting to fields that the query itself matched.
        highlightBuilder.requireFieldMatch(false);
        highlightBuilder.preTags("<span style='color:red'>");
        highlightBuilder.postTags("</span>");
        searchSourceBuilder.highlighter(highlightBuilder);

        SearchRequest searchRequest = new SearchRequest("jd_db");
        searchRequest.source(searchSourceBuilder);

        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        List<Map<String, Object>> hitList = new ArrayList<>();
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();
            HighlightField pname = hit.getHighlightFields().get("pname");
            if (pname != null) {
                // Join all fragments and use them in place of the original value.
                StringBuilder highlighted = new StringBuilder();
                for (Text fragment : pname.fragments()) {
                    highlighted.append(fragment);
                }
                sourceAsMap.put("pname", highlighted.toString());
            }
            hitList.add(sourceAsMap);
        }
        return hitList;
    }

    /** Builds the search definition shared by both query methods: paging, term query on "pname", 30 s timeout. */
    private SearchSourceBuilder buildPnameTermSearch(String keyword, int pageNum, int pageSize) {
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        // NOTE(review): from() is a hit offset, so pageNum is used as the number of
        // hits to skip rather than as a page index. Confirm whether callers expect
        // something like (pageNum - 1) * pageSize before changing it.
        searchSourceBuilder.from(pageNum);
        searchSourceBuilder.size(pageSize);
        // Exact term match on the product name field.
        searchSourceBuilder.query(QueryBuilders.termQuery("pname", keyword));
        searchSourceBuilder.timeout(new TimeValue(30, TimeUnit.SECONDS));
        return searchSourceBuilder;
    }
}
- 查看索引中插入的数据
- 启动项目后输入关键字进行查询