Elastic Search上传文件到索引库(doc,pdf,txt,ppt转string)
- 本文通过 FileService 服务将 doc、docx、pdf、txt、ppt 等常见文档格式转换为 String 类型并存储到 ES 索引库,从而完成对文档的索引,并对检索结果进行高亮显示。
- MyConfig.java (连接索引库的配置文件)
package com.location.elasticsearch.util;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import javax.annotation.PostConstruct;
import java.net.InetAddress;
import java.net.UnknownHostException;
@Configuration
public class MyConfig {
@PostConstruct
public void init() {
System.setProperty("es.set.netty.runtime.available.processors", "false");
}
@Bean
public TransportClient client() throws UnknownHostException {
TransportAddress node = new TransportAddress(
InetAddress.getByName("192.168.1.72"),
9300
);
Settings settings = Settings.builder()
.put("cluster.name","elasticsearch")
.build();
TransportClient client = new PreBuiltTransportClient(settings);
client.addTransportAddress(node);
return client;
}
}
- EsAssistController.java (控制层)
package com.location.elasticsearch.controller;
import com.location.elasticsearch.domain.RetDTO;
import com.location.elasticsearch.domain.SearchDTO;
import com.location.elasticsearch.service.EsAssistService;
import com.location.elasticsearch.service.FileService;
import lombok.extern.slf4j.Slf4j;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import javax.annotation.Resource;
import java.net.URLDecoder;
import java.text.SimpleDateFormat;
import java.util.*;
/**
 * REST endpoints under {@code /es/assist} for uploading, fetching, searching and
 * deleting documents stored in the Elasticsearch index {@code "text"}.
 */
@RestController
@RequestMapping("/es/assist")
@Slf4j
public class EsAssistController {
    @Autowired
    private TransportClient transportClient;
    @Autowired
    private FileService fileService;
    @Resource
    private EsAssistService esAssistService;

    /**
     * Fetches one document from index {@code "text"} by mapping type and id.
     *
     * @param id   document id; an empty value yields a 404 result with message "id为空"
     * @param type mapping type passed to the get request (defaults to an empty string)
     * @return the document source wrapped in a success result, or a 404 result
     *         when the id is empty or the document does not exist
     */
    @GetMapping("/findAssistById")
    @ResponseBody
    public RetDTO get(@RequestParam(name = "id" , defaultValue = "") String id,
                      @RequestParam(name = "type" , defaultValue = "") String type) throws Exception{
        if (id.isEmpty()){
            return new RetDTO(404,"id为空");
        }
        GetResponse response = this.transportClient.prepareGet("text",type,id).get();
        if (!response.isExists()) {
            return new RetDTO(404,"未查到");
        }
        return RetDTO.getReturnJson(response.getSource());
    }

    /**
     * Extracts the text of an uploaded file and indexes it, together with its
     * metadata, as a new {@code _doc} document in index {@code "text"}.
     *
     * @param title  document title
     * @param author document author
     * @param type   value stored in the document's {@code type} field
     * @param file   uploaded file whose text becomes the {@code TextContent} field
     * @return a success result whose data maps {@code "result"} to the new document id,
     *         or a 500 result with message "上传失败" when no text was produced or indexing failed
     * @throws Exception if the file service fails while extracting text
     */
    @PostMapping("/insertAssist")
    @ResponseBody
    public RetDTO add(@RequestParam(name = "title" ) String title, @RequestParam(name = "author") String author,
                      @RequestParam(name = "type") String type, @RequestParam("file") MultipartFile file) throws Exception {
        String extracted = fileService.getStringFromFile(file);
        if (extracted == null) {
            // The file service yields null when it refuses the file; fail fast
            // instead of trying to index null text.
            log.warn("No text extracted from uploaded file '{}'", file.getOriginalFilename());
            return new RetDTO(500,"上传失败");
        }
        Map<String,String> res = new HashMap<>();
        try {
            XContentBuilder content = XContentFactory.jsonBuilder().startObject()
                    .field("title",title)
                    .field("author",author)
                    .field("type",type)
                    .field("update_time",new SimpleDateFormat("yyyy-MM-dd").format(new Date()))
                    .field("TextContent",new Text(extracted))
                    .endObject();
            IndexResponse result = transportClient.prepareIndex("text","_doc").setSource(content).get();
            res.put("result",result.getId());
        } catch (Exception e){
            // Keep the generic client-facing message but record the cause;
            // "log" is the logger generated by @Slf4j.
            log.error("Failed to index uploaded document, title='{}'", title, e);
            return new RetDTO(500,"上传失败");
        }
        return RetDTO.getReturnJson(res);
    }

    /**
     * Runs a query-string search through {@link EsAssistService#searchAll(String)}.
     *
     * @param keyword search text; it is URL-decoded twice as UTF-8 before use
     *                (NOTE(review): presumably the client double-encodes it; confirm)
     * @return the search result, or a 404 result with message "没有找到该内容" when nothing matched
     */
    @GetMapping("/findAll")
    @ResponseBody
    public RetDTO searchAll(@RequestParam(name = "keyword") String keyword) throws Exception{
        String key = URLDecoder.decode(URLDecoder.decode(keyword, "UTF-8"),"UTF-8");
        SearchDTO searchDTO = esAssistService.searchAll(key);
        if (searchDTO.getTotal() == 0){
            return new RetDTO(404,"没有找到该内容");
        }
        return RetDTO.getReturnJson(searchDTO);
    }

    /**
     * Searches the {@code TextContent} field with a match query and returns each hit's
     * source, with {@code TextContent} replaced by its concatenated highlight fragments
     * when the hit has any.
     *
     * @param keyword text to match (defaults to an empty string)
     * @return a map with {@code "itemsList"} (the hit sources) and {@code "took"}
     *         (the search time in fractional seconds)
     */
    @RequestMapping("findHighlight")
    @ResponseBody
    public Map<String,Object> findHighlight(
            @RequestParam(value="keyword",defaultValue = "") String keyword
    ){
        Map<String,Object> msgMap = new HashMap<String,Object>();
        BoolQueryBuilder boolBuilder = QueryBuilders.boolQuery();
        boolBuilder.must(QueryBuilders.matchQuery("TextContent",keyword));
        SearchRequestBuilder searchRequestBuilder = this.transportClient.prepareSearch("text")
                .setTypes("_doc")
                .setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
                .setQuery(boolBuilder);
        // Highlight every field, wrapping matches in a red <span>.
        HighlightBuilder highlightBuilder = new HighlightBuilder().field("*").requireFieldMatch(false);
        highlightBuilder.preTags("<span style=\"color:red\">");
        highlightBuilder.postTags("</span>");
        searchRequestBuilder.highlighter(highlightBuilder);
        SearchResponse response = searchRequestBuilder.get();
        List<Map<String,Object>> result = new ArrayList<>();
        for(SearchHit hit:response.getHits()){
            Map<String, Object> source = hit.getSourceAsMap();
            HighlightField contentHighlight = hit.getHighlightFields().get("TextContent");
            if(contentHighlight!=null){
                // Join the fragments with a builder rather than repeated string concatenation.
                StringBuilder highlighted = new StringBuilder();
                for(Text fragment:contentHighlight.fragments()){
                    highlighted.append(fragment);
                }
                source.put("TextContent",highlighted.toString());
            }
            result.add(source);
        }
        msgMap.put("itemsList",result);
        msgMap.put("took",response.getTook().getSecondsFrac());
        return msgMap;
    }

    /**
     * Deletes the {@code _doc} document with the given id from index {@code "text"}.
     *
     * @param id id of the document to delete
     * @return a success result whose data maps {@code "result"} to the delete outcome as text
     */
    @DeleteMapping("/deleteOneAssist")
    @ResponseBody
    public RetDTO delete( @RequestParam(name = "id") String id) {
        DeleteResponse response = transportClient.prepareDelete("text","_doc",id).get();
        Map<String, Object> result = new HashMap<>();
        result.put("result",response.getResult().toString());
        return RetDTO.getReturnJson(result);
    }
}
- EsAssistService.java (业务层)
package com.location.elasticsearch.service;
import com.location.elasticsearch.domain.SearchDTO;
import com.location.elasticsearch.domain.EsAssistDTO;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Search operations against the Elasticsearch index {@code "text"}.
 */
@Service
public class EsAssistService {
    @Autowired
    private TransportClient transportClient;

    /**
     * Runs a query-string search over index {@code "text"} with
     * {@code minimum_should_match} set to 100% and dis-max enabled.
     *
     * @param keyword query-string expression to search for
     * @return the mapped hits; {@code total} is the number of hits contained in
     *         this response, not a cluster-wide count
     */
    public SearchDTO searchAll(String keyword) {
        SearchRequestBuilder request = transportClient.prepareSearch("text");

        QueryStringQueryBuilder query = new QueryStringQueryBuilder(keyword);
        query.minimumShouldMatch("100%");
        query.useDisMax(true);
        request.setQuery(query);

        SearchResponse response = request.get();

        List<EsAssistDTO> documents = new ArrayList<>();
        for (SearchHit hit : response.getHits()) {
            documents.add(toDto(hit));
        }

        SearchDTO outcome = new SearchDTO();
        outcome.setTotal(documents.size());
        outcome.setEsAssistDTOList(documents);
        return outcome;
    }

    /** Copies the id, type and the stored source fields of one hit into a DTO. */
    private EsAssistDTO toDto(SearchHit hit) {
        EsAssistDTO dto = new EsAssistDTO();
        dto.setId(hit.getId());
        dto.setType(hit.getType());

        Map<String, Object> fields = hit.getSourceAsMap();
        dto.setTitle((String) fields.get("title"));
        dto.setAuthor((String) fields.get("author"));
        dto.setUpdate_time((String) fields.get("update_time"));
        dto.setTextContent((String) fields.get("TextContent"));
        return dto;
    }
}
package com.location.elasticsearch.service;
import com.location.elasticsearch.util.TextUtil;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
/**
 * Stores uploaded files under {@code TextUtil.FILE_DIR} and extracts their text.
 */
@Service
public class FileService {

    /**
     * Saves the uploaded file under {@code TextUtil.FILE_DIR} (replacing any file with
     * the same name) and returns its text content.
     *
     * @param file uploaded file; its original filename decides the extraction method
     * @return the extracted text; the literal {@code "非法文件"} when the filename is
     *         missing or has no extension; {@code null} when the extension is rejected
     *         by {@code TextUtil.isFileAllowed}
     * @throws Exception if the file cannot be stored or the extractor fails
     */
    public String getStringFromFile(MultipartFile file) throws Exception {
        String originalName = file.getOriginalFilename();
        if (originalName == null) {
            // Treat a missing filename like a filename without an extension.
            return "非法文件";
        }
        // The filename comes from the client and may contain directory parts
        // (e.g. "..\\..\\x.txt"); keep only the last path segment so the file
        // cannot be written outside FILE_DIR.
        int lastSeparator = Math.max(originalName.lastIndexOf('/'), originalName.lastIndexOf('\\'));
        String fileName = originalName.substring(lastSeparator + 1);

        int dotPos = fileName.lastIndexOf(".");
        if (dotPos < 0) {
            return "非法文件";
        }
        String fileExt = fileName.substring(dotPos + 1).toLowerCase();
        if (!TextUtil.isFileAllowed(fileExt)) {
            return null;
        }

        String storedPath = TextUtil.FILE_DIR + fileName;
        // Files.copy does not close the source stream, so close it here.
        try (java.io.InputStream upload = file.getInputStream()) {
            Files.copy(upload, new File(storedPath).toPath(), StandardCopyOption.REPLACE_EXISTING);
        }

        switch (fileExt) {
            case "txt":
                return TextUtil.getTextFromTxt(storedPath);
            case "doc":
                return TextUtil.getTextFromDoc(storedPath);
            case "docx":
                return TextUtil.getTextFromDocx(storedPath);
            case "pdf":
                return TextUtil.getTextFromPDF(storedPath);
            case "ppt":
                return TextUtil.getTextFromPPT(storedPath);
            default:
                return "当前不支持此类型";
        }
    }
}
package com.location.elasticsearch.domain;
/**
 * Plain data holder for one indexed document: its id, type and the stored
 * title, author, update time and text content.
 */
public class EsAssistDTO {

    private String id;
    private String type;
    private String title;
    private String author;
    private String update_time;
    private String TextContent;

    /** @return the document id */
    public String getId() {
        return this.id;
    }

    /** @param id the document id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the document type */
    public String getType() {
        return this.type;
    }

    /** @param type the document type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the document title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the document title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the document author */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the document author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the update time as text */
    public String getUpdate_time() {
        return this.update_time;
    }

    /** @param update_time the update time as text */
    public void setUpdate_time(String update_time) {
        this.update_time = update_time;
    }

    /** @return the text content of the document */
    public String getTextContent() {
        return this.TextContent;
    }

    /** @param textContent the text content of the document */
    public void setTextContent(String textContent) {
        this.TextContent = textContent;
    }
}
package com.location.elasticsearch.domain;
import java.util.List;
/**
 * Result of a search: the matched documents and how many there are.
 */
public class SearchDTO {

    /** Number of documents in this result. */
    private int total;

    /** The matched documents. */
    private List<EsAssistDTO> esAssistDTOList;

    /** @return the number of documents in this result */
    public int getTotal() {
        return this.total;
    }

    /** @param total the number of documents in this result */
    public void setTotal(int total) {
        this.total = total;
    }

    /** @return the matched documents */
    public List<EsAssistDTO> getEsAssistDTOList() {
        return this.esAssistDTOList;
    }

    /** @param esAssistDTOList the matched documents */
    public void setEsAssistDTOList(List<EsAssistDTO> esAssistDTOList) {
        this.esAssistDTOList = esAssistDTOList;
    }
}
package com.location.elasticsearch.domain;
import org.springframework.http.HttpStatus;
/**
 * Generic response envelope carrying a numeric code, a message and an optional payload.
 *
 * @param <T> type of the payload
 */
public class RetDTO<T> extends BaseModel{

    private int code;
    private String msg;
    private T data;

    /**
     * Creates a response without a payload.
     *
     * @param code numeric result code
     * @param msg  result message
     */
    public RetDTO(int code, String msg){
        this(code, msg, null);
    }

    /**
     * Creates a response with a payload.
     *
     * @param code numeric result code
     * @param msg  result message
     * @param data payload
     */
    public RetDTO(int code, String msg, T data){
        this.code = code;
        this.msg = msg;
        this.data = data;
    }

    /**
     * Wraps {@code data} in a response whose code and message are taken from
     * {@code HttpStatus.OK}.
     */
    public static <E> RetDTO<E> getReturnJson(E data){
        return getReturnJson(HttpStatus.OK.name(), data);
    }

    /**
     * Wraps {@code data} in a response with the {@code HttpStatus.OK} code and a
     * caller-supplied message.
     */
    public static <E> RetDTO<E> getReturnJson(String msg, E data){
        int okCode = HttpStatus.OK.value();
        return new RetDTO<E>(okCode, msg, data);
    }

    /** @return the numeric result code */
    public int getCode() {
        return this.code;
    }

    /** @param code the numeric result code */
    public void setCode(int code) {
        this.code = code;
    }

    /** @return the result message */
    public String getMsg() {
        return this.msg;
    }

    /** @param msg the result message */
    public void setMsg(String msg) {
        this.msg = msg;
    }

    /** @return the payload, possibly {@code null} */
    public T getData() {
        return this.data;
    }

    /** @param data the payload */
    public void setData(T data) {
        this.data = data;
    }
}
package com.location.elasticsearch.util;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.*;
/**
 * Static helpers that extract plain text from txt, doc, docx, pdf and ppt files.
 */
public class TextUtil {
    /** Directory (with trailing slash) that file names are appended to by callers. */
    public static String FILE_DIR = "C:/Users/Thinkpad/Desktop/document/elasticSearch/es_note/";
    /** Lower-case file extensions accepted by {@link #isFileAllowed(String)}. */
    public static String[] TEXT_FILE_EXTD = new String[] {"txt", "doc", "docx","pdf","ppt"};

    /**
     * Tells whether a file extension is one of {@link #TEXT_FILE_EXTD}, ignoring case.
     *
     * @param ext extension without the dot; {@code null} is treated as not allowed
     * @return {@code true} if the extension is accepted
     */
    public static boolean isFileAllowed(String ext) {
        if (ext == null) {
            return false;
        }
        // Lower-case once instead of on every loop iteration.
        String lowered = ext.toLowerCase();
        for (String format : TEXT_FILE_EXTD) {
            if (lowered.equals(format)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads a whole text file and decodes it as GBK.
     *
     * @param fileName path of the file to read
     * @return the decoded text, or {@code null} if the file cannot be read or the
     *         encoding is not supported
     */
    public static String getTextFromTxt(String fileName) {
        String encoding = "gbk";
        byte[] filecontent;
        // Read until end of stream: a single read() call is not guaranteed to fill
        // the buffer, and try-with-resources closes the stream on failure too.
        try (FileInputStream in = new FileInputStream(new File(fileName));
             ByteArrayOutputStream collected = new ByteArrayOutputStream()) {
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1) {
                collected.write(chunk, 0, count);
            }
            filecontent = collected.toByteArray();
        } catch (IOException e) {
            // Report the failure rather than decoding an empty or partial buffer.
            System.err.println("Failed to read " + fileName);
            e.printStackTrace();
            return null;
        }
        try {
            return new String(filecontent, encoding);
        } catch (UnsupportedEncodingException e) {
            System.err.println("The OS does not support " + encoding);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts the text of a {@code .doc} file with {@code WordExtractor}.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String getTextFromDoc(String filePath) throws Exception{
        // Close the file stream even when extraction fails.
        try (FileInputStream fis = new FileInputStream(new File(filePath))) {
            WordExtractor extractor = new WordExtractor(fis);
            return extractor.getText();
        }
    }

    /**
     * Extracts the text of a {@code .docx} file with {@code XWPFWordExtractor}.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened or parsed
     */
    @SuppressWarnings("resource")
    public static String getTextFromDocx(String filePath) throws IOException {
        // The extractor only wraps the document, so closing the document and the
        // stream is enough, and it happens even when getText() throws.
        try (FileInputStream in = new FileInputStream(filePath);
             XWPFDocument doc = new XWPFDocument(in)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            return extractor.getText();
        }
    }

    /**
     * Extracts the text of a PDF file with {@code PDFTextStripper}.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPDF(String filePath) throws IOException{
        File input = new File(filePath);
        // Close the document even when stripping fails.
        try (PDDocument pd = PDDocument.load(input)) {
            PDFTextStripper stripper = new PDFTextStripper();
            // getText returns the String directly, so the text is no longer encoded
            // to bytes and decoded again with the platform default charset.
            return stripper.getText(pd);
        }
    }

    /**
     * Extracts the text of a {@code .ppt} file with {@code PowerPointExtractor}.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPPT( String filePath) throws IOException{
        // Close the file stream even when extraction fails.
        try (FileInputStream in = new FileInputStream(filePath)) {
            PowerPointExtractor extractor = new PowerPointExtractor(in);
            return extractor.getText();
        }
    }
}