Elastic Search上传文件到索引库(doc,pdf,txt,ppt转string)

Elastic Search上传文件到索引库(doc,pdf,txt,ppt转string)

  • 本文通过 FileService 服务将 doc、docx、pdf、txt、ppt 等常见文档格式的内容提取为 String 类型,存储到 ES 索引库中,从而实现对文档内容的全文检索,并对命中的关键字进行高亮显示。
  • MyConfig.java (连接索引库的配置文件)
package com.location.elasticsearch.util;

import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import javax.annotation.PostConstruct;
import java.net.InetAddress;
import java.net.UnknownHostException;

/**
 * Spring configuration that exposes the Elasticsearch {@link TransportClient}
 * used by the rest of the application.
 *
 * @author Thinkpad
 * @date 2020/3/20 14:55
 */
@Configuration
public class MyConfig {

    /**
     * Sets the {@code es.set.netty.runtime.available.processors} system property
     * to {@code "false"} after this configuration bean has been constructed.
     * NOTE(review): presumably this avoids a Netty processor-count conflict with
     * another Netty user in the same JVM - confirm.
     */
    @PostConstruct
    public void init() {
        System.setProperty("es.set.netty.runtime.available.processors", "false");
    }

    /**
     * Creates a transport client for the cluster named {@code elasticsearch} and
     * registers the single node at {@code 192.168.1.72:9300} with it.
     *
     * @return the configured transport client
     * @throws UnknownHostException if the node address cannot be resolved
     */
    @Bean
    public TransportClient client() throws UnknownHostException {
        Settings clusterSettings = Settings.builder().put("cluster.name", "elasticsearch").build();
        InetAddress nodeHost = InetAddress.getByName("192.168.1.72");

        TransportClient transportClient = new PreBuiltTransportClient(clusterSettings);
        transportClient.addTransportAddress(new TransportAddress(nodeHost, 9300));
        return transportClient;
    }
}

  • EsAssistController.java (控制层)
package com.location.elasticsearch.controller;

import com.location.elasticsearch.domain.RetDTO;
import com.location.elasticsearch.domain.SearchDTO;
import com.location.elasticsearch.service.EsAssistService;
import com.location.elasticsearch.service.FileService;
import lombok.extern.slf4j.Slf4j;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;

import javax.annotation.Resource;
import java.net.URLDecoder;
import java.text.SimpleDateFormat;
import java.util.*;


/**
 * REST endpoints for uploading, fetching, searching and deleting documents in
 * the {@code text} Elasticsearch index.
 *
 * @author csz
 * @date 2020/3/20 15:06
 */
@RestController
@RequestMapping("/es/assist")
@Slf4j
public class EsAssistController {

    @Autowired
    private TransportClient transportClient;

    @Autowired
    private FileService fileService;

    @Resource
    private EsAssistService esAssistService;

    /**
     * Fetches a single document from the {@code text} index.
     *
     * @param id   document id; an empty value yields a 404 result
     * @param type mapping type passed to the get request
     * @return the document source wrapped in a {@link RetDTO}, or a 404 result
     *         when the id is empty or the document does not exist
     */
    @GetMapping("/findAssistById")
    @ResponseBody
    public RetDTO get(@RequestParam(name = "id" , defaultValue = "") String id,
                      @RequestParam(name = "type" , defaultValue = "") String type) throws Exception{
        if (id.isEmpty()){
            return new RetDTO(404,"id为空");
        }

        GetResponse response = this.transportClient.prepareGet("text",type,id).get();

        if (!response.isExists()) {
            return new RetDTO(404,"未查到");
        }
        return RetDTO.getReturnJson(response.getSource());

    }


    /**
     * Extracts the text of an uploaded file and indexes it into the
     * {@code text} index together with its metadata.
     *
     * @param title  document title
     * @param author document author
     * @param type   document type, stored as a regular field
     * @param file   uploaded file whose text is extracted by {@link FileService}
     * @return the id of the new document under the key {@code result}; a 400
     *         result when no text could be extracted; a 500 result when
     *         indexing fails
     * @throws Exception if text extraction fails
     */
    @PostMapping("/insertAssist")
    @ResponseBody
    public RetDTO add(@RequestParam(name = "title" ) String title, @RequestParam(name = "author") String author,
                      @RequestParam(name = "type") String type, @RequestParam("file") MultipartFile file) throws Exception {
        String extracted = fileService.getStringFromFile(file);
        if (extracted == null) {
            // FileService signals "nothing to index" with null; reject explicitly
            // instead of letting a null payload fail somewhere inside the builder.
            log.warn("Rejected upload '{}': no text could be extracted", file.getOriginalFilename());
            return new RetDTO(400,"文件类型不支持或内容无法解析");
        }
        // NOTE(review): FileService also returns human-readable marker strings for
        // some invalid files; those are still indexed as content here - confirm intent.

        Map<String,String> res = new HashMap<>();
        try {
            XContentBuilder content =  XContentFactory.jsonBuilder().startObject().field("title",title)
                    .field("author",author).field("type",type).field("update_time",new SimpleDateFormat("yyyy-MM-dd").format(new Date()))
                    .field("TextContent",new Text(extracted)).endObject();
            IndexResponse result = transportClient.prepareIndex("text","_doc").setSource(content).get();
            res.put("result",result.getId());
        } catch (Exception e){
            // Keep the generic client-facing message but record the real cause.
            log.error("Failed to index uploaded file '{}'", file.getOriginalFilename(), e);
            return new RetDTO(500,"上传失败");
        }
        return RetDTO.getReturnJson(res);
    }

    /**
     * Full-text search across the index.
     *
     * @param keyword search keyword; it is URL-decoded twice before use
     * @return the search result, or a 404 result when nothing matched
     */
    @GetMapping("/findAll")
    @ResponseBody
    public RetDTO searchAll(@RequestParam(name = "keyword") String keyword) throws Exception{
        // NOTE(review): the double decode presumably matches a client that
        // double-encodes the keyword; literal '%' or '+' in a keyword would be altered - confirm.
        String key = URLDecoder.decode(URLDecoder.decode(keyword, "UTF-8"),"UTF-8");
        SearchDTO searchDTO = esAssistService.searchAll(key);
        if (searchDTO.getTotal() == 0){
            return new RetDTO(404,"没有找到该内容");
        }
        return RetDTO.getReturnJson(searchDTO);
    }



    /**
     * Searches the {@code TextContent} field and returns each hit with the
     * matched terms wrapped in a red {@code <span>}.
     *
     * @param keyword text to match against {@code TextContent}
     * @return a map with {@code itemsList} (the hit sources, with
     *         {@code TextContent} replaced by the highlighted fragments when
     *         present) and {@code took} (elapsed time in fractional seconds)
     */
    @RequestMapping("findHighlight")
    @ResponseBody
    public Map<String,Object> findHighlight(
            @RequestParam(value="keyword",defaultValue = "") String keyword
    ){
        // Response payload.
        Map<String,Object> msgMap = new HashMap<String,Object>();
        // Bool query with a single mandatory (must) match clause on TextContent.
        BoolQueryBuilder boolBuilder = QueryBuilders.boolQuery();
        boolBuilder.must(QueryBuilders.matchQuery("TextContent",keyword));
        // Build the search request; the DFS_QUERY_THEN_FETCH search type is requested explicitly.
        SearchRequestBuilder searchRequestBuilder = this.transportClient.prepareSearch("text")
                .setTypes("_doc")
                .setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
                .setQuery(boolBuilder);

        // Highlight every field, wrapping matches in a red span.
        HighlightBuilder highlightBuilder = new HighlightBuilder().field("*").requireFieldMatch(false);
        highlightBuilder.preTags("<span style=\"color:red\">");
        highlightBuilder.postTags("</span>");
        searchRequestBuilder.highlighter(highlightBuilder);
        // Execute the search.
        SearchResponse response = searchRequestBuilder.get();
        // Collect the hit sources.
        List<Map<String,Object>> result = new ArrayList<>();
        for(SearchHit hit:response.getHits()){
            Map<String, Object> source = hit.getSourceAsMap();
            // Replace the stored text with the concatenated highlight fragments, if any.
            Map<String, HighlightField> highlightFields = hit.getHighlightFields();
            HighlightField contentField = highlightFields.get("TextContent");
            if(contentField!=null){
                StringBuilder highlighted = new StringBuilder();
                for(Text fragment:contentField.fragments()){
                    highlighted.append(fragment);
                }
                source.put("TextContent",highlighted.toString());
            }
            result.add(source);
        }
        msgMap.put("itemsList",result);
        msgMap.put("took",response.getTook().getSecondsFrac());
        return msgMap;
    }

    /**
     * Deletes one document from the {@code text} index.
     *
     * @param id id of the document to delete
     * @return the delete result reported by Elasticsearch under the key {@code result}
     */
    @DeleteMapping("/deleteOneAssist")
    @ResponseBody
    public RetDTO delete( @RequestParam(name = "id") String id) {
        DeleteResponse response =  transportClient.prepareDelete("text","_doc",id).get();
        Map<String, Object> result = new HashMap<>();
        result.put("result",response.getResult().toString());
        return RetDTO.getReturnJson(result);
    }
}

  • EsAssistService.java (业务层)
package com.location.elasticsearch.service;

import com.location.elasticsearch.domain.SearchDTO;
import com.location.elasticsearch.domain.EsAssistDTO;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;


/**
 * Search service that runs query-string searches against the {@code text} index.
 *
 * @author csz
 * @date 2020/3/20 15:06
 */
@Service
public class EsAssistService {

    @Autowired
    private TransportClient transportClient;

    /**
     * Runs a query-string search for the given keyword and maps every returned
     * hit to an {@link EsAssistDTO}.
     *
     * @param keyword query-string expression to search for
     * @return the mapped hits; {@code total} is the number of hits contained in
     *         this response
     */
    public SearchDTO searchAll(String keyword) {
        // Every term must match (100%), with dis-max enabled.
        QueryStringQueryBuilder query = new QueryStringQueryBuilder(keyword);
        query.minimumShouldMatch("100%");
        query.useDisMax(true);

        SearchResponse response = transportClient.prepareSearch("text").setQuery(query).get();

        List<EsAssistDTO> documents = new ArrayList<>();
        for (SearchHit hit : response.getHits()) {
            documents.add(toDto(hit));
        }

        SearchDTO result = new SearchDTO();
        result.setTotal(documents.size());
        result.setEsAssistDTOList(documents);
        return result;
    }

    /** Copies the id, type and stored source fields of one hit into a DTO. */
    private static EsAssistDTO toDto(SearchHit hit) {
        EsAssistDTO dto = new EsAssistDTO();
        dto.setId(hit.getId());
        dto.setType(hit.getType());

        Map<String, Object> source = hit.getSourceAsMap();
        dto.setTitle((String) source.get("title"));
        dto.setAuthor((String) source.get("author"));
        dto.setUpdate_time((String) source.get("update_time"));
        dto.setTextContent((String) source.get("TextContent"));
        return dto;
    }
}

  • FileService.java (文件业务层)
package com.location.elasticsearch.service;

import com.location.elasticsearch.util.TextUtil;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;

/**
 * Stores uploaded files under {@link TextUtil#FILE_DIR} and extracts their text.
 *
 * @author csz
 * @date 2020/3/20 15:06
 */
@Service
public class FileService {


    /**
     * Saves the uploaded file under {@link TextUtil#FILE_DIR} and reads its
     * content into a string, choosing the extractor by file extension.
     *
     * <p>Only the last path segment of the client-supplied file name is used,
     * so a name such as {@code ../../x.txt} cannot place the file outside
     * {@code FILE_DIR}.
     *
     * @param file the uploaded file
     * @return the extracted text; the marker {@code "非法文件"} when the name is
     *         missing or has no extension; {@code null} when the extension is
     *         not allowed by {@link TextUtil#isFileAllowed(String)}
     * @throws Exception if the file cannot be stored or its content cannot be extracted
     */
    public String getStringFromFile(MultipartFile file) throws Exception {

        // The original file name comes from the client and must not be trusted as a path.
        String fileName = stripDirectories(file.getOriginalFilename());
        if (fileName == null) {
            return "非法文件";
        }

        // Validate the file by its extension.
        int dotPos = fileName.lastIndexOf('.');
        if (dotPos < 0) {
            return "非法文件";
        }
        String fileExt = fileName.substring(dotPos + 1).toLowerCase();

        if (!TextUtil.isFileAllowed(fileExt)) {
            return null;
        }

        String storedPath = TextUtil.FILE_DIR + fileName;
        // Files.copy does not close the source stream, so close it here.
        try (java.io.InputStream upload = file.getInputStream()) {
            Files.copy(upload, new File(storedPath).toPath(), StandardCopyOption.REPLACE_EXISTING);
        }

        switch (fileExt) {
            case "txt":
                return TextUtil.getTextFromTxt(storedPath);
            case "doc":
                return TextUtil.getTextFromDoc(storedPath);
            case "docx":
                return TextUtil.getTextFromDocx(storedPath);
            case "pdf":
                return TextUtil.getTextFromPDF(storedPath);
            case "ppt":
                return TextUtil.getTextFromPPT(storedPath);
            default:
                return "当前不支持此类型";
        }
    }

    /**
     * Returns the part of the name after the last {@code /} or {@code \}, or
     * {@code null} when the name is null or nothing remains.
     */
    private static String stripDirectories(String originalName) {
        if (originalName == null) {
            return null;
        }
        int lastSeparator = Math.max(originalName.lastIndexOf('/'), originalName.lastIndexOf('\\'));
        String baseName = originalName.substring(lastSeparator + 1);
        return baseName.isEmpty() ? null : baseName;
    }

}

  • EsAssistDTO.java (实体类)
package com.location.elasticsearch.domain;


/**
 * Data holder for one document returned from the search index.
 *
 * @author csz
 * @date 2020/3/20 15:06
 */
public class EsAssistDTO {
    /** Document id. */
    private String id;
    /** Mapping type of the hit. */
    private String type;
    /** Value of the {@code title} field. */
    private String title;
    /** Value of the {@code author} field. */
    private String author;
    /** Value of the {@code update_time} field. */
    private String update_time;
    /** Value of the {@code TextContent} field. */
    private String TextContent;

    /** @return the document id */
    public String getId() {
        return this.id;
    }

    /** @param id the document id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the mapping type */
    public String getType() {
        return this.type;
    }

    /** @param type the mapping type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the author */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the update time string */
    public String getUpdate_time() {
        return this.update_time;
    }

    /** @param update_time the update time string */
    public void setUpdate_time(String update_time) {
        this.update_time = update_time;
    }

    /** @return the text content */
    public String getTextContent() {
        return this.TextContent;
    }

    /** @param textContent the text content */
    public void setTextContent(String textContent) {
        this.TextContent = textContent;
    }
}

  • SearchDTO.java (实体类)
package com.location.elasticsearch.domain;


import java.util.List;

/**
 * Search result wrapper: a hit count plus the list of matched documents.
 *
 * @author csz
 * @date 2020/3/20 15:06
 */
public class SearchDTO {
    /** Number of documents reported for this result. */
    private int total;
    /** The matched documents. */
    private List<EsAssistDTO> esAssistDTOList;

    /** @return the number of documents */
    public int getTotal() {
        return this.total;
    }

    /** @param total the number of documents */
    public void setTotal(int total) {
        this.total = total;
    }

    /** @return the matched documents */
    public List<EsAssistDTO> getEsAssistDTOList() {
        return this.esAssistDTOList;
    }

    /** @param esAssistDTOList the matched documents */
    public void setEsAssistDTOList(List<EsAssistDTO> esAssistDTOList) {
        this.esAssistDTOList = esAssistDTOList;
    }
}

  • RetDTO.java (返回json)
package com.location.elasticsearch.domain;

import org.springframework.http.HttpStatus;

/**
 * Generic response envelope carrying a status code, a message and an optional payload.
 *
 * @param <T> type of the payload
 * @author csz
 * @date 2020/3/20 15:06
 */
public class RetDTO<T> extends BaseModel{
    private int code;
    private String msg;
    private T data;

    /**
     * Creates a response without a payload.
     *
     * @param code status code
     * @param msg  message
     */
    public RetDTO(int code, String msg){
        this(code, msg, null);
    }

    /**
     * Creates a response with a payload.
     *
     * @param code status code
     * @param msg  message
     * @param data payload
     */
    public RetDTO(int code, String msg, T data){
        this.code = code;
        this.msg = msg;
        this.data = data;
    }

    /**
     * Wraps the payload in a response whose code and message are taken from
     * {@link HttpStatus#OK}.
     */
    public static <E> RetDTO<E> getReturnJson(E data){
        return getReturnJson(HttpStatus.OK.name(), data);
    }


    /**
     * Wraps the payload in a response with the {@link HttpStatus#OK} code and
     * the given message.
     */
    public static <E> RetDTO<E> getReturnJson(String msg, E data){
        return new RetDTO<>(HttpStatus.OK.value(), msg, data);
    }

    /** @return the status code */
    public int getCode() {
        return this.code;
    }

    /** @param code the status code */
    public void setCode(int code) {
        this.code = code;
    }

    /** @return the message */
    public String getMsg() {
        return this.msg;
    }

    /** @param msg the message */
    public void setMsg(String msg) {
        this.msg = msg;
    }

    /** @return the payload */
    public T getData() {
        return this.data;
    }

    /** @param data the payload */
    public void setData(T data) {
        this.data = data;
    }
}

  • TextUtil.java (文本工具类)
package com.location.elasticsearch.util;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.*;

/**
 * Static helpers that extract plain text from txt, doc, docx, pdf and ppt files.
 *
 * @author csz
 * @date 2020/3/20 15:06
 */
public class TextUtil {

    /** Directory prefix under which uploaded files are stored. */
    public static String FILE_DIR = "C:/Users/Thinkpad/Desktop/document/elasticSearch/es_note/";
    /** File extensions accepted by {@link #isFileAllowed(String)}. */
    public static String[] TEXT_FILE_EXTD = new String[] {"txt", "doc", "docx","pdf","ppt"};


    /**
     * Checks whether an extension is one of {@link #TEXT_FILE_EXTD}.
     * The comparison ignores case and does not depend on the default locale.
     *
     * @param ext extension without the leading dot; may be {@code null}
     * @return {@code true} if the extension is allowed, {@code false} otherwise
     *         (including for {@code null})
     */
    public static boolean isFileAllowed(String ext) {
        for (String format : TEXT_FILE_EXTD) {
            if (format.equalsIgnoreCase(ext)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads a txt file and decodes it as GBK.
     *
     * @param fileName path of the file to read
     * @return the decoded content, or {@code null} if the file cannot be read
     *         or the encoding is not supported
     */
    public static String getTextFromTxt(String fileName) {
        String encoding = "gbk";
        try {
            // readAllBytes reads the complete file; a single InputStream.read call
            // is allowed to return fewer bytes than requested.
            byte[] content = java.nio.file.Files.readAllBytes(new File(fileName).toPath());
            return new String(content, encoding);
        } catch (UnsupportedEncodingException e) {
            System.err.println("The OS does not support " + encoding);
            e.printStackTrace();
            return null;
        } catch (IOException e) {
            // Signal the failure instead of returning empty or zero-filled text.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Reads the text of a doc file.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String getTextFromDoc(String filePath) throws Exception{
        try (FileInputStream fis = new FileInputStream(new File(filePath));
             WordExtractor extractor = new WordExtractor(fis)) {
            return extractor.getText();
        }
    }

    /**
     * Reads the text of a docx file.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromDocx(String filePath) throws IOException {
        // The extractor is the single owner of the document; both it and the
        // stream are closed even when extraction fails.
        try (FileInputStream in = new FileInputStream(filePath);
             XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
            return extractor.getText();
        }
    }

    /**
     * Reads the text of a pdf file.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPDF(String filePath) throws IOException{
        try (PDDocument pd = PDDocument.load(new File(filePath))) {
            // getText returns the text directly, avoiding an encode/decode round
            // trip through the platform default charset.
            return new PDFTextStripper().getText(pd);
        }
    }


    /**
     * Reads the text of a ppt file.
     *
     * @param filePath path of the file to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPPT( String filePath) throws IOException{
        try (FileInputStream in = new FileInputStream(filePath);
             PowerPointExtractor extractor = new PowerPointExtractor(in)) {
            return extractor.getText();
        }
    }


}

发布了25 篇原创文章 · 获赞 4 · 访问量 5106
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 精致技术 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览