解析对象存储oss中的DOC、DOCX、PDF、PPT、PPTX、XLS、XLSX、TXT类型文件获取文本数据插入es进行精准或模糊检索

最新推荐文章于 2024-07-21 16:25:22 发布

凭君莫话封侯事.

最新推荐文章于 2024-07-21 16:25:22 发布

阅读量1k

点赞数

文章标签： elasticsearch java 大数据

本文链接：https://blog.csdn.net/weixin_47825079/article/details/130011763

版权

企业文件管理系统的用户在使用过程中往往不清楚具体的文件名称，无法精准地查找文件。因此需要根据已知的关键字，精准或模糊地获取匹配的文件列表及文件中包含关键字的关键内容，以便定位需要查找的文件。

技术点

多线程异步解析文件【多线程异步解析功能源于一次针对用户批量上传2万+份文件场景的改造】、es数据插入与删除、检索

解析文件

第一步：连接oss

maven依赖
<dependency>
  <groupId>com.aliyun.oss</groupId>
  <artifactId>aliyun-sdk-oss</artifactId>
  <version>3.14.0</version>
</dependency>

endpoint=http://oss-cn-******
accessKeyId=******
accessKeySecret=******
// Build the OSS client from the endpoint and credentials above. It is named `ossClient`
// because that is the identifier the download step (getObject) refers to.
OSS ossClient = new OSSClientBuilder().build(ENDPOINT, AK, AS);

第二步：获取inputstream

// Fetch the object stored under key `url` in BUCKET_NAME and expose its content as a stream.
// NOTE(review): nothing in this snippet closes ossObject / inputStream; close them once the text
// has been extracted (e.g. try-with-resources) so the underlying connection is released —
// confirm against the OSS SDK documentation.
OSSObject ossObject = ossClient.getObject(new GetObjectRequest(BUCKET_NAME, url));
InputStream inputStream = ossObject.getObjectContent();

第三步：文件解析

maven依赖
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlbeans</groupId>
    <artifactId>xmlbeans</artifactId>
    <version>2.6.0</version>
</dependency>

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.NumberFormat;
import java.util.List;

public class ParseOssFile {

    public static String getTextFromText(InputStream inputStream) throws IOException {
        StringBuilder builder = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
        String line;
        while ((line = reader.readLine()) != null) {
            builder.append(line);
        }
        reader.close();
        return builder.toString();
    }

    public static String getTextFromDoc(InputStream inputStream) throws Exception {
        WordExtractor wordExtractor = new WordExtractor(inputStream);
        String text = wordExtractor.getText().replace("\r\n", "").trim();
        wordExtractor.close();
        return text;
    }

    public static void main(String[] args) throws IOException {
        HWPFDocument doc = new HWPFDocument(new FileInputStream("HIVE数仓建模培训 示例代码v1.2.doc"));
        Range r = doc.getRange();// 文档范围
        for (int i = 0; i < r.numParagraphs(); i++) {
            Paragraph p = r.getParagraph(i);// 获取段落
            int numStyles = doc.getStyleSheet().numStyles();
            int styleIndex = p.getStyleIndex();
            if (numStyles > styleIndex) {
                StyleSheet style_sheet = doc.getStyleSheet();
                StyleDescription style = style_sheet.getStyleDescription(styleIndex);
                String styleName = style.getName();// 获取每个段落样式名称
                //System.out.println(style_sheet);
                //System.out.println(styleName);
                // 获取自己理想样式的段落文本信息
                String styleLoving = "标题";
                String text = p.text();// 段落文本
                //if (styleName != null && styleName.contains(styleLoving)) {
                if (styleName.contains("标题")) {
                    System.out.println(text);
                }
            }
        }

    }

    @SuppressWarnings("resource")
    public static String getTextFromDocx(InputStream inputStream) throws IOException {
        XWPFDocument doc = new XWPFDocument(inputStream);
        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
        //获取图片
        /*List<XWPFPictureData> allPictures = doc.getAllPictures();
        for (XWPFPictureData allPicture : allPictures) {
            byte[] data = allPicture.getData();
            FileOutputStream outputStream = new FileOutputStream("C:\\Users\\86187\\Desktop\\" + UUID.randomUUID().toString());
            outputStream.write(data);
            outputStream.flush();
        }*/
        String text = extractor.getText().replace("\r\n", "").replace("\n", "").trim();
        extractor.close();
        doc.close();
        return text;
    }


    public static String getTextFromPDF(InputStream inputStream) throws IOException {
        PDDocument pd = PDDocument.load(inputStream);
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(pd);
        pd.close();
        return text;
    }

    public static String getTextFromPPT(InputStream inputStream) throws IOException {
        PowerPointExtractor extractor = new PowerPointExtractor(inputStream);
        String content = extractor.getText();
        extractor.close();
        return content;
    }

    public static String getTextFromPPTX(InputStream inputStream) throws IOException {
        String resultString = null;
        StringBuilder sb = new StringBuilder();
        try {
            XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
            List<XSLFSlide> slides = xmlSlideShow.getSlides();
            for (XSLFSlide slide : slides) {
                CTSlide rawSlide = slide.getXmlObject();
                CTGroupShape gs = rawSlide.getCSld().getSpTree();
                CTShape[] shapes = gs.getSpArray();
                for (CTShape shape : shapes) {
                    CTTextBody tb = shape.getTxBody();
                    if (null == tb) {
                        continue;
                    }
                    CTTextParagraph[] paras = tb.getPArray();
                    for (CTTextParagraph textParagraph : paras) {
                        CTRegularTextRun[] textRuns = textParagraph.getRArray();
                        for (CTRegularTextRun textRun : textRuns) {
                            sb.append(textRun.getT());
                        }
                    }
                }
            }
            resultString = sb.toString();
            xmlSlideShow.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultString;
    }

    public static String getTextFromxls(InputStream inputStream) throws IOException {
        StringBuilder content = new StringBuilder();
        HSSFWorkbook workbook = new HSSFWorkbook(inputStream);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                for (int cellnum = 0; cellnum < row.getLastCellNum(); cellnum++) {
                    HSSFCell cell = row.getCell(cellnum);
                    if (cell != null) {
                        content.append(cell.getRichStringCellValue().getString() + " ");
                    }

                }
            }

        }
        workbook.close();
        return content.toString();

    }

    public static String getTextFromxlsx(InputStream inputStream) throws IOException {
        StringBuilder content = new StringBuilder();
        XSSFWorkbook workbook = new XSSFWorkbook(inputStream);
        for (int sheet = 0; sheet < workbook.getNumberOfSheets(); sheet++) {
            if (null != workbook.getSheetAt(sheet)) {
                XSSFSheet aSheet = workbook.getSheetAt(sheet);
                for (int row = 0; row <= aSheet.getLastRowNum(); row++) {
                    if (null != aSheet.getRow(row)) {
                        XSSFRow aRow = aSheet.getRow(row);
                        for (int cell = 0; cell < aRow.getLastCellNum(); cell++) {
                            if (null != aRow.getCell(cell)) {
                                XSSFCell aCell = aRow.getCell(cell);
                                if (convertCell(aCell).length() > 0) {
                                    content.append(convertCell(aCell));
                                }
                            }
                            content.append(" ");
                        }
                    }
                }
            }
        }
        workbook.close();
        return content.toString();

    }

    private static String convertCell(Cell cell) {
        NumberFormat formater = NumberFormat.getInstance();
        formater.setGroupingUsed(false);
        String cellValue = "";
        if (cell == null) {
            return cellValue;
        }

        switch (cell.getCellType()) {
            case HSSFCell.CELL_TYPE_NUMERIC:
                cellValue = formater.format(cell.getNumericCellValue());
                break;
            case HSSFCell.CELL_TYPE_STRING:
                cellValue = cell.getStringCellValue();
                break;
            case HSSFCell.CELL_TYPE_BLANK:
                cellValue = cell.getStringCellValue();
                break;
            case HSSFCell.CELL_TYPE_BOOLEAN:
                cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
                break;
            case HSSFCell.CELL_TYPE_ERROR:
                cellValue = String.valueOf(cell.getErrorCellValue());
                break;
            default:
                cellValue = "";
        }
        return cellValue.trim();
    }
}

第四步：多线程异步解析

// Shared pool for asynchronous file parsing.
// With an unbounded LinkedBlockingQueue a ThreadPoolExecutor never grows beyond corePoolSize,
// so the previous maximumPoolSize of 300 had no effect. Core and maximum are set to the same
// value to make the real parallelism (10 workers) explicit; raise both together to scale up.
private static final ThreadPoolExecutor THREAD_POOL_EXECUTOR = new ThreadPoolExecutor(
        10, 10, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>());

// Queue one parse task on the shared pool.
// NOTE(review): a new OSS client is built for every submitted task. Whether ParseFile shuts it
// down is not visible here — confirm, or consider sharing a single client across tasks.
THREAD_POOL_EXECUTOR.submit(new ParseFile(changeLog, out, new OSSClientBuilder().build(ENDPOINT, AK, AS)));

es数据插入删除

//数据插入
/**
 * Sends {@code content} as a JSON body in an HTTP PUT to {@code url}.
 *
 * <p>The response is always released and the supplied client is always closed before this
 * method returns, including when the request throws.
 *
 * @param httpClient client used for the request; closed by this method
 * @param url        target URL of the PUT request
 * @param content    JSON document to send, encoded as UTF-8
 * @return {@code true} when the response status is 2xx; otherwise {@code false} after the
 *         response body has been logged
 * @throws IOException if the request or reading the error body fails
 */
public static boolean put(CloseableHttpClient httpClient, String url, String content) throws IOException {
    HttpPut http = new HttpPut(url);
    http.setHeader("Content-Type", "application/json");
    http.setHeader("Authorization", Authorization);
    // Entity content type is aligned with the explicit Content-Type header above.
    http.setEntity(new StringEntity(content, ContentType.create("application/json", "UTF-8")));
    try (CloseableHttpResponse response = httpClient.execute(http)) {
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode >= 200 && statusCode < 300) {
            return true;
        }
        StringBuilder builder = new StringBuilder();
        // An error response may carry no body at all.
        if (response.getEntity() != null) {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    response.getEntity().getContent(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    builder.append(line);
                }
            }
        }
        LOGGER.error("PUT " + url + " failed with status " + statusCode + ": " + builder);
        return false;
    } finally {
        httpClient.close();
    }
}

//删除index 删除数据
    /**
     * Sends an HTTP DELETE to {@code url} (an index or a single document).
     *
     * <p>The client obtained from {@code getClient()} and the response are both closed before
     * this method returns, including when the request throws.
     *
     * @param url target URL of the DELETE request
     * @return {@code true} when the response status is 2xx
     * @throws IOException if the request fails
     */
    public static boolean delete(String url) throws IOException {
        HttpDelete http = new HttpDelete(url);
        http.setHeader("Content-Type", "application/json");
        http.setHeader("Authorization", Authorization);
        try (CloseableHttpClient httpClient = getClient();
             CloseableHttpResponse response = httpClient.execute(http)) {
            int statusCode = response.getStatusLine().getStatusCode();
            return statusCode >= 200 && statusCode < 300;
        }
    }

@Override
    // Applies one document change to Elasticsearch.
    // The target index is a randomly chosen BALANCES entry + INDEX_PREFIX + the normalised org
    // code; it is created with MAPPING when it does not exist yet. Operation type "D" deletes
    // the document identified by the normalised file id, any other type writes the document as
    // JSON. Every failure is logged and swallowed, so this method never throws.
    public void invoke(DocumentObject documentObject, Context context) {
        try {
            String index = BALANCES[new Random().nextInt(BALANCES.length)] + INDEX_PREFIX + replace(documentObject.getOrgCode());
            if (!EsUtils.isExist(index)) {
                if (EsUtils.createIndex(index, MAPPING)) {
                    LOGGER.info("create index {}.", index);
                }
            }
            String url = documentObject.getFileId();
            if ("D".equals(documentObject.getOperationType())) {
                if (EsUtils.delete(index + "/_doc/" + replace(url))) {
                    LOGGER.info("Data delete to es successfully. {}", url);
                } else {
                    LOGGER.error("Data delete to es failed. {}", url);
                }
            } else {
                // try-with-resources releases the client even when building the request or
                // the PUT itself throws.
                try (CloseableHttpClient client = EsUtils.getClient()) {
                    if (EsUtils.put(client, index + "/_doc/" + replace(url), JSON.toJSONString(documentObject))) {
                        LOGGER.info("Data write to es successfully. {}", url);
                    } else {
                        LOGGER.error("Data write to es failed. {}", url);
                    }
                }
            }
        } catch (Exception e) {
            // Pass the object itself: calling toString() here would throw again on a null input.
            LOGGER.error("com.pcitc.zq.es.SinkEsFunction... exception:{}", documentObject, e);
        }
    }

    // Lifecycle hook: this sink adds no teardown of its own and only delegates to the superclass.
    @Override
    public void close() throws Exception {
        super.close();
    }

    /**
     * Normalises a value for use in an index name or document id: {@code '/'}, space and
     * {@code '.'} each become {@code '_'} and the result is lower-cased.
     *
     * <p>Lower-casing uses {@link java.util.Locale#ROOT} so the same input yields the same name
     * on every machine, independent of the JVM default locale.
     *
     * @param string the raw value; must not be {@code null}
     * @return the normalised value
     */
    private String replace(String string) {
        return string
                .replace("/", "_")
                .replace(" ", "_")
                .replace(".", "_")
                .toLowerCase(java.util.Locale.ROOT);
    }

注意事项：如果出现index存储大小不停增长、而文档计数长时间不发生变化的情况，可考虑显式地设置"refresh_interval": "1s"，或在写入数据时附加?refresh或?refresh=true参数。具体问题具体分析，本项目使用的es版本为7.5.0。

检索

该样例涉及范围查询、多条件组合查询、浅分页、高亮显示。具体功能使用可参照官网获取

【Elasticsearch Guide [8.7] | Elastic】

{

"query": {

    "bool": {

      "must": [

        {

          "range": {

            "uploadTime": {

              "gte": "2022-07-05 12:49:00",

              "lte": "2022-07-30 13:54:00"

            }

          }

        },

        {

          "term": {

            "fileExt.keyword": "DOCX"

          }

        },

        {

          "bool": {

            "should": [

              {

                "match_phrase": {

                  "fileName": {

                    "query": "操作",

                    "boost": 10

                  }

                }

              },

              {

                "match_phrase": {

                  "orgCode": {

                    "query": "操作",

                    "boost": 10

                  }

                }

              },

              {

                "match_phrase": {

                  "orgName": {

                    "query": "操作",

                    "boost": 10

                  }

                }

              },

              {

                "match_phrase": {

                  "resourceId": {

                    "query": "操作",

                    "boost": 10

                  }

                }

              },

              {

                "match_phrase": {

                  "resourceName": {

                    "query": "操作",

                    "boost": 10

                  }

                }

              },

             {

                "match_phrase": {

                  "resourcePathId": {

                    "query": "操作",

                    "boost": 10

                  }

                }

              },

              {

                "match_phrase": {

                  "resourcePathName": {

                    "query": "操作",

                    "boost": 10

                  }

                }

              },

              {

                "match_phrase": {

                  "content": {

                    "query": "解决",

                    "boost": 5

                  }

                }

              },

              {

                "match": {

                  "fileName": {

                    "query": "操作",

                    "boost": 5

                  }

                }

              },

              {

                "match": {

                  "orgCode": {

                    "query": "操作",

                    "boost": 5

                  }

                }

              },

              {

                "match": {

                  "orgName": {

                    "query": "操作",

                    "boost": 5

                  }

                }

              },

              {

                "match": {

                  "resourceId": {

                    "query": "操作",

                    "boost": 5

                  }

                }

              },

              {

                "match": {

                  "resourceName": {

                    "query": "操作",

                    "boost": 5

                  }

                }

              },

              {

                "match": {

                  "resourcePathId": {

                    "query": "操作",

                    "boost": 5

                  }

                }

              },

              {

                "match": {

                  "resourcePathName": {

                    "query": "操作",

                    "boost": 5

                  }

                }

              },

              {

                "match": {

                  "content": {

                    "query": "解决",

                    "boost": 1

                  }

                }

              }

            ]

          }

        }

      ]

    }

},

"size": 10,

"from": 0,

"highlight": {

    "fields": {

      "fileName": {},

      "orgCode": {},

      "orgName": {},

      "resourceId": {},

      "resourceName": {},

      "resourcePathId": {},

      "resourcePathName": {},

      "content": {}

    },

    "pre_tags": "<font color='red'>",

    "post_tags": "</font>",

    "number_of_fragments": 1,

    "fragment_size": 100,

    "no_match_size": 100

}

}