Elasticsearch 如何处理 word pdf ？（Ingest Attachment Processor Plugin）

最新推荐文章于 2024-04-14 22:40:07 发布

h_sn999

最新推荐文章于 2024-04-14 22:40:07 发布

阅读量2.7k

点赞数 1

分类专栏： Elasticsearch java 文章标签： ES elasticsearch java

本文链接：https://blog.csdn.net/h_sn9999/article/details/103097131

版权

java 同时被 2 个专栏收录

64 篇文章 0 订阅

订阅专栏

Elasticsearch

7 篇文章 2 订阅

订阅专栏

本文主要介绍如何使用 ES 插件将 Word/PDF 等文档导入 ES，以及如何使用 Java API 写入和读取这些文档信息。

Ingest Attachment Processor Plugin 借助 Apache 的文本提取库 Tika，使 Elasticsearch 能够提取常见格式（例如 PPT、XLS 和 PDF）文件附件中的内容。

源字段必须是base64编码的二进制。如果不想增加在base64之间来回转换的开销，则可以使用CBOR格式而不是JSON，并将字段指定为字节数组而不是字符串表示形式。然后，处理器将跳过base64解码。

安装

可以使用插件管理器安装此插件：

sudo bin/elasticsearch-plugin install ingest-attachment

该插件必须安装在群集中的每个节点上，并且每个节点必须在安装后重新启动。

可从 https://artifacts.elastic.co/downloads/elasticsearch-plugins/ingest-attachment/ingest-attachment-7.4.2.zip 下载此插件以进行脱机安装（链接中的版本号 7.4.2 需与所用的 Elasticsearch 版本一致）。

删除

可以使用以下命令删除该插件：

sudo bin/elasticsearch-plugin remove ingest-attachment

在删除插件之前，必须先停止该节点。

首先创建一个名称为 attachment 的管道（pipeline），后面写入文档时需要通过 pipeline 参数引用这个名称；接着通过该管道写入一个 base64 编码的示例文档，最后读取该文档查看提取结果：

# Create an ingest pipeline named "attachment" whose single attachment
# processor reads its input from the "data" field of each document.
PUT _ingest/pipeline/attachment
{
"description" : "Extract attachment information",
"processors" : [
{
"attachment" : {
"field" : "data"
}
}
]
}

# Index document 1 through the "attachment" pipeline; "data" carries the
# base64-encoded file content.
PUT attachment_index/_doc/1?pipeline=attachment
{
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0="
}

# Read the document back to inspect what the pipeline stored.
GET attachment_index/_doc/1

JAVA代码如下：

package com.start.es;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.nio.client.HttpAsyncClientBuilder;
import org.apache.http.impl.nio.reactor.IOReactorConfig;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestClientBuilder;
import org.elasticsearch.client.RestHighLevelClient;
import sun.misc.BASE64Encoder;

/**
 * Imports a DOC/PDF/TXT document into Elasticsearch.
 *
 * <p>The document is read fully into memory, base64-encoded, and indexed into
 * {@code attachment_index} through the ingest pipeline named
 * {@code attachment}, which must already exist on the cluster.
 *
 * @author hsn
 */
public class Es7TestAttachment {

    /** Size of the chunk used when copying a stream into memory. */
    private static final int BUFFER_SIZE = 8192;

    /** Document indexed when no path is given on the command line. */
    private static final String DEFAULT_DOCUMENT_PATH = "E:\\新建 DOC 文档.doc";

    /**
     * Encodes one local document and indexes it with id {@code 9}.
     *
     * @param args optional; {@code args[0]} is the path of the document to
     *             index, falling back to {@link #DEFAULT_DOCUMENT_PATH}
     * @throws IOException if the document cannot be read or the index request fails
     */
    public static void main(String[] args) throws IOException {

        RestClientBuilder restClientBuilder = RestClient.builder(new HttpHost("127.0.0.1", 9200))
                .setRequestConfigCallback(new RestClientBuilder.RequestConfigCallback() {
                    @Override
                    public RequestConfig.Builder customizeRequestConfig(RequestConfig.Builder requestConfigBuilder) {
                        return requestConfigBuilder.setConnectTimeout(5000) // connect timeout (default is 1 second)
                                .setSocketTimeout(60000); // socket timeout (default is 30 seconds)
                    }
                })
                .setHttpClientConfigCallback(new RestClientBuilder.HttpClientConfigCallback() {
                    @Override
                    public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) {
                        return httpClientBuilder
                                .setDefaultIOReactorConfig(IOReactorConfig.custom().setIoThreadCount(1).build()); // I/O thread count
                    }
                });

        // Read and encode the document before opening the client, so that an
        // unreadable file does not leave an open client behind.
        String documentPath = (args != null && args.length > 0) ? args[0] : DEFAULT_DOCUMENT_PATH;
        String base64Content = fileDocToBase64(documentPath);

        Map<String, Object> jsonMap = new HashMap<>();
        // "data" is the field the attachment pipeline reads the payload from.
        jsonMap.put("data", base64Content);
        IndexRequest request = new IndexRequest("attachment_index").setPipeline("attachment")
                .id("9").source(jsonMap);

        // try-with-resources closes the client even when the index call throws.
        try (RestHighLevelClient client = new RestHighLevelClient(restClientBuilder)) {
            client.index(request, RequestOptions.DEFAULT);
        }
    }

    /**
     * Downloads the document at the given URL and returns its content as a
     * single-line base64 string.
     *
     * @param contentpath URL of the document; the connection is cast to
     *                    {@link HttpURLConnection}
     * @return base64 encoding of the downloaded bytes, without line breaks
     * @throws IOException if the URL is malformed, or connecting or reading fails
     */
    public static String UrlDocToBase64(String contentpath) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(contentpath).openConnection();
        conn.setDoInput(true);
        try {
            conn.connect();
            try (InputStream is = conn.getInputStream()) {
                return toBase64(is);
            }
        } finally {
            // Release the connection whether or not the download succeeded.
            conn.disconnect();
        }
    }

    /**
     * Reads a local file and returns its content as a single-line base64 string.
     *
     * @param contentpath path of the file to read
     * @return base64 encoding of the file's bytes, without line breaks
     * @throws IOException if the file cannot be opened or read
     */
    public static String fileDocToBase64(String contentpath) throws IOException {
        try (InputStream is = new FileInputStream(new File(contentpath))) {
            return toBase64(is);
        }
    }

    /**
     * Drains the stream into memory and base64-encodes the bytes.
     *
     * <p>Read failures propagate as {@link IOException} instead of being
     * swallowed. The caller remains responsible for closing the stream.
     */
    private static String toBase64(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[BUFFER_SIZE];
        int read;
        // read() returns -1 at end of stream.
        while ((read = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, read);
        }
        // The basic encoder emits no line separators, so no stripping is needed.
        return Base64.getEncoder().encodeToString(buffer.toByteArray());
    }

}

Attachment options

Name	Required	Default	Description
`field`	yes	-	The field to get the base64 encoded field from
`target_field`	no	attachment	The field that will hold the attachment information
`indexed_chars`	no	100000	The number of chars being used for extraction to prevent huge fields. Use `-1` for no limit.
`indexed_chars_field`	no	`null`	Field name from which you can overwrite the number of chars being used for extraction. See `indexed_chars`.
`properties`	no	all properties	Array of properties to select to be stored. Can be `content`, `title`, `name`, `author`, `keywords`, `date`, `content_type`, `content_length`, `language`
`ignore_missing`	no	`false`	If `true` and `field` does not exist, the processor quietly exits without modifying the document