Apache Tika
用于提取文件(文档)中的文本内容
Maven 依赖:
<!-- 提取内容 -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.26</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.26</version>
</dependency>
<!-- lombok -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.30</version>
</dependency>
工具类:
package com.ly.test.utils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
import org.apache.tika.sax.BodyContentHandler;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
/**
 * Utility for extracting text content from files and streams with Apache Tika.
 *
 * <p>Parsing failures are logged and surfaced to the caller as a {@code null}
 * (or possibly empty) result instead of an exception.
 */
@Slf4j
public class TikaUtils {

    /**
     * Extracts the text content of a file.
     *
     * <p>The file is first parsed with Tika's auto-detecting parser. When that
     * yields {@code null} or an empty string, the file is read as plain text
     * using a detected character encoding instead.
     *
     * @param f the file to read
     * @return the extracted text; may be {@code null} (or empty) when extraction
     *         fails, in which case the error has been logged
     */
    public static String parseContent(File f) {
        String content = null;
        try {
            // Close the file stream as soon as parsing is done, even if it throws.
            try (InputStream stream = FileUtils.openInputStream(f)) {
                content = parseContent(stream);
            }
            if (content == null || content.isEmpty()) {
                // Fallback: treat the file as plain text.
                content = parseTxt(f);
            }
        } catch (Exception e) {
            log.error("tika parse error", e);
        }
        return content;
    }

    /**
     * Reads a file as plain text using the charset reported by
     * {@link UniversalEncodingDetector}.
     *
     * @param file the file to read
     * @return the file content decoded with the detected charset, or
     *         {@code null} when no charset could be detected
     * @throws IOException if the file cannot be opened or read
     */
    private static String parseTxt(File file) throws IOException {
        Charset charset;
        // Both streams are closed on every path, including when detection throws.
        try (InputStream raw = FileUtils.openInputStream(file);
             InputStream buffered = new BufferedInputStream(raw)) {
            EncodingDetector detector = new UniversalEncodingDetector();
            charset = detector.detect(buffered, new Metadata());
        }
        if (charset == null) {
            return null;
        }
        return FileUtils.readFileToString(file, charset);
    }

    /**
     * Extracts the text content of a stream with Tika's auto-detecting parser.
     *
     * <p>This method does not close the stream itself; the caller remains
     * responsible for closing it.
     *
     * @param stream the document stream to parse
     * @return the extracted body text, or {@code null} when parsing fails
     *         (the error is logged)
     */
    public static String parseContent(InputStream stream) {
        String content = null;
        try {
            AutoDetectParser parser = new AutoDetectParser();
            // Integer.MAX_VALUE as write limit: do not truncate large documents.
            BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
            Metadata metadata = new Metadata();
            parser.parse(stream, handler, metadata);
            content = handler.toString();
        } catch (Exception e) {
            log.error("tika parse error", e);
        }
        return content;
    }
}
使用:
/**
 * Example endpoint: extracts the text content of an uploaded file with
 * {@code TikaUtils}, prints it to standard output and returns it.
 *
 * @param file the uploaded file
 * @return the extracted text as returned by {@code TikaUtils.parseContent}
 * @throws Exception if the upload stream cannot be opened or closed
 */
@PostMapping("/insert")
public String insert(MultipartFile file) throws Exception {
    // The upload stream must be closed by the caller; try-with-resources
    // guarantees that once Tika has consumed it.
    try (java.io.InputStream in = file.getInputStream()) {
        String content = TikaUtils.parseContent(in);
        System.out.println(content);
        return content;
    }
}
测试md文件: