用Tika读取文件(不需要考虑文件格式)

使用 Apache Tika 包读取文件时不需要考虑文件格式:下面的工具类把文件交给 Tika 解析,得到文本内容和元数据。

package com.geni_sage.gdme.core.dataReader;

import java.io.*;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;

/**
 * Reads a file through Apache Tika and exposes the extracted text and metadata.
 *
 * <p>The file is parsed once, in the constructor; the accessors only return
 * (or format) the results captured at that point.
 */
public class TikaManager {

    /** Matches the line feeds removed by {@link #getContent()}; compiled once and reused. */
    private static final Pattern LINE_FEED = Pattern.compile("\n");

    /** Metadata object handed to Tika while parsing and returned by {@link #getMetadata()}. */
    private final Metadata metadata;

    /** Text returned by Tika's {@code parseToString} for the file. */
    private final String content;

    /** Whether {@link #getContent()} strips line feeds from the text. */
    private final boolean stripLineFeeds;

    /**
     * Parses {@code file} with Tika and stores the resulting text and metadata.
     *
     * @param file           the file to parse
     * @param isReplaceBlank {@code true} to have {@link #getContent()} remove
     *                       every {@code '\n'} from the extracted text
     * @throws Exception if the stream cannot be opened or Tika fails to parse it
     */
    public TikaManager(File file, boolean isReplaceBlank) throws Exception {
        this.stripLineFeeds = isReplaceBlank;
        this.metadata = new Metadata();

        String parsed;
        TikaInputStream stream = TikaInputStream.get(file, metadata);
        try {
            Tika tika = new Tika();
            // Raise the facade's string-length limit to the maximum int value
            // (presumably so that long documents are not cut short).
            tika.setMaxStringLength(Integer.MAX_VALUE);
            parsed = tika.parseToString(stream, metadata);
        } finally {
            // Always release the stream, even when parsing throws.
            stream.close();
        }
        this.content = parsed;
    }

    /**
     * Returns the extracted text.
     *
     * @return the text as parsed, or, when line-feed stripping was requested,
     *         the text with every {@code '\n'} removed (an empty string if the
     *         parsed text is {@code null})
     */
    public String getContent() {
        return stripLineFeeds ? removeLineFeeds(content) : content;
    }

    /**
     * Returns the metadata object that was populated while parsing.
     *
     * @return the metadata instance held by this manager (not a copy)
     */
    public Metadata getMetadata() {
        return metadata;
    }

    /**
     * Returns the metadata formatted as text, one {@code name: value} line per
     * metadata name, sorted by name.
     *
     * @return the formatted metadata
     * @throws Exception declared for backward compatibility with existing callers
     */
    public String getMetadataString() throws Exception {
        return metadataToString();
    }

    /** Builds the sorted {@code name: value\n} listing of the metadata. */
    private String metadataToString() {
        String[] names = metadata.names();
        Arrays.sort(names);

        StringBuilder out = new StringBuilder();
        for (String name : names) {
            // NOTE(review): get(name) yields a single value per name; if a name
            // can carry several values, only one is listed here - confirm.
            out.append(name).append(": ").append(metadata.get(name)).append("\n");
        }
        return out.toString();
    }

    /** Removes every {@code '\n'} from {@code str}; a {@code null} input yields {@code ""}. */
    private String removeLineFeeds(String str) {
        if (str == null) {
            return "";
        }
        return LINE_FEED.matcher(str).replaceAll("");
    }
}


 

转载于:https://www.cnblogs.com/yuwenfeng/archive/2013/05/15/3080075.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值