import java.io.*;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import com.github.junrar.Archive;
import com.github.junrar.rarfile.FileHeader;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.Tika;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.springframework.stereotype.Service;
import org.xml.sax.SAXException;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.zip.ZipFile;
/**
 * Helpers for unpacking RAR archives (via junrar) and for extracting the text
 * of the documents stored in a ZIP archive (via Apache Tika).
 */
public class RARTool {

    /**
     * Extracts every file entry of a RAR archive below a target directory.
     *
     * <p>Directory entries are skipped; the directories that file entries need
     * are created on demand. Entry names are resolved relative to
     * {@code targetPath}, so the path works with or without a trailing
     * separator. An empty {@code targetPath} means the current directory.
     *
     * @param RarPath    path of the RAR archive to read
     * @param targetPath directory the entries are written under
     * @throws IllegalArgumentException if {@code targetPath} is {@code null}
     * @throws IOException if a directory cannot be created, a file cannot be
     *                     written, or an entry would be written outside
     *                     {@code targetPath}
     * @throws Exception   any other failure reported by junrar while opening or
     *                     extracting the archive
     */
    public void unrar(String RarPath, String targetPath) throws Exception {
        if (targetPath == null) {
            throw new IllegalArgumentException("targetPath must not be null");
        }
        // new File("") would resolve children against the file-system root, so
        // map the empty path to the current directory explicitly.
        File targetDir = new File(targetPath.isEmpty() ? "." : targetPath);
        String root = targetDir.getCanonicalPath();
        String rootPrefix = root.endsWith(File.separator) ? root : root + File.separator;

        try (Archive archive = new Archive(new File(RarPath))) {
            for (FileHeader f = archive.nextFileHeader(); f != null; f = archive.nextFileHeader()) {
                // Directories are created implicitly when their files are written.
                if (f.isDirectory()) {
                    continue;
                }

                // Pick the Unicode name when present so non-ASCII names are not garbled.
                String entryName = f.isUnicode() ? f.getFileNameW() : f.getFileNameString();

                // Entry names use '\' as separator; convert to the platform separator.
                // Plain char replacement avoids regex replacement-string escaping,
                // which breaks when File.separator is itself a backslash.
                String relativePath = entryName
                        .replace('\\', File.separatorChar)
                        .replace('/', File.separatorChar);

                File destination = new File(targetDir, relativePath);

                // Refuse entries such as "..\..\x" that resolve outside the target directory.
                if (!destination.getCanonicalPath().startsWith(rootPrefix)) {
                    throw new IOException("Archive entry is outside of the target directory: " + entryName);
                }

                // mkdirs() creates all missing ancestors, not just the last level.
                File parent = destination.getParentFile();
                if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                    throw new IOException("Could not create directory: " + parent);
                }

                // One stream per entry, closed as soon as that entry has been written.
                try (OutputStream outputStream = new FileOutputStream(destination)) {
                    archive.extractFile(f, outputStream);
                }
            }
        }
    }

    /**
     * Extracts the text of every file entry of a ZIP archive with Tika.
     *
     * <p>The charset reported by Tika's {@link AutoDetectReader} for the archive
     * file is used to decode the entry names. Progress and results are printed
     * to standard output. If reading or parsing fails, the stack trace is
     * printed and the texts collected so far are returned.
     *
     * @param sfile path of the ZIP archive
     * @return the extracted text of each file entry, in archive order; never {@code null}
     */
    public List<String> getZipText(String sfile) {
        List<String> tempString = new ArrayList<String>();
        List<String> entryNames = new ArrayList<>();
        File file = new File(sfile);

        try {
            // Let Tika guess the charset; it is used below for the zip entry names.
            java.nio.charset.Charset charset;
            try (InputStream raw = new FileInputStream(file);
                 AutoDetectReader dr = new AutoDetectReader(raw)) {
                charset = dr.getCharset();
            }
            System.out.println("********charset********:" + charset.name());

            Parser parser = new AutoDetectParser();
            try (ZipInputStream zip = new ZipInputStream(
                    new BufferedInputStream(new FileInputStream(file)), charset)) {
                for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
                    if (entry.isDirectory()) {
                        System.out.println("****entry=" + entry.getName() + " " + entry.getSize());
                        continue;
                    }
                    System.out.println("####entry=" + entry.getName() + " " + entry.getSize());
                    entryNames.add(entry.getName());

                    // Fresh handler and metadata per entry, otherwise every result
                    // would also contain the text of all previous entries.
                    BodyContentHandler textHandler = new BodyContentHandler();
                    // While positioned on an entry, the zip stream yields that
                    // entry's bytes; parse those rather than the archive file itself.
                    parser.parse(nonClosing(zip), textHandler, new Metadata(), new ParseContext());
                    tempString.add(textHandler.toString());
                }
            }

            for (String name : entryNames) {
                System.out.println("$$$$$$$:" + name + "&&&&");
            }
            StringBuilder combined = new StringBuilder();
            for (String text : tempString) {
                System.out.println("Apache Tika - Converted input string : " + text);
                combined.append(text);
                System.out.println("Final text from all the three files " + combined.toString());
            }
        } catch (IOException | SAXException | TikaException e) {
            // Best effort: report the failure and return what was extracted so far.
            e.printStackTrace();
        }
        return tempString;
    }

    /**
     * Wraps a stream so that {@code close()} is ignored, keeping the shared zip
     * stream open for the remaining entries should a consumer try to close it.
     */
    private static InputStream nonClosing(InputStream in) {
        return new FilterInputStream(in) {
            @Override
            public void close() {
                // Intentionally empty: the owner of the wrapped stream closes it.
            }
        };
    }
}