使用 PDFBox、POI 等 jar 包读取文件内容。
由于文件可能会很大,一次性读取可能会造成内存溢出,所以分块读取。
txt逐行读取
pdf分页读取
doc docx分段读取
文件很大时,行、页或段的数量会很多,所以分块读取时每次只读取固定数量的单位(例如一次读 50 行、50 段或 50 页),
即设置一个分割数,用它去除文件的总行数(总段数、总页数),得到分割后的块数,以及可能存在的余数(即最后一块的大小)。
代码如下:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
/**
 * Reads large txt / pdf / doc / docx files block by block instead of all at once.
 *
 * <p>The "unit" that is counted depends on the file type: lines for {@code .txt},
 * pages for {@code .pdf}, paragraphs for {@code .doc} and {@code .docx}. A caller
 * picks a split size ({@code pSeperate}); {@link #paragraphNums(String, int)} reports
 * how many blocks of that size the file contains, and
 * {@link #paragraphText(String, int, int)} returns the text of one block.
 *
 * <p>File types are recognised by extension, ignoring case. Any other extension is
 * treated as an empty document.
 */
public class DocumentUtil {

    private static final String EXT_DOCX = ".docx";
    private static final String EXT_DOC = ".doc";
    private static final String EXT_PDF = ".pdf";
    private static final String EXT_TXT = ".txt";

    /** Opens a .docx file; the caller must close the returned document. */
    private static XWPFDocument readDocx(String fileName) throws IOException {
        return new XWPFDocument(POIXMLDocument.openPackage(fileName));
    }

    /**
     * Loads a .doc file. The file stream is closed before returning, including
     * when parsing fails, so no file handle is left open by this method.
     */
    private static HWPFDocument readDoc(String fileName) throws IOException {
        InputStream is = new FileInputStream(fileName);
        try {
            return new HWPFDocument(is);
        } finally {
            is.close();
        }
    }

    /**
     * Opens a .pdf file; the caller must close the returned document.
     * Load failures are propagated instead of being swallowed, so callers never
     * receive {@code null}.
     */
    private static PDDocument readPdf(String fileName) throws IOException {
        return PDDocument.load(new File(fileName));
    }

    /**
     * Opens a .txt file for line-by-line reading using the platform default
     * charset; the caller must close the returned reader.
     */
    private static BufferedReader readTxt(String fileName) throws IOException {
        return new BufferedReader(
                new InputStreamReader(
                        new FileInputStream(fileName)));
    }

    /** Returns true when {@code fileName} ends with {@code ext}, ignoring case. */
    private static boolean hasExtension(String fileName, String ext) {
        int offset = fileName.length() - ext.length();
        return offset >= 0 && fileName.regionMatches(true, offset, ext, 0, ext.length());
    }

    /** Rejects a null file name and a non-positive split size. */
    private static void checkArguments(String fileName, int pSeperate) {
        if (fileName == null) {
            throw new NullPointerException("fileName must not be null");
        }
        if (pSeperate <= 0) {
            throw new IllegalArgumentException("pSeperate must be positive, got " + pSeperate);
        }
    }

    /**
     * Exclusive end index of the block that starts at zero-based unit {@code first},
     * clamped to {@code total}. Computed in {@code long} so a large block number
     * cannot overflow.
     */
    private static int blockEnd(long first, int pSeperate, int total) {
        return (int) Math.min(first + pSeperate, (long) total);
    }

    /** Counts the units (paragraphs, pages or lines) in the file; 0 for unknown types. */
    private static int countUnits(String fileName) throws IOException {
        if (hasExtension(fileName, EXT_DOCX)) {
            XWPFDocument docx = readDocx(fileName);
            try {
                return docx.getParagraphs().size();
            } finally {
                docx.close();
            }
        }
        if (hasExtension(fileName, EXT_DOC)) {
            return readDoc(fileName).getRange().numParagraphs();
        }
        if (hasExtension(fileName, EXT_PDF)) {
            PDDocument pdf = readPdf(fileName);
            try {
                return pdf.getNumberOfPages();
            } finally {
                pdf.close();
            }
        }
        if (hasExtension(fileName, EXT_TXT)) {
            BufferedReader reader = readTxt(fileName);
            try {
                int lines = 0;
                while (reader.readLine() != null) {
                    lines++;
                }
                return lines;
            } finally {
                reader.close();
            }
        }
        return 0;
    }

    /**
     * Splits the file's unit count by {@code pSeperate}.
     *
     * @param fileName  path of a .txt, .pdf, .doc or .docx file
     * @param pSeperate number of units per block; must be positive
     * @return a two-element array: {@code [0]} is the remainder (size of the last,
     *         partial block, or 0 when every block is full) and {@code [1]} is the
     *         total number of blocks
     * @throws IOException              if the file cannot be opened or read
     * @throws IllegalArgumentException if {@code pSeperate} is not positive
     */
    public static int[] paragraphNums(String fileName, int pSeperate) throws IOException {
        checkArguments(fileName, pSeperate);
        int total = countUnits(fileName);
        int remainder = total % pSeperate;
        int blocks = remainder == 0 ? total / pSeperate : total / pSeperate + 1;
        return new int[] {remainder, blocks};
    }

    /**
     * Returns the text of block number {@code numb} (1-based) when the file is split
     * into blocks of {@code pSeperate} units. The units of a block are concatenated
     * exactly as the underlying library returns them; no separator is inserted
     * between txt lines or docx paragraphs.
     *
     * <p>The file is opened once; the block bounds are clamped to the actual unit
     * count, so the last block may be shorter than {@code pSeperate}.
     *
     * @param fileName  path of a .txt, .pdf, .doc or .docx file
     * @param pSeperate number of units per block; must be positive
     * @param numb      1-based block number
     * @return the block's text, or an empty string when {@code numb} is outside the
     *         range of existing blocks or the file type is not supported
     * @throws IOException              if the file cannot be opened or read
     * @throws IllegalArgumentException if {@code pSeperate} is not positive
     */
    public static String paragraphText(String fileName, int pSeperate, int numb) throws IOException {
        checkArguments(fileName, pSeperate);
        if (numb < 1) {
            return "";
        }
        // Zero-based index of the first unit in the requested block.
        long first = (long) pSeperate * (numb - 1);
        StringBuilder sb = new StringBuilder();

        if (hasExtension(fileName, EXT_DOCX)) {
            XWPFDocument docx = readDocx(fileName);
            try {
                int end = blockEnd(first, pSeperate, docx.getParagraphs().size());
                for (long i = first; i < end; i++) {
                    sb.append(docx.getParagraphs().get((int) i).getText());
                }
            } finally {
                docx.close();
            }
        } else if (hasExtension(fileName, EXT_DOC)) {
            HWPFDocument doc = readDoc(fileName);
            int end = blockEnd(first, pSeperate, doc.getRange().numParagraphs());
            for (long i = first; i < end; i++) {
                sb.append(doc.getRange().getParagraph((int) i).text());
            }
        } else if (hasExtension(fileName, EXT_PDF)) {
            PDDocument pdf = readPdf(fileName);
            try {
                int end = blockEnd(first, pSeperate, pdf.getNumberOfPages());
                if (first < end) {
                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.setSortByPosition(true);
                    // PDFTextStripper page numbers are 1-based and inclusive.
                    stripper.setStartPage((int) first + 1);
                    stripper.setEndPage(end);
                    sb.append(stripper.getText(pdf));
                }
            } finally {
                pdf.close();
            }
        } else if (hasExtension(fileName, EXT_TXT)) {
            BufferedReader reader = readTxt(fileName);
            try {
                // 1-based number of the last line that belongs to the block.
                long last = first + pSeperate;
                long lineNo = 0;
                String line;
                // Stop as soon as the block is complete instead of reading to EOF.
                while (lineNo < last && (line = reader.readLine()) != null) {
                    lineNo++;
                    if (lineNo > first) {
                        sb.append(line);
                    }
                }
            } finally {
                reader.close();
            }
        }
        return sb.toString();
    }

    /**
     * Demo entry point: prints the split information and the first block of a file.
     *
     * @param args optional; {@code args[0]} overrides the default file path
     */
    public static void main(String[] args) throws IOException {
        // Split the whole document into blocks of this many units.
        int pSeperate = 20;
        // File to read; supported types are txt, doc, docx and pdf.
        String fileName = args.length > 0 ? args[0] : "E://123.doc";
        // i[0] is the remainder and i[1] the number of blocks. For the last block
        // only "remainder" units exist (when the remainder is non-zero).
        int[] i = DocumentUtil.paragraphNums(fileName, pSeperate);
        System.out.println("i[0]" + i[0] + "i[1]" + i[1]);
        // Fetch the text of a block whose number is within the block count.
        String r = DocumentUtil.paragraphText(fileName, pSeperate, 1);
        System.out.println("r" + r);
    }
}