获取txt pdf doc docx类型文件内容

最新推荐文章于 2024-05-06 09:01:24 发布

soj_1998

最新推荐文章于 2024-05-06 09:01:24 发布

阅读量1.5k

点赞数

本文链接：https://blog.csdn.net/soj_1998/article/details/52836500

版权

用pdfbox poi等jar包读取文件内容

由于文件可能会很大，一次性读取可能会造成内存溢出，所以分块读取。

txt逐行读取

pdf分页读取

doc docx分段读取

文件很大时，分行、分页或分段会有较大数量的行、段等，所以在分块读取时，一次读50段、50行等，

即设置一个分割数，对文件的行数（或页数、段数）进行分割，得到分割后的块数和可能存在的余数。

代码如下：

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
 * Reads the text of .txt, .pdf, .doc and .docx files chunk by chunk.
 *
 * <p>A file is measured in units: lines for .txt, pages for .pdf and
 * paragraphs for .doc/.docx. The caller picks how many units make up one
 * chunk ({@code pSeperate}) and then fetches chunks by their 1-based number,
 * so the text of a large file never has to be returned as one string.
 *
 * <p>The file type is decided by the (case-sensitive) file name suffix. Any
 * other suffix is treated as a file with zero units.
 */
public class DocumentUtil {

    /** Opens a .docx file. The caller must close the returned document. */
    private static XWPFDocument readDocx(String fileName) throws IOException {
        return new XWPFDocument(POIXMLDocument.openPackage(fileName));
    }

    /**
     * Loads a .doc file into memory.
     *
     * <p>The input stream is closed before returning: the document built
     * from a stream is fully read by the constructor, so nothing else keeps
     * the file handle alive afterwards.
     */
    private static HWPFDocument readDoc(String fileName) throws IOException {
        try (InputStream is = new FileInputStream(fileName)) {
            return new HWPFDocument(is);
        }
    }

    /**
     * Opens a .pdf file. The caller must close the returned document.
     *
     * @throws IOException if the file cannot be read or parsed; the failure
     *         is propagated instead of being turned into a null result
     */
    private static PDDocument readPdf(String fileName) throws IOException {
        return PDDocument.load(new File(fileName));
    }

    /**
     * Opens a .txt file for line-by-line reading. The caller must close the
     * returned reader. Bytes are decoded with the platform default charset.
     */
    private static BufferedReader readTxt(String fileName) throws IOException {
        return new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName)));
    }

    /** Counts the units (paragraphs, pages or lines) in the file; 0 for unknown suffixes. */
    private static int countUnits(String fileName) throws IOException {
        if (fileName.endsWith(".docx")) {
            XWPFDocument docx = readDocx(fileName);
            try {
                return docx.getParagraphs().size();
            } finally {
                docx.close();
            }
        }
        if (fileName.endsWith(".doc")) {
            return readDoc(fileName).getRange().numParagraphs();
        }
        if (fileName.endsWith(".pdf")) {
            try (PDDocument pdf = readPdf(fileName)) {
                return pdf.getNumberOfPages();
            }
        }
        if (fileName.endsWith(".txt")) {
            try (BufferedReader reader = readTxt(fileName)) {
                int lines = 0;
                while (reader.readLine() != null) {
                    lines++;
                }
                return lines;
            }
        }
        return 0;
    }

    /**
     * Computes how the file splits into chunks of {@code pSeperate} units.
     *
     * @param fileName  path of a .txt, .pdf, .doc or .docx file
     * @param pSeperate number of units per chunk; must be positive
     * @return a two-element array: index 0 is the remainder (size of the last
     *         chunk when it is not a full one, otherwise 0) and index 1 is
     *         the total number of chunks
     * @throws IllegalArgumentException if {@code pSeperate} is not positive
     * @throws IOException              if the file cannot be read
     */
    public static int[] paragraphNums(String fileName, int pSeperate) throws IOException {
        if (pSeperate <= 0) {
            throw new IllegalArgumentException("pSeperate must be positive: " + pSeperate);
        }
        int total = countUnits(fileName);
        int remainder = total % pSeperate;
        int chunks = remainder == 0 ? total / pSeperate : total / pSeperate + 1;
        return new int[] {remainder, chunks};
    }

    /**
     * Returns the text of one chunk of the file.
     *
     * <p>For .txt files the lines of the chunk are concatenated without line
     * separators. For .doc/.docx the paragraph texts are concatenated as
     * returned by the library.
     *
     * @param fileName  path of a .txt, .pdf, .doc or .docx file
     * @param pSeperate number of units per chunk; must be positive
     * @param numb      1-based chunk number
     * @return the chunk's text, or an empty string when {@code numb} is
     *         outside {@code 1..chunkCount} or the suffix is not supported
     * @throws IllegalArgumentException if {@code pSeperate} is not positive
     * @throws IOException              if the file cannot be read
     */
    public static String paragraphText(String fileName, int pSeperate, int numb) throws IOException {
        int[] split = paragraphNums(fileName, pSeperate);
        int remainder = split[0];
        int chunks = split[1];

        // An out-of-range chunk number has no content; answering with an
        // empty string avoids indexing past the end of the document.
        if (numb < 1 || numb > chunks) {
            return "";
        }

        // Only the last chunk can be short, and only when there is a remainder.
        int size = (numb == chunks && remainder > 0) ? remainder : pSeperate;
        // Zero-based index of the first unit belonging to this chunk.
        int first = pSeperate * (numb - 1);

        StringBuilder sb = new StringBuilder();
        if (fileName.endsWith(".docx")) {
            XWPFDocument docx = readDocx(fileName);
            try {
                for (int i = 0; i < size; i++) {
                    sb.append(docx.getParagraphs().get(first + i).getText());
                }
            } finally {
                docx.close();
            }
        } else if (fileName.endsWith(".doc")) {
            HWPFDocument doc = readDoc(fileName);
            for (int i = 0; i < size; i++) {
                sb.append(doc.getRange().getParagraph(first + i).text());
            }
        } else if (fileName.endsWith(".pdf")) {
            PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setSortByPosition(true);
            // PDF page numbers are 1-based and the end page is inclusive.
            textStripper.setStartPage(first + 1);
            textStripper.setEndPage(first + size);
            try (PDDocument pdf = readPdf(fileName)) {
                sb.append(textStripper.getText(pdf));
            }
        } else if (fileName.endsWith(".txt")) {
            try (BufferedReader reader = readTxt(fileName)) {
                int lineNo = 0;
                String line;
                while ((line = reader.readLine()) != null) {
                    lineNo++;
                    if (lineNo > first + size) {
                        // Past the requested chunk: no need to read the rest.
                        break;
                    }
                    if (lineNo > first) {
                        sb.append(line);
                    }
                }
            }
        }
        return sb.toString();
    }

    /** Small demo: prints the split of a sample file and the text of its first chunk. */
    public static void main(String[] args) throws IOException {
        // Number of units (lines/pages/paragraphs) per chunk.
        int pSeperate=20;

        // File to read; supported types are txt, doc, docx and pdf.
        String fileName="E://123.doc";

        // i[0] is the remainder (size of a short last chunk), i[1] the chunk count.
        int i[]=DocumentUtil.paragraphNums(fileName,pSeperate);
        System.out.println("i[0]"+i[0]+"i[1]"+i[1]);

        // Fetch the text of a chunk whose number is within the chunk count.
        String r=DocumentUtil.paragraphText(fileName,pSeperate, 1);
        System.out.println("r"+r);
    }
}

soj_1998

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
获取txt pdf doc docx类型文件内容

用pdfbox poi等jar包读取文件内容由于文件可能会很大，一次性读取可能会造成内存溢出，所以分块读取。txt逐行读取pdf分页读取doc docx分段读取文件很大时，分行、分页或分段会有较大数量的行、段等，所以在分块读取时，一次读50段、50行等，即设置一个分割数，对文件行数进行分割，得到分割后的块数和可能存在余数。代码如下：import java
复制链接

扫一扫