引入 jar 依赖(在 Maven 的 pom.xml 中添加):
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.13</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.itextpdf/itext-asian -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itext-asian</artifactId>
<version>5.2.0</version>
</dependency>
工具类:
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Utility methods for extracting text from PDF documents with iText.
 */
public class PdfUtils {

    /** Logger used to report PDF read failures. */
    private static final Logger LOGGER = Logger.getLogger(PdfUtils.class.getName());

    /** PDF opened by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_FILE = "F:\\文档\\test.pdf";

    /**
     * Extracts the text of a PDF, one list entry per line.
     *
     * <p>Every page is read in order, its text is split on {@code '\n'}, and the
     * resulting lines are appended to the returned list. If an {@link IOException}
     * occurs while opening or reading the document, it is logged at
     * {@link Level#SEVERE} and the lines collected so far (possibly none) are
     * returned; the exception is not propagated.
     *
     * @param file path of the PDF to read, passed as-is to {@code PdfReader}
     * @return the extracted lines in page order; never {@code null}
     */
    public static List<String> extractTXTbyLine(String file) {
        List<String> lines = new ArrayList<String>();
        PdfReader reader = null;
        try {
            reader = new PdfReader(file);
            int pageCount = reader.getNumberOfPages();
            // Page numbering starts at 1, not 0.
            for (int page = 1; page <= pageCount; page++) {
                String pageText = PdfTextExtractor.getTextFromPage(reader, page);
                lines.addAll(Arrays.asList(pageText.split("\n")));
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Failed to extract text from PDF: " + file, ex);
        } finally {
            // Release the underlying file handle on both success and failure.
            if (reader != null) {
                reader.close();
            }
        }
        return lines;
    }

    /**
     * Demo entry point: prints every extracted line followed by the elapsed time.
     *
     * @param args optional; {@code args[0]} is the PDF path to read. When absent,
     *             the built-in default path is used.
     */
    public static void main(String[] args) {
        String file = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE;
        long startTime = System.currentTimeMillis();
        List<String> lines = extractTXTbyLine(file);
        for (String line : lines) {
            System.out.println(line);
        }
        long endTime = System.currentTimeMillis();
        System.out.println("读写所用时间为:" + (endTime - startTime) + "ms");
    }
}