依赖:
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>kernel</artifactId>
<version>7.2.0</version>
</dependency>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>font-asian</artifactId>
<version>7.2.0</version>
</dependency>
代码:
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Pattern;
import lombok.extern.slf4j.Slf4j;
/**
 * Utility methods for counting the characters contained in a PDF document.
 *
 * <p>Text is extracted page by page with iText's {@link PdfTextExtractor}. Characters in the
 * Unicode "Other" category ({@code \p{C}}, which covers control characters such as line breaks)
 * and ASCII spaces are removed before counting.
 */
@Slf4j
public class PdfUtils {

    /** Matches every character excluded from the count: Unicode category C and the ASCII space. */
    private static final Pattern IGNORED_CHARACTERS = Pattern.compile("[\\p{C} ]");

    /**
     * Counts the characters of the PDF stored at the given path.
     *
     * @param fileName path of the PDF file to open
     * @return total number of counted characters over all pages
     * @throws IOException if the file cannot be opened as a PDF; the failure is logged first
     */
    public static Integer countText(String fileName) throws IOException {
        try {
            PdfReader reader = new PdfReader(fileName);
            return countText(reader);
        } catch (IOException e) {
            log.error("打开pdf失败,文件名:{}", fileName, e);
            throw e;
        }
    }

    /**
     * Counts the characters of the PDF supplied as a stream.
     *
     * @param inputStream stream positioned at the start of the PDF data
     * @return total number of counted characters over all pages
     * @throws IOException if the stream cannot be opened as a PDF; the failure is logged first
     */
    public static Integer countText(InputStream inputStream) throws IOException {
        try {
            PdfReader reader = new PdfReader(inputStream);
            return countText(reader);
        } catch (IOException e) {
            log.error("使用流打开pdf失败", e);
            throw e;
        }
    }

    /**
     * Counts the characters of the PDF behind {@code reader} and closes the reader afterwards,
     * whether or not extraction succeeds.
     *
     * @param reader an open reader for the PDF; it is closed by this method
     * @return total number of counted characters (Unicode code points) over all pages
     */
    public static Integer countText(PdfReader reader) {
        // try-with-resources closes the document even when extraction fails half-way through.
        try (PdfDocument pdfDoc = new PdfDocument(reader)) {
            int characterCount = 0;
            int noOfPages = pdfDoc.getNumberOfPages();
            log.debug("Extracted content of PDF---- ");
            // iText page numbers are 1-based.
            for (int i = 1; i <= noOfPages; i++) {
                String rawText = PdfTextExtractor.getTextFromPage(pdfDoc.getPage(i));
                String contentOfPage = IGNORED_CHARACTERS.matcher(rawText).replaceAll("");
                // Count code points rather than UTF-16 units so that supplementary characters
                // (e.g. rare CJK ideographs, emoji) are counted once instead of twice.
                int pageCharacters = contentOfPage.codePointCount(0, contentOfPage.length());
                characterCount += pageCharacters;
                log.debug("{}", contentOfPage);
                log.debug("这页几个字:{}", pageCharacters);
            }
            log.info("文档总共{}个字", characterCount);
            return characterCount;
        } finally {
            // Still needed when the PdfDocument constructor itself throws, because then the
            // document never took ownership of the reader.
            closeQuietly(reader);
        }
    }

    /** Closes the reader on a best-effort basis, logging (not propagating) any failure. */
    private static void closeQuietly(PdfReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            log.warn("关闭PdfReader失败", e);
        }
    }

    /**
     * Manual smoke test: counts the characters of the PDF given as the first argument, or of a
     * default local test file when no argument is supplied.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to inspect
     * @throws IOException if the PDF cannot be opened
     */
    public static void main(String[] args) throws IOException {
        String testPdf = args.length > 0 ? args[0] : "C:\\Users\\xxx\\Downloads\\test.pdf";
        countText(testPdf);
    }
}