在 pom.xml 中引用 pdfbox
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven project descriptor for the cicdtest artifact, packaged as a jar. -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.cicdtest</groupId>
<artifactId>cicdtest</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>cicdtest</name>
<url>http://maven.apache.org</url>
<properties>
<!-- Source files are read as UTF-8 during the build. -->
<!-- NOTE(review): no maven.compiler.source / maven.compiler.target is set here, so the Java
     language level presumably comes from the compiler plugin default; confirm and pin it. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- JUnit 3.8.1, available on the test classpath only. -->
<!-- NOTE(review): this is a very old JUnit line; confirm whether any tests still rely on it. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- Apache PDFBox 2.0.26 at the default (compile) scope; provides the org.apache.pdfbox
     classes (PDDocument, PDFParser, PDFTextStripper) used for PDF text extraction. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.26</version>
</dependency>
</dependencies>
</project>
package pdf.txt;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Extracts the text content of PDF files with Apache PDFBox and stores it as plain text.
 *
 * <p>Every method is best-effort: parsing and I/O failures are reported on the console and
 * are not propagated, so callers get an empty string (extraction) or the intended target
 * path (saving) even when the operation failed.
 */
public class PDF {

    /** PDF processed by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH =
            "D:/Project/e-Statement/estatement_pdf/2021_07_09_estatement/8000054710_est_9ec4a09254a67c1690837ef62f64f9e9.pdf";

    /** Fixed target file of {@link #save_result_to_txt(String)}. */
    private static final String DEFAULT_TXT_PATH = "D:/PDF.txt";

    private static final String PDF_EXTENSION = ".pdf";

    private static final String TXT_EXTENSION = ".txt";

    /**
     * Encoding of the generated text files. Named explicitly so the output does not depend
     * on the default charset of the machine that runs the extraction.
     */
    private static final String TXT_ENCODING = "UTF-8";

    /**
     * Extracts the text of one PDF, prints it to standard output and saves it to
     * {@code D:/PDF.txt}.
     *
     * @param args optional; {@code args[0]} is the PDF to process. Without arguments the
     *             built-in sample path is used.
     */
    public static void main(String[] args) {
        String path = DEFAULT_PDF_PATH;
        if (args != null && args.length > 0) {
            path = args[0];
        }
        PDF p = new PDF();
        String content = p.Get_PDF_Content(path);
        System.out.println(content);
        p.save_result_to_txt(content);
    }

    /**
     * Writes the trimmed {@code content} to {@code D:/PDF.txt}, replacing any existing file.
     *
     * @param content text to store
     * @return the target path {@code D:/PDF.txt}, whether or not the write succeeded
     */
    public String save_result_to_txt(String content) {
        writeText(DEFAULT_TXT_PATH, content);
        return DEFAULT_TXT_PATH;
    }

    /**
     * Extracts the text of the PDF at {@code path} and writes it next to the PDF, using the
     * same file name with a {@code .txt} extension.
     *
     * <p>Only a trailing {@code .pdf} extension (in any letter case) is replaced; a name
     * without that extension gets {@code .txt} appended. The target therefore never equals
     * the source file, so the PDF itself is never overwritten.
     *
     * @param path location of the PDF; must not be {@code null}
     * @return the path of the text file, whether or not extraction or writing succeeded
     * @throws NullPointerException if {@code path} is {@code null}
     */
    public String save_result_to_txt_with_path(String path) {
        String content = Get_PDF_Content(path);
        String filepath = toTxtPath(path);
        writeText(filepath, content);
        return filepath;
    }

    /**
     * Extracts the text of a PDF by driving {@link PDFParser} directly, with the stripper's
     * default settings.
     *
     * @param path location of the PDF
     * @return the extracted text, or an empty string if the file could not be read or parsed
     */
    public String read_PDF(String path) {
        String text = "";
        FileInputStream in = null;
        RandomAccessRead source = null;
        PDDocument document = null;
        try {
            in = new FileInputStream(new File(path));
            source = new RandomAccessBufferedFileInputStream(in);
            PDFParser parser = new PDFParser(source);
            parser.parse();
            document = parser.getPDDocument();
            text = new PDFTextStripper().getText(document);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of creation; each handle is released even when an
            // earlier step failed half-way.
            closeAndReport(document);
            closeAndReport(source);
            closeAndReport(in);
        }
        return text;
    }

    /**
     * Extracts the text of all pages of a PDF, sorted by position on the page.
     *
     * @param path location of the PDF
     * @return the extracted text, or an empty string if the file could not be loaded or parsed
     */
    public String Get_PDF_Content(String path) {
        String content = "";
        PDDocument document = null;
        try {
            document = PDDocument.load(new File(path));
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text in reading order rather than in content-stream order.
            stripper.setSortByPosition(true);
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());
            content = stripper.getText(document);
        } catch (Exception e) {
            System.out.println(e);
        } finally {
            // Close on every path, not just on success, so the file handle is not leaked.
            closeAndReport(document);
        }
        return content;
    }

    /** Derives the text-file path that belongs to {@code pdfPath}. */
    private static String toTxtPath(String pdfPath) {
        File pdf = new File(pdfPath);
        String name = pdf.getName();
        int stemLength = name.length() - PDF_EXTENSION.length();
        // regionMatches returns false for a negative offset, so short names are safe here.
        boolean hasPdfExtension =
                name.regionMatches(true, stemLength, PDF_EXTENSION, 0, PDF_EXTENSION.length());
        String txtName = hasPdfExtension
                ? name.substring(0, stemLength) + TXT_EXTENSION
                : name + TXT_EXTENSION;
        File folder = pdf.getParentFile();
        // A bare file name has no parent; the text file then goes to the working directory.
        return folder == null ? txtName : folder + "/" + txtName;
    }

    /** Writes the trimmed {@code content} to {@code filepath} as UTF-8, reporting any failure. */
    private static void writeText(String filepath, String content) {
        FileOutputStream out = null;
        try {
            // Opening the stream creates the file when missing and truncates it otherwise.
            out = new FileOutputStream(new File(filepath));
            out.write(content.trim().getBytes(TXT_ENCODING));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeAndReport(out);
        }
    }

    /** Closes {@code resource} if it is non-null, reporting instead of propagating failures. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}