案例:有个小需求,需要批量处理大量PDF文件:读取每个文件的封面信息,并汇总生成一个新的Excel用于统计。PDF文件封面见图1,生成的Excel见图2。
解决:
1、获取本地路径下的所有待处理的pdf文件
/**
 * Collects every PDF file located at or below the given path.
 *
 * <p>If {@code pathName} denotes a single file, the result contains that file when it has a
 * {@code .pdf} extension. If it denotes a directory, the directory tree is walked and every
 * regular file with a {@code .pdf} extension (compared case-insensitively) is returned.
 * Directories are never returned, even when their name ends with {@code .pdf}.
 *
 * @param pathName path of a PDF file or of a directory to scan
 * @return the PDF files found; empty when the path does not exist or contains no PDF
 */
private static LinkedList<File> getPDFList(String pathName) {
    LinkedList<File> pdfList = new LinkedList<>();
    File root = new File(pathName);
    if (!root.exists()) {
        return pdfList;
    }

    // listFiles() yields null for anything that is not a readable directory.
    File[] topLevel = root.listFiles();
    if (topLevel == null) {
        if (!root.isDirectory() && hasPdfExtension(root)) {
            pdfList.add(root);
        }
        return pdfList;
    }

    // Queue of entries still to be examined; sub-directories are appended as they are found.
    LinkedList<File> pending = new LinkedList<>(Arrays.asList(topLevel));
    while (!pending.isEmpty()) {
        File current = pending.removeFirst();
        if (!current.isDirectory()) {
            if (hasPdfExtension(current)) {
                pdfList.add(current);
            }
            continue;
        }
        File[] children = current.listFiles();
        if (children == null) {
            // Unreadable directory: nothing to collect from it.
            continue;
        }
        for (File child : children) {
            if (child.isDirectory()) {
                pending.add(child);
            } else if (hasPdfExtension(child)) {
                pdfList.add(child);
            }
        }
    }
    return pdfList;
}

/** Returns whether the file name ends with ".pdf", ignoring letter case. */
private static boolean hasPdfExtension(File file) {
    String name = file.getName();
    String extension = ".pdf";
    // regionMatches returns false (instead of throwing) when the name is shorter than the suffix.
    return name.regionMatches(true, name.length() - extension.length(), extension, 0, extension.length());
}
2、遍历每个PDF文件,读取封面(第一页)的文字,提取所需字段,封装为Map对象存入列表
/**
 * Extracts the cover-page fields of each PDF into one map per file.
 *
 * <p>For every file, the text of page 1 is extracted, stripped of whitespace via
 * {@code replaceBlank}, and the wanted fields are cut out with {@code getString}. Files that
 * cannot be read are reported on standard error and skipped; the sequence number only advances
 * for files that were processed successfully.
 *
 * @param pdfList PDF files to process
 * @return one map per successfully processed file, keyed by column name
 */
private static List<Map<String, String>> getTableList(LinkedList<File> pdfList) {
    List<Map<String, String>> mapList = new ArrayList<>();
    int fileNum = 1;
    for (File f : pdfList) {
        try (PDDocument document = PDDocument.load(f)) {
            // Restrict extraction to the cover page (page numbers are 1-based) instead of
            // copying the page into a second PDDocument that would also have to be closed.
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(1);
            stripper.setEndPage(1);
            String content = replaceBlank(stripper.getText(document));

            Map<String, String> map = new HashMap<>();
            map.put("序号", String.valueOf(fileNum));
            map.put("文件名称", f.getName());
            map.put("报告编号", getString(content, "报告编号", "委托方"));
            map.put("委托方", getString(content, "委托方", "地址"));
            // TODO extract the remaining required fields
            mapList.add(map);
            fileNum++;
        } catch (IOException e) {
            // Best effort: keep going with the remaining files, but say which one failed.
            System.err.println("Failed to read PDF: " + f.getAbsolutePath());
            e.printStackTrace();
        }
    }
    return mapList;
}
其中,replaceBlank 用于去除识别出的文字中的空格、\t、\r、\n 等空白字符
/** Matches any run of whitespace (spaces, tabs, carriage returns, line feeds, ...). */
private static final Pattern BLANK_PATTERN = Pattern.compile("\\s+");

/**
 * Removes every whitespace character from the given text.
 *
 * @param str text to clean; may be {@code null}
 * @return {@code str} without whitespace, or an empty string when {@code str} is {@code null}
 */
private static String replaceBlank(String str) {
    if (str == null) {
        return "";
    }
    // The pattern is compiled once; "\\s+" already covers \t, \r and \n.
    return BLANK_PATTERN.matcher(str).replaceAll("");
}
截取字符串
/**
 * Extracts the text between two markers.
 *
 * <p>The value starts one character after the first occurrence of {@code first} (the skipped
 * character is presumably the separator that follows the label, e.g. a colon) and ends right
 * before the last occurrence of {@code last}.
 *
 * @param content text to search
 * @param first   marker preceding the wanted value
 * @param last    marker following the wanted value
 * @return the text between the markers, or an empty string when either marker is missing or the
 *         end marker does not come after the start of the value
 */
private static String getString(String content, String first, String last) {
    if (content == null || first == null || last == null) {
        return "";
    }
    int startIndex = content.indexOf(first);
    int endIndex = content.lastIndexOf(last);
    if (startIndex < 0 || endIndex < 0) {
        // A marker is absent from this document; report "no value" instead of throwing.
        return "";
    }
    int valueStart = startIndex + first.length() + 1;
    if (valueStart > endIndex) {
        // Markers are adjacent or out of order, so there is nothing between them.
        return "";
    }
    return content.substring(valueStart, endIndex);
}
3、生成Excel表格
/**
 * Builds a workbook with a single sheet: a header row followed by one row per record.
 *
 * <p>Each data cell is filled with the record's value for the corresponding column name; a
 * record without an entry for a column passes {@code null} to the cell.
 *
 * @param mapList records keyed by column name, written in list order starting at row 1
 * @return the populated workbook; the caller is responsible for writing and closing it
 */
private static XSSFWorkbook getWorkBook(List<Map<String, String>> mapList) {
    final List<String> headers = Arrays.asList("序号", "文件名称", "报告编号", "委托方", "地址", "样品名称", "制造厂商", "型号/规格", "出厂编号", "核验员", "检测员", "物品接收日期", "签发日期");

    XSSFWorkbook book = new XSSFWorkbook();
    XSSFSheet sheet = book.createSheet();

    // Row 0 carries the column titles.
    XSSFRow titleRow = sheet.createRow(0);
    int titleColumn = 0;
    for (String header : headers) {
        XSSFCell titleCell = titleRow.createCell(titleColumn);
        titleCell.setCellType(CellType.STRING);
        titleCell.setCellValue(header);
        titleColumn++;
    }

    // Data rows follow directly below the header.
    int rowIndex = 0;
    for (Map<String, String> record : mapList) {
        rowIndex++;
        XSSFRow dataRow = sheet.createRow(rowIndex);
        int dataColumn = 0;
        for (String header : headers) {
            XSSFCell dataCell = dataRow.createCell(dataColumn);
            dataCell.setCellType(CellType.STRING);
            dataCell.setCellValue(record.get(header));
            dataColumn++;
        }
    }
    return book;
}
4、执行
/**
 * Entry point: scans {@code D:\pdf\} for PDF files, extracts their cover-page fields and writes
 * the summary to {@code D:\pdf\excel.xlsx}.
 *
 * @param args unused
 * @throws IOException if the workbook cannot be written or closed
 */
public static void main(String[] args) throws IOException {
    LinkedList<File> pdfList = getPDFList("D:\\pdf\\");
    List<Map<String, String>> mapList = getTableList(pdfList);
    // try-with-resources closes the stream first and then the workbook, even if write() fails.
    try (XSSFWorkbook workbook = getWorkBook(mapList);
         FileOutputStream outputStream = new FileOutputStream("D:\\pdf\\excel.xlsx")) {
        workbook.write(outputStream);
    }
}