/**
 * Collects every "word number" match found in the bold text of a PDF.
 *
 * <p>Glyphs are treated as bold when their font name contains {@code "Bold"}.
 * Bold glyphs are accumulated per visual line (a line ends when the Y
 * coordinate of the next glyph differs from the previous one), and each
 * completed line is scanned with the pattern {@code ([A-Za-z]+)\s+(\d+)}.
 * The final line of the document is scanned as well when the document ends.
 *
 * @param filepath path of the PDF file to read
 * @return the full text of every regex match, in the order encountered;
 *         empty if nothing matched
 * @throws IOException if the file cannot be loaded or parsed
 */
public static List<String> getPDFContentWithBold(String filepath) throws IOException {
    List<String> boldTexts = new ArrayList<>(); // Matches found in bold text
    String tarGetsRegEx = "([A-Za-z]+)\\s+(\\d+)"; // Letters, whitespace, digits
    System.out.println("tarGetsRegEx: " + tarGetsRegEx);

    // try-with-resources guarantees the document is closed even if parsing fails.
    try (PDDocument document = PDDocument.load(new File(filepath))) {
        PDFTextStripper reader = new PDFTextStripper() {
            private final StringBuilder line = new StringBuilder(); // Bold text of the current line
            private final Pattern pattern = Pattern.compile(tarGetsRegEx);
            private float prevY = -1; // Y of the previous glyph, -1 until the first glyph is seen
            private float prevX = -1; // End X of the previous glyph, -1 until the first glyph is seen

            @Override
            protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
                for (TextPosition text : textPositions) {
                    // A change in Y means the previous line is complete.
                    if (prevY != -1 && text.getY() != prevY) {
                        flushLine();
                    }
                    // A horizontal gap larger than 5 (in the units reported by
                    // TextPosition) is treated as a word break; tune as needed.
                    if (prevX != -1 && text.getX() - prevX > 5f) {
                        line.append(' ');
                    }
                    // The font name may be null for fonts without a name entry;
                    // such glyphs are simply treated as non-bold.
                    String baseFont = text.getFont().getName();
                    if (baseFont != null && baseFont.contains("Bold")) {
                        line.append(text.getUnicode());
                    }
                    prevY = text.getY();
                    prevX = text.getEndX();
                }
            }

            @Override
            protected void endDocument(PDDocument doc) throws IOException {
                // No further glyph will trigger a flush, so scan the last line here.
                flushLine();
                super.endDocument(doc);
            }

            /** Scans the accumulated bold line for matches, then clears the buffer. */
            private void flushLine() {
                String boldTextLine = line.toString().trim();
                if (!boldTextLine.isEmpty()) {
                    Matcher matcher = pattern.matcher(boldTextLine);
                    while (matcher.find()) {
                        boldTexts.add(matcher.group());
                    }
                }
                line.setLength(0);
            }
        };
        // The writer is deliberately not closed: closing it would close System.out.
        reader.writeText(document, new OutputStreamWriter(System.out));
    }
    return boldTexts;
}
推荐的正则表达式测试工具
个人公众号