第一种直接读取pdf文件获取里面文字
第二种将pdf转成图片后识别里面的二维码,或者调用百度图片识别接口。
二维码识别依赖
<dependency>
<groupId>com.google.zxing</groupId>
<artifactId>javase</artifactId>
<version>3.4.0</version>
</dependency>
<dependency>
<groupId>com.google.zxing</groupId>
<artifactId>core</artifactId>
<version>3.4.0</version>
</dependency>
pdf读取所需依赖
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/jempbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>jempbox</artifactId>
<version>1.8.16</version>
</dependency>
/**
 * Renders the first page of a PDF file to a PNG image on disk.
 *
 * <p>Both the source PDF path and the target PNG path are hard-coded in the
 * method body. The page is rendered with a scale factor of 2.0. Failures are
 * reported by printing the stack trace; no exception is propagated to the
 * caller. If the document has no pages, a message is printed and no target
 * file is created.
 */
public static void pdfFileToImage()
{
    // Source PDF file
    File pdffile = new File("C:/Users/Luo-ping/Desktop/顺丰电子发票.pdf");
    // Full path and file name of the PNG file to produce
    File targetFile = new File("D:/test.png");
    // try-with-resources guarantees the document (and the file handle behind it)
    // is released even when rendering or writing fails.
    try (PDDocument doc = PDDocument.load(pdffile))
    {
        if (doc.getNumberOfPages() <= 0)
        {
            // Nothing to render; bail out before touching the target file.
            System.err.println("PDF has no pages: " + pdffile.getAbsolutePath());
            return;
        }
        PDFRenderer renderer = new PDFRenderer(doc);
        BufferedImage image = renderer.renderImage(0, 2.0f);
        // Write straight to the target file instead of going through
        // intermediate in-memory streams.
        if (!ImageIO.write(image, "png", targetFile))
        {
            System.err.println("No PNG writer available for: " + targetFile.getAbsolutePath());
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
}
/**
 * Reads the given stream until end-of-stream and returns everything read.
 *
 * <p>The stream is always closed before this method returns, including when
 * reading fails part-way through.
 *
 * @param inStream stream to drain; must not be {@code null}
 * @return all bytes read from the stream (empty array if the stream was empty)
 * @throws IOException if reading from or closing the stream fails
 */
private static byte[] readInputStream(InputStream inStream) throws IOException
{
    // Binding the parameter as a resource closes it on every exit path.
    try (InputStream source = inStream)
    {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int len;
        while ((len = source.read(buffer)) != -1)
        {
            collected.write(buffer, 0, len);
        }
        return collected.toByteArray();
    }
}
/**
 * Decodes the barcode / QR code contained in the image at the hard-coded
 * path {@code D:/test.png} using ZXing's {@code MultiFormatReader}.
 *
 * <p>The decoded text is also printed to standard error. Failures (unreadable
 * image, no code found, ...) are reported by printing the stack trace or a
 * message; no exception is propagated to the caller.
 *
 * @return the decoded text, or an empty string if the image could not be
 *         read or no code could be decoded
 */
public static String extractImages()
{
    String filename = "D:/test.png";
    String returnResult = "";
    try
    {
        BufferedImage image = ImageIO.read(new File(filename));
        if (image == null)
        {
            // ImageIO.read returns null when no registered reader understands the file.
            System.err.println("Unsupported or unreadable image: " + filename);
            return returnResult;
        }
        // Decoder hints: the decoder reads DecodeHintType keys; an EncodeHintType
        // key would be silently ignored. Fully qualified so that no additional
        // import is required.
        Map<com.google.zxing.DecodeHintType, Object> hints = new HashMap<>();
        hints.put(com.google.zxing.DecodeHintType.CHARACTER_SET, "utf-8");
        // Binarize the image and run the decoder on it
        BinaryBitmap binaryBitmap = new BinaryBitmap(new HybridBinarizer(new BufferedImageLuminanceSource(image)));
        Result result = new MultiFormatReader().decode(binaryBitmap, hints);
        returnResult = result.getText();
        System.err.println(returnResult);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    return returnResult;
}
直接读取pdf
/**
 * Extracts the text of a PDF file using the PDFBox library.
 *
 * <p>The extracted text is written to the file {@code pdf.txt} (relative to
 * the working directory, using the platform default charset of
 * {@code FileWriter}) and printed to standard output. Failures are reported
 * on standard output together with a stack trace; no exception from reading
 * or writing is propagated to the caller.
 *
 * @param fileName path of the PDF file to read
 */
public static void readPDF(String fileName)
{
    File file = new File(fileName);
    // PDDocument.load opens the file read-only; try-with-resources closes the
    // document and its underlying file handle on every exit path.
    try (PDDocument pdfdocument = PDDocument.load(file))
    {
        // Strip the text out of the parsed document
        PDFTextStripper stripper = new PDFTextStripper();
        String result = stripper.getText(pdfdocument);
        // Closing the writer flushes it; the nested try closes it even if write fails.
        try (FileWriter fileWriter = new FileWriter(new File("pdf.txt")))
        {
            fileWriter.write(result);
        }
        System.out.println("PDF文件的文本内容如下:");
        System.out.println(result);
    }
    catch (Exception e)
    {
        System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败!" + e);
        e.printStackTrace();
    }
}