腾讯云OCR（印刷体识别） API使用

最新推荐文章于 2024-05-07 15:48:52 发布

爱球鞋的程序猿

最新推荐文章于 2024-05-07 15:48:52 发布

阅读量922

点赞数

文章标签：腾讯云 java 云计算

本文链接：https://blog.csdn.net/braxuyu4/article/details/128079663

版权

腾讯云OCR（印刷体识别） API使用

使用背景：

业务部门下载了许多 PDF 格式的快递单，上面有联系方式，需要识别出其中的手机号码并保存下来。

解决思路：

1、如果是可编辑的pdf文档可使用pdf 读取jar包来处理
2、本需求的pdf是图片类型，无法编辑，所以只能把pdf转成image，然后使用OCR的图像识别来完成信息获取

腾讯云的OCR接入：

1、完善账号的实名等级，如果账号是企业的可以免费使用每月1000次的调用量
2、引入maven 依赖查看demo案例

<dependency>
   <groupId>com.tencentcloudapi</groupId>
    <artifactId>tencentcloud-sdk-java-ocr</artifactId>
    <version>3.1.270</version>
</dependency>

将pdf文件转为image图片

/**
 * Reads a PDF file from the local file system, Base64-encodes its bytes and
 * hands the result to {@code pdfToImg} to obtain a Base64-encoded image.
 *
 * @param pdfUrl path of the PDF file; despite the name it is opened as a local
 *               file via {@code FileInputStream}, not fetched as a URL
 * @return the value returned by {@code pdfToImg} for the file's contents, or
 *         {@code null} if the file could not be read
 */
public static String pdfToBase64(String pdfUrl) {
    String pdfBase64;
    // try-with-resources closes both streams even when reading fails part-way.
    try (InputStream is = new FileInputStream(new File(pdfUrl));
         ByteArrayOutputStream os = new ByteArrayOutputStream()) {
        byte[] buff = new byte[1024];
        int len;
        while ((len = is.read(buff)) != -1) {
            os.write(buff, 0, len);
        }
        pdfBase64 = Base64.getEncoder().encodeToString(os.toByteArray());
    } catch (IOException e) {
        // Previously the failure was swallowed and an empty string was passed on
        // to pdfToImg. Report the real cause and stop here instead; the caller
        // still sees a null result, matching pdfToImg's own failure value.
        e.printStackTrace();
        return null;
    }
    return pdfToImg(pdfBase64);
}

/**
 * Renders every page of a Base64-encoded PDF and stacks the pages vertically
 * into a single PNG image, returned as a Base64 string without line breaks.
 *
 * <p>Pages are placed top to bottom in document order, each starting at x = 0.
 * The canvas is as wide as the widest page and as tall as the sum of all page
 * heights, so documents whose pages differ in size are handled as well.
 *
 * @param base64 Base64 text of the PDF bytes (line breaks are tolerated)
 * @return Base64-encoded PNG of the merged pages, or {@code null} if the input
 *         cannot be decoded, parsed or rendered, or the document has no pages
 */
public static String pdfToImg(String base64) {
    // DPI handed to the renderer: higher values give a sharper but larger image
    // and a slower conversion.
    final int renderDpi = 106;
    try {
        // The MIME decoder ignores line separators, so wrapped Base64 input
        // is accepted as well as the unwrapped form.
        byte[] pdfBytes = Base64.getMimeDecoder().decode(base64);

        // try-with-resources closes the document exactly once, including when
        // rendering throws.
        try (PDDocument document = PDDocument.load(pdfBytes)) {
            int pageCount = document.getNumberOfPages();
            if (pageCount == 0) {
                // Nothing to render; avoids building a zero-height image.
                return null;
            }

            // Render all pages first so the canvas can be sized from the real
            // page dimensions. Note this keeps every rendered page in memory
            // until it has been copied onto the canvas.
            PDFRenderer renderer = new PDFRenderer(document);
            BufferedImage[] pages = new BufferedImage[pageCount];
            int canvasWidth = 0;
            int canvasHeight = 0;
            for (int i = 0; i < pageCount; i++) {
                pages[i] = renderer.renderImageWithDPI(i, renderDpi, ImageType.RGB);
                canvasWidth = Math.max(canvasWidth, pages[i].getWidth());
                canvasHeight += pages[i].getHeight();
            }

            BufferedImage merged =
                    new BufferedImage(canvasWidth, canvasHeight, BufferedImage.TYPE_INT_RGB);
            int offsetY = 0;
            for (int i = 0; i < pageCount; i++) {
                BufferedImage page = pages[i];
                int pageWidth = page.getWidth();
                int pageHeight = page.getHeight();
                int[] pageRgb = page.getRGB(0, 0, pageWidth, pageHeight, null, 0, pageWidth);
                merged.setRGB(0, offsetY, pageWidth, pageHeight, pageRgb, 0, pageWidth);
                offsetY += pageHeight;
                // Drop the reference so the page can be collected once copied.
                pages[i] = null;
            }

            ByteArrayOutputStream pngOut = new ByteArrayOutputStream();
            ImageIO.write(merged, "png", pngOut);
            // The basic encoder emits no line separators, so no stripping is needed.
            return Base64.getEncoder().encodeToString(pngOut.toByteArray());
        }
    } catch (IOException | RuntimeException e) {
        // Best-effort contract: report the failure and signal it with null.
        e.printStackTrace();
        return null;
    }
}

3、完成pdf的读取

/**
 * Converts the PDF at the given path to an image and submits it to the Tencent
 * Cloud GeneralBasicOCR API.
 *
 * @param pdfUrl path of the PDF file, passed unchanged to {@code pdfToBase64}
 * @return the OCR response serialized as a JSON string
 * @throws TencentCloudSDKException if the PDF could not be converted to an
 *         image, or if the OCR request itself fails
 */
public static String ocrScanPdf(String pdfUrl) throws TencentCloudSDKException {
    // Convert first: pdfToBase64 yields null when the file cannot be read or
    // rendered, and there is no point calling the remote API without an image.
    String image = pdfToBase64(pdfUrl);
    if (image == null || image.isEmpty()) {
        throw new TencentCloudSDKException("Failed to convert PDF to an image: " + pdfUrl);
    }

    // Placeholder credentials: replace with a real SecretId/SecretKey, ideally
    // loaded from configuration rather than kept in source code.
    Credential cred = new Credential("secretId", "secretkey");

    // Optional HTTP profile; only the endpoint is set here.
    HttpProfile httpProfile = new HttpProfile();
    httpProfile.setEndpoint("ocr.tencentcloudapi.com");

    // Optional client profile wrapping the HTTP profile.
    ClientProfile clientProfile = new ClientProfile();
    clientProfile.setHttpProfile(httpProfile);

    // Client for the OCR product in the ap-shanghai region.
    OcrClient client = new OcrClient(cred, "ap-shanghai", clientProfile);

    // Each API has its own request type; this one carries the Base64 image.
    GeneralBasicOCRRequest req = new GeneralBasicOCRRequest();
    req.setImageBase64(image);

    // The response type mirrors the request type.
    GeneralBasicOCRResponse resp = client.GeneralBasicOCR(req);

    // Return the response as a JSON string.
    return GeneralBasicOCRResponse.toJsonString(resp);
}