Java实现pdf/word文字识别,调用OCR提取图片文字聚合

/**
 * HTTP entry point for text extraction from uploaded PDF/Word documents.
 *
 * NOTE(review): no {@code @RestController} / {@code @RequestMapping} annotation is visible in this
 * excerpt — confirm they are present in the real source file.
 */
@Tag(name = "pdf/word/图片文字识别")
public class OcrController extends BaseController {


    // Service that performs the actual document text extraction.
    @Autowired
    private OcrService ocrService;


    // NOTE(review): not referenced by any method visible in this class — confirm it is still needed.
    @Autowired
    private BaiduOcrServiceImpl baiduOcrService;


    /**
     * Extracts text from an uploaded PDF/Word file by delegating to
     * {@code ocrService.recognizeText(file)}.
     *
     * @param file the uploaded file, bound from the multipart request parameter {@code file}
     * @return the value returned by {@code ocrService.recognizeText(file)}
     */
    @PostMapping("/recognize-text")
    @Operation(summary = "pdf/word识别文字", description = "识别")
    public String recognizeText(@RequestParam("file") MultipartFile file) {
        return ocrService.recognizeText(file);
    }

}
package com.jt.console.service.impl;

import com.jt.common.beans.ServiceAssert;
import com.jt.console.service.OcrService;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.xwpf.usermodel.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.Base64;

import static com.jt.console.service.impl.BaiduOcrServiceImpl.formatOcrResult;

/**
 * Extracts text from uploaded PDF / Word documents and can run OCR on the pictures embedded in a
 * DOCX document.
 *
 * @author chenchao
 * @date 2024/8/12 16:17
 */
@Service
public class OcrServiceImpl implements OcrService {

    /** Content types this service accepts. */
    private static final String MIME_PDF = "application/pdf";
    private static final String MIME_DOCX =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    private static final String MIME_TIKA_OOXML = "application/x-tika-ooxml";
    private static final String MIME_DOC = "application/msword";

    @Autowired
    private BaiduOcrServiceImpl baiduOcrService;

    /**
     * Extracts the text content of an uploaded file, choosing the extractor from the file's
     * declared content type.
     *
     * <p>Known limitation: tables and formulas may come out in a garbled order.
     *
     * <p>Failures are reported through {@code ServiceAssert.isTrue(false, message)}, which is
     * presumably expected to throw (not visible here); the {@code return null} statements only
     * cover the case where it does not.
     *
     * @param file the uploaded file
     * @return the extracted text
     */
    @Override
    public String recognizeText(MultipartFile file) {
        String contentType = file.getContentType();
        if (contentType == null) {
            ServiceAssert.isTrue(false, "文件类型不支持");
            return null;
        }

        boolean isPdf = MIME_PDF.equals(contentType);
        boolean isDocx = MIME_DOCX.equals(contentType) || MIME_TIKA_OOXML.equals(contentType);
        boolean isDoc = MIME_DOC.equals(contentType);

        // Reject unsupported types BEFORE entering the try block: raised inside it, this
        // assertion would be caught by the catch-all below and replaced by the generic
        // "处理文件出错" message, hiding the real reason from the caller.
        if (!isPdf && !isDocx && !isDoc) {
            ServiceAssert.isTrue(false, "不支持的文件类型");
            return null;
        }

        InputStream inputStream = null;
        try {
            inputStream = file.getInputStream();
            if (isPdf) {
                return extractTextFromPdf(inputStream);
            }
            if (isDocx) {
                return extractTextFromDocx(inputStream);
            }
            return extractTextFromDoc(inputStream);
        } catch (Exception e) {
            // NOTE(review): replace with the project's logger once one is available in this class.
            e.printStackTrace();
            ServiceAssert.isTrue(false, "处理文件出错");
            return null;
        } finally {
            closeQuietly(inputStream);
        }
    }

    /**
     * Closes the stream on a best-effort basis; a failure to close must not turn an otherwise
     * successful extraction into an error.
     */
    private static void closeQuietly(InputStream inputStream) {
        if (inputStream == null) {
            return;
        }
        try {
            inputStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the text layer of a PDF document. Embedded images are not processed.
     *
     * @param inputStream stream positioned at the start of the PDF data
     * @return the text returned by {@link PDFTextStripper#getText}
     * @throws IOException if the document cannot be loaded or read
     */
    private String extractTextFromPdf(InputStream inputStream) throws IOException {
        try (PDDocument document = PDDocument.load(inputStream)) {
            // Intended to silence CMap-related warnings.
            // NOTE(review): this is a JVM-wide property set on every call, after the document
            // has already been loaded — confirm PDFBox actually honours it.
            System.setProperty("org.apache.pdfbox.logging.SILENT", "true");

            PDFTextStripper pdfStripper = new PDFTextStripper();
            return pdfStripper.getText(document);
        }
    }

    /**
     * Extracts the text of a DOCX document: every paragraph (one per line) followed by every
     * table (cells separated by tabs, one row per line). Tables are therefore emitted after all
     * paragraphs, regardless of where they appear in the document. Embedded images are not
     * processed here; see {@link #extractImagesFromDocx(XWPFDocument, boolean)}.
     *
     * @param inputStream stream positioned at the start of the DOCX data
     * @return the extracted text
     * @throws IOException if the document cannot be opened or read
     */
    private String extractTextFromDocx(InputStream inputStream) throws IOException {
        StringBuilder text = new StringBuilder();
        // Allows highly compressed documents to be opened. This is a process-wide static setting.
        // NOTE(review): presumably this RELAXES POI's zip-bomb guard rather than tightening it —
        // confirm against the POI default before accepting untrusted uploads.
        ZipSecureFile.setMinInflateRatio(0.001);
        try (XWPFDocument document = new XWPFDocument(inputStream)) {
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                text.append(paragraph.getText()).append("\n");
            }

            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        text.append(cell.getText()).append("\t");
                    }
                    text.append("\n");
                }
            }
        }
        return text.toString();
    }

    /**
     * Extracts the text of a legacy DOC document, appending a newline after each paragraph
     * returned by {@link WordExtractor#getParagraphText()}.
     *
     * @param inputStream stream positioned at the start of the DOC data
     * @return the extracted text
     * @throws IOException if the document cannot be opened or read
     */
    private String extractTextFromDoc(InputStream inputStream) throws IOException {
        StringBuilder text = new StringBuilder();
        try (HWPFDocument document = new HWPFDocument(inputStream)) {
            WordExtractor extractor = new WordExtractor(document);
            for (String paragraph : extractor.getParagraphText()) {
                text.append(paragraph).append("\n");
            }
        }
        return text.toString();
    }

    /**
     * Writes every image XObject found on the pages of a PDF to {@code image<N>.png}, using paths
     * relative to the process working directory. Not currently called from this class.
     *
     * @param document the loaded PDF document
     * @throws IOException if an image cannot be decoded or written
     */
    private void extractImagesFromPdf(PDDocument document) throws IOException {
        PDPageTree pages = document.getPages();
        int imageCounter = 0;
        for (PDPage page : pages) {
            PDResources resources = page.getResources();
            if (resources == null) {
                // A page without a resources dictionary has no images to extract.
                continue;
            }
            for (COSName xObjectName : resources.getXObjectNames()) {
                PDXObject xObject = resources.getXObject(xObjectName);
                if (!(xObject instanceof PDImageXObject)) {
                    continue;
                }
                BufferedImage bufferedImage = ((PDImageXObject) xObject).getImage();
                File imageFile = new File("image" + (++imageCounter) + ".png");
                try (FileOutputStream fos = new FileOutputStream(imageFile)) {
                    ImageIO.write(bufferedImage, "PNG", fos);
                }
            }
        }
    }

    /**
     * Runs OCR on every picture embedded in a DOCX document and concatenates the results, one
     * line per picture in the form {@code "Image <n>: <text>"}.
     *
     * @param document  the opened DOCX document
     * @param urlEncode whether to URL-encode the Base64 image data before sending it; the OCR
     *                  client concatenates the value into a form-encoded body as-is, so
     *                  un-encoded Base64 characters such as {@code +} would presumably be
     *                  misread by the server
     * @return the recognized text of all pictures
     * @throws IOException if the OCR request fails at the I/O level
     */
    public String extractImagesFromDocx(XWPFDocument document, boolean urlEncode) throws IOException {
        StringBuilder recognitionResults = new StringBuilder();
        int imageCounter = 0;

        for (XWPFPictureData pictureData : document.getAllPictures()) {
            String base64Image = Base64.getEncoder().encodeToString(pictureData.getData());
            if (urlEncode) {
                base64Image = URLEncoder.encode(base64Image, "utf-8");
            }
            // recognizeImage() already returns formatted plain text. Passing that through
            // formatOcrResult() again would try to parse plain text as JSON and fail.
            String recognizedText = baiduOcrService.recognizeImage(base64Image);
            recognitionResults.append("Image ").append(++imageCounter).append(": ")
                    .append(recognizedText).append("\n");
        }
        return recognitionResults.toString();
    }

}

package com.jt.console.service.impl;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.jt.common.beans.ServiceAssert;
import okhttp3.*;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.Base64;
import java.util.List;
import java.util.Arrays;

/**
 * 百度OCR识别实现类
 */
@Service("baiduOcrServiceImpl")
public class BaiduOcrServiceImpl {

    @Value("${baidu.ocr.apiKey}")
    private String API_KEY;  // 客户端id

    @Value("${baidu.ocr.secretKey}")
    private String SECRET_KEY; // 客户端秘钥

    // 支持的图片格式列表
    private static final List<String> SUPPORTED_FORMATS = Arrays.asList("png", "jpg", "jpeg", "bmp", "gif");

    // 构建 OkHttpClient 实例
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient().newBuilder().build();

    // 获取 Access Token
    private String getAccessToken() throws IOException {
        MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");
        RequestBody body = RequestBody.create(mediaType, "grant_type=client_credentials&client_id=" + API_KEY
                + "&client_secret=" + SECRET_KEY);
        Request request = new Request.Builder()
                .url("https://aip.baidubce.com/oauth/2.0/token")
                .method("POST", body)
                .addHeader("Content-Type", "application/x-www-form-urlencoded")
                .build();
        Response response = HTTP_CLIENT.newCall(request).execute();
        if (!response.isSuccessful()) {
            //throw new IOException("Unexpected code " + response);
            // 自定义提示信息
            String errorMessage = "OCR request failed. Status code: " + response.code() + ", Message: " + response.message();
            ServiceAssert.isTrue(false, errorMessage);
        }
        String responseBody = response.body().string();
        JSONObject jsonObject = JSON.parseObject(responseBody);
        return jsonObject.getString("access_token");
    }

    // 调用 OCR 接口,返回结果
    public String recognizeImage(String base64Image) throws IOException {
        MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");
        RequestBody body = RequestBody.create(mediaType, "image=" + base64Image + "&detect_direction=false&paragraph=false&probability=false");
        Request request = new Request.Builder()
                .url("https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + getAccessToken())
                .method("POST", body)
                .addHeader("Content-Type", "application/x-www-form-urlencoded")
                .addHeader("Accept", "application/json")
                .build();
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                //throw new IOException("Unexpected code " + response);
                // 自定义提示信息
                String errorMessage = "Failed to obtain access token. Status code: " + response.code() + ", Message: " + response.message();
                ServiceAssert.isTrue(false, errorMessage);
            }
            return formatOcrResult(response.body().string());
        }
    }

    // 将 MultipartFile 转换为 Base64 编码
    public String convertToBase64(MultipartFile file, boolean urlEncode) throws IOException {
        // 检查图片格式
        String filename = file.getOriginalFilename();
        if (filename == null) {
            ServiceAssert.isTrue(false, "文件名为空");
        }
        String extension = filename.substring(filename.lastIndexOf('.') + 1).toLowerCase();
        if (!SUPPORTED_FORMATS.contains(extension)) {
            ServiceAssert.isTrue(false, "不支持的图片格式: " + extension);
        }
        // 从 MultipartFile 中获取字节数组
        byte[] bytes = file.getBytes();
        // 将字节数组编码为 Base64 字符串
        String base64 = Base64.getEncoder().encodeToString(bytes);
        // 如果需要 URL 编码
        if (urlEncode) {
            base64 = URLEncoder.encode(base64, "utf-8");
        }
        return base64;
    }


    //组装返回OCR识别的结果
    public static String formatOcrResult(String ocrResult) {
        StringBuilder resultText = new StringBuilder();
        try {
            // 解析 OCR 结果
            JSONObject jsonObject = JSON.parseObject(ocrResult);

            // 检查是否包含 words_result 数组
            if (jsonObject.containsKey("words_result")) {
                var wordsResult = jsonObject.getJSONArray("words_result");
                if (wordsResult != null && !wordsResult.isEmpty()) {
                    for (int i = 0; i < wordsResult.size(); i++) {
                        JSONObject wordObject = wordsResult.getJSONObject(i);
                        String word = wordObject.getString("words");
                        if (word != null && !word.isEmpty()) {
                            resultText.append(word).append(" ");
                        }
                    }
                } else {
                    // 如果没有识别到文字,直接返回空值
                    return "";
                }
            } else {
                // OCR 结果中不包含 words_result,也返回空值
                return "";
            }
        } catch (Exception e) {
            ServiceAssert.isTrue(false,e.getMessage());
            //resultText.append("处理 OCR 结果时出错:").append(e.getMessage());
        }
        return resultText.toString().trim();
    }
}

  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
要使用华为云的文字识别服务,您可以使用 Java SDK 进行开发。下面是快速入门教程的步骤。

1. 创建华为云账号:首先,您需要在华为云上注册一个账号,并创建一个新的项目。
2. 申请并开通 OCR 服务:在您的华为云项目中,找到并申请 OCR 服务。申请成功后,您需要开通 OCR 服务,获取相应的 Access Key ID 和 Access Key Secret。
3. 下载并安装 Java SDK:您可以从华为云官网下载 Java SDK,然后按照安装指南进行安装。
4. 配置 Java SDK:在您的 Java 项目中,添加 Java SDK 的依赖项,并将 Access Key ID 和 Access Key Secret 配置为环境变量。
5. 调用 OCR 服务:您可以使用 Java SDK 提供的 API 来调用 OCR 服务。以下是一个示例代码:

```java
import com.huaweicloud.sdk.core.auth.BasicCredentials;
import com.huaweicloud.sdk.core.exception.SdkException;
import com.huaweicloud.sdk.ocr.v1.OcrClient;
import com.huaweicloud.sdk.ocr.v1.model.RecognizeBusinessCardRequest;
import com.huaweicloud.sdk.ocr.v1.model.RecognizeBusinessCardResponse;

public class OCRDemo {
    public static void main(String[] args) {
        String ak = "your access key";
        String sk = "your secret key";
        BasicCredentials auth = new BasicCredentials()
                .withAk(ak)
                .withSk(sk);
        OcrClient ocrClient = OcrClient.newBuilder()
                .withCredential(auth)
                .withRegion("cn-north-4")
                .build();
        RecognizeBusinessCardRequest request = new RecognizeBusinessCardRequest()
                .withImageBase64("base64-encoded image");
        try {
            RecognizeBusinessCardResponse response = ocrClient.recognizeBusinessCard(request);
            System.out.println(response.getResult().getAddress());
            System.out.println(response.getResult().getCompanyName());
            System.out.println(response.getResult().getJobTitle());
        } catch (SdkException e) {
            e.printStackTrace();
        }
    }
}
```

以上代码使用 RecognizeBusinessCard API 来识别名片上的信息。您需要将 Access Key ID 和 Access Key Secret 替换为您自己的信息,并使用正确的区域代码。

6. 运行代码:最后,您可以运行代码,测试 OCR 服务是否正常工作。如果一切顺利,您应该能够从 API 响应中获取到识别的名片信息。

这是一个简单的 Java OCR 示例。如果您需要其他类型的 OCR 服务,请参考华为云的 OCR 开发指南。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值