Java实现PDF/Word文字识别，并调用OCR提取图片中的文字进行聚合

躺着数星星

于 2024-08-13 23:00:13 发布

阅读量57

点赞数 2

文章标签： java pdf word

本文链接：https://blog.csdn.net/qq_55362295/article/details/141176122

版权

@Tag(name = "pdf/word/图片文字识别")
public class OcrController extends BaseController {

    /** Service the text-recognition endpoint delegates to. */
    @Autowired
    private OcrService ocrService;

    /** Baidu OCR client; injected here but not referenced by any endpoint in this class. */
    @Autowired
    private BaiduOcrServiceImpl baiduOcrService;

    /**
     * Recognizes the text of an uploaded PDF / Word document.
     *
     * @param file the uploaded document, bound from the multipart request parameter {@code file}
     * @return the value produced by {@code ocrService.recognizeText(file)}, returned unchanged
     */
    @PostMapping("/recognize-text")
    @Operation(summary = "pdf/word识别文字", description = "识别")
    public String recognizeText(@RequestParam("file") MultipartFile file) {
        // Pure delegation: no validation or post-processing happens at the controller level.
        final String recognizedText = ocrService.recognizeText(file);
        return recognizedText;
    }

}

package com.jt.console.service.impl;

import com.jt.common.beans.ServiceAssert;
import com.jt.console.service.OcrService;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.xwpf.usermodel.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.Base64;

import static com.jt.console.service.impl.BaiduOcrServiceImpl.formatOcrResult;

/**
 * Text extraction for uploaded PDF / Word files, plus OCR of the images embedded in DOCX files.
 *
 * @author chenchao
 * @date 2024/8/12 16:17
 */
@Service
public class OcrServiceImpl implements OcrService {

    /** Content type handled by {@link #extractTextFromPdf(InputStream)}. */
    private static final String MIME_PDF = "application/pdf";

    /** Content types handled by {@link #extractTextFromDocx(InputStream)}. */
    private static final String MIME_DOCX =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    private static final String MIME_OOXML_GENERIC = "application/x-tika-ooxml";

    /** Content type handled by {@link #extractTextFromDoc(InputStream)}. */
    private static final String MIME_DOC = "application/msword";

    @Autowired
    private BaiduOcrServiceImpl baiduOcrService;


    /**
     * Extracts the text content of an uploaded file, choosing the extractor by the
     * content type reported for the upload.
     *
     * <p>Known limitation (from the original author): tables and formulas may come out garbled.
     *
     * <p>Failures are reported through {@code ServiceAssert.isTrue(false, message)}; the
     * {@code return null} statements only matter if that call returns normally.
     *
     * @param file the uploaded file
     * @return the extracted text, or {@code null} after a failure has been reported
     */
    @Override
    public String recognizeText(MultipartFile file) {
        String contentType = file.getContentType();
        if (contentType == null) {
            ServiceAssert.isTrue(false, "文件类型不支持");
            return null;
        }

        boolean isPdf = MIME_PDF.equals(contentType);
        boolean isDocx = MIME_DOCX.equals(contentType) || MIME_OOXML_GENERIC.equals(contentType);
        boolean isDoc = MIME_DOC.equals(contentType);

        // Reject unsupported types BEFORE entering the try block. Previously this assertion sat
        // inside the try, so (assuming ServiceAssert signals failure by throwing) the broad
        // catch below swallowed it and replaced the message with the generic "处理文件出错".
        if (!isPdf && !isDocx && !isDoc) {
            ServiceAssert.isTrue(false, "不支持的文件类型");
            return null;
        }

        InputStream inputStream = null;
        try {
            inputStream = file.getInputStream();
            if (isPdf) {
                return extractTextFromPdf(inputStream);
            }
            if (isDocx) {
                return extractTextFromDocx(inputStream);
            }
            return extractTextFromDoc(inputStream);
        } catch (Exception e) {
            // No logger is available in this class; keep the stack trace visible on stderr.
            e.printStackTrace();
            ServiceAssert.isTrue(false, "处理文件出错");
            return null;
        } finally {
            // Closed by hand (not try-with-resources) so a failing close() is only logged and
            // never replaces a successfully extracted result.
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Extracts the text layer of a PDF document. Embedded images are not processed here.
     *
     * @param inputStream stream positioned at the start of the PDF data
     * @return the text returned by {@link PDFTextStripper#getText(PDDocument)}
     * @throws IOException if the document cannot be loaded or read
     */
    private String extractTextFromPdf(InputStream inputStream) throws IOException {
        // Intended by the original author to suppress CMap-related warnings.
        // NOTE(review): confirm that the PDFBox version in use actually honours this property.
        System.setProperty("org.apache.pdfbox.logging.SILENT", "true");

        try (PDDocument document = PDDocument.load(inputStream)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            // To also dump embedded images, call extractImagesFromPdf(document) here.
            return pdfStripper.getText(document);
        }
    }

    /**
     * Extracts the text of a DOCX document: first every body paragraph (one per line), then
     * every table (cells separated by tabs, one row per line). Embedded images are not
     * processed here.
     *
     * @param inputStream stream positioned at the start of the DOCX data
     * @return the concatenated paragraph and table text
     * @throws IOException if the document cannot be opened or read
     */
    private String extractTextFromDocx(InputStream inputStream) throws IOException {
        StringBuilder text = new StringBuilder();

        // This LOWERS the minimum allowed compressed/uncompressed ratio, i.e. it relaxes POI's
        // zip-bomb detection so that highly compressed documents are accepted. It is a global,
        // static setting that affects every later POI zip read in this JVM.
        ZipSecureFile.setMinInflateRatio(0.001);

        try (XWPFDocument document = new XWPFDocument(inputStream)) {
            // Body paragraphs, one per line.
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                text.append(paragraph.getText()).append("\n");
            }

            // Tables: tab after every cell, newline after every row.
            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        text.append(cell.getText()).append("\t");
                    }
                    text.append("\n");
                }
            }

            // To also OCR embedded images, call extractImagesFromDocx(document, urlEncode) here.
        }
        return text.toString();
    }

    /**
     * Extracts the text of a legacy binary DOC document, appending a newline after each
     * paragraph string returned by {@link WordExtractor#getParagraphText()}.
     *
     * @param inputStream stream positioned at the start of the DOC data
     * @return the concatenated paragraph text
     * @throws IOException if the document cannot be opened or read
     */
    private String extractTextFromDoc(InputStream inputStream) throws IOException {
        StringBuilder text = new StringBuilder();
        try (HWPFDocument document = new HWPFDocument(inputStream)) {
            WordExtractor extractor = new WordExtractor(document);
            for (String paragraph : extractor.getParagraphText()) {
                text.append(paragraph).append("\n");
            }
        }
        return text.toString();
    }


    /**
     * Writes every image XObject found in the page resources of a PDF to PNG files named
     * {@code image1.png}, {@code image2.png}, ... relative to the process working directory.
     *
     * @param document the already opened PDF document
     * @throws IOException if an image cannot be decoded or written
     */
    private void extractImagesFromPdf(PDDocument document) throws IOException {
        PDPageTree pages = document.getPages();
        int imageCounter = 0;
        for (PDPage page : pages) {
            PDResources resources = page.getResources();
            if (resources == null) {
                // A page without a resource dictionary has no images to extract.
                continue;
            }
            for (COSName xObjectName : resources.getXObjectNames()) {
                PDXObject xObject = resources.getXObject(xObjectName);
                if (!(xObject instanceof PDImageXObject)) {
                    continue;
                }
                BufferedImage bufferedImage = ((PDImageXObject) xObject).getImage();
                File imageFile = new File("image" + (++imageCounter) + ".png");
                try (FileOutputStream fos = new FileOutputStream(imageFile)) {
                    ImageIO.write(bufferedImage, "PNG", fos);
                }
            }
        }
    }


    /**
     * Runs OCR on every picture embedded in a DOCX document and aggregates the recognized text.
     *
     * @param document  the already opened DOCX document
     * @param urlEncode whether the Base64 image data is URL-encoded before being sent
     * @return one line per picture, in the form {@code "Image <n>: <recognized text>"}
     * @throws IOException if the OCR request fails at the I/O level
     */
    public String extractImagesFromDocx(XWPFDocument document, boolean urlEncode) throws IOException {
        StringBuilder recognitionResults = new StringBuilder();
        int imageCounter = 0;

        for (XWPFPictureData pictureData : document.getAllPictures()) {
            // Base64-encode the raw picture bytes, optionally URL-encoding the result.
            String base64Image = Base64.getEncoder().encodeToString(pictureData.getData());
            if (urlEncode) {
                base64Image = URLEncoder.encode(base64Image, "utf-8");
            }

            // recognizeImage already returns the formatted plain text. Passing that text through
            // formatOcrResult a second time (as the previous code did) tries to parse plain
            // text as JSON and fails, so the result is used as-is.
            String recognizedText = baiduOcrService.recognizeImage(base64Image);
            recognitionResults.append("Image ").append(++imageCounter).append(": ")
                    .append(recognizedText).append("\n");
        }
        return recognitionResults.toString();
    }


}

package com.jt.console.service.impl;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.jt.common.beans.ServiceAssert;
import okhttp3.*;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.Base64;
import java.util.List;
import java.util.Arrays;

/**
 * Client for the Baidu OCR HTTP API: obtains an access token, submits Base64-encoded images
 * for recognition and flattens the JSON answer into plain text.
 */
@Service("baiduOcrServiceImpl")
public class BaiduOcrServiceImpl {

    /** Client id, injected from {@code baidu.ocr.apiKey}. */
    @Value("${baidu.ocr.apiKey}")
    private String API_KEY;

    /** Client secret, injected from {@code baidu.ocr.secretKey}. */
    @Value("${baidu.ocr.secretKey}")
    private String SECRET_KEY;

    /** File extensions (lower case) accepted by {@link #convertToBase64(MultipartFile, boolean)}. */
    private static final List<String> SUPPORTED_FORMATS = Arrays.asList("png", "jpg", "jpeg", "bmp", "gif");

    /** Single HTTP client instance used for all requests made by this class. */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient().newBuilder().build();

    /**
     * Requests an access token using the configured client id and secret.
     *
     * @return the {@code access_token} value from the token endpoint's JSON response
     * @throws IOException if the HTTP call fails at the I/O level
     */
    private String getAccessToken() throws IOException {
        MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");
        RequestBody body = RequestBody.create(mediaType, "grant_type=client_credentials&client_id=" + API_KEY
                + "&client_secret=" + SECRET_KEY);
        Request request = new Request.Builder()
                .url("https://aip.baidubce.com/oauth/2.0/token")
                .method("POST", body)
                .addHeader("Content-Type", "application/x-www-form-urlencoded")
                .build();
        // try-with-resources: the response was previously never closed, leaking the connection.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                String errorMessage = "Failed to obtain access token. Status code: " + response.code()
                        + ", Message: " + response.message();
                ServiceAssert.isTrue(false, errorMessage);
            }
            JSONObject jsonObject = JSON.parseObject(response.body().string());
            String accessToken = jsonObject == null ? null : jsonObject.getString("access_token");
            // Fail here instead of silently sending "access_token=null" to the OCR endpoint.
            ServiceAssert.isTrue(accessToken != null && !accessToken.isEmpty(),
                    "Failed to obtain access token. Response contains no access_token");
            return accessToken;
        }
    }

    /**
     * Sends a Base64-encoded image to the OCR endpoint and returns the recognized text.
     *
     * <p>The image string is placed verbatim into a form-urlencoded body, so the caller is
     * responsible for URL-encoding it where required. A fresh access token is requested on
     * every call.
     *
     * @param base64Image the Base64 image data, as it should appear in the request body
     * @return the recognized text, already flattened by {@link #formatOcrResult(String)}
     * @throws IOException if an HTTP call fails at the I/O level
     */
    public String recognizeImage(String base64Image) throws IOException {
        MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");
        RequestBody body = RequestBody.create(mediaType, "image=" + base64Image + "&detect_direction=false&paragraph=false&probability=false");
        Request request = new Request.Builder()
                .url("https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + getAccessToken())
                .method("POST", body)
                .addHeader("Content-Type", "application/x-www-form-urlencoded")
                .addHeader("Accept", "application/json")
                .build();
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                String errorMessage = "OCR request failed. Status code: " + response.code()
                        + ", Message: " + response.message();
                ServiceAssert.isTrue(false, errorMessage);
            }
            return formatOcrResult(response.body().string());
        }
    }

    /**
     * Validates the extension of an uploaded image and converts its content to Base64.
     *
     * @param file      the uploaded image; its original file name must end in a supported extension
     * @param urlEncode whether the Base64 string is additionally URL-encoded (UTF-8)
     * @return the Base64 (optionally URL-encoded) representation of the file content
     * @throws IOException if the file content cannot be read
     */
    public String convertToBase64(MultipartFile file, boolean urlEncode) throws IOException {
        String filename = file.getOriginalFilename();
        if (filename == null) {
            ServiceAssert.isTrue(false, "文件名为空");
        }
        // Locale.ROOT keeps the comparison stable under locales with special casing rules
        // (e.g. Turkish, where "GIF".toLowerCase() does not yield "gif").
        String extension = filename.substring(filename.lastIndexOf('.') + 1)
                .toLowerCase(java.util.Locale.ROOT);
        if (!SUPPORTED_FORMATS.contains(extension)) {
            ServiceAssert.isTrue(false, "不支持的图片格式: " + extension);
        }

        String base64 = Base64.getEncoder().encodeToString(file.getBytes());
        if (urlEncode) {
            base64 = URLEncoder.encode(base64, "utf-8");
        }
        return base64;
    }


    /**
     * Flattens an OCR JSON response into plain text: the non-empty {@code words} values of the
     * {@code words_result} array, joined by single spaces.
     *
     * @param ocrResult the raw JSON response of the OCR endpoint
     * @return the joined text, or an empty string when the response carries no
     *         {@code words_result} entries
     */
    public static String formatOcrResult(String ocrResult) {
        StringBuilder resultText = new StringBuilder();
        try {
            JSONObject jsonObject = JSON.parseObject(ocrResult);
            // A null parse result (seen with null/empty input) is treated like a response
            // without words_result instead of surfacing as a NullPointerException.
            if (jsonObject == null || !jsonObject.containsKey("words_result")) {
                return "";
            }

            var wordsResult = jsonObject.getJSONArray("words_result");
            if (wordsResult == null || wordsResult.isEmpty()) {
                // Nothing was recognized.
                return "";
            }

            for (int i = 0; i < wordsResult.size(); i++) {
                JSONObject wordObject = wordsResult.getJSONObject(i);
                if (wordObject == null) {
                    continue;
                }
                String word = wordObject.getString("words");
                if (word != null && !word.isEmpty()) {
                    resultText.append(word).append(" ");
                }
            }
        } catch (Exception e) {
            // Malformed responses are reported through the project's assertion helper.
            ServiceAssert.isTrue(false, e.getMessage());
        }
        return resultText.toString().trim();
    }
}