Java解析上传文件的内容（比如简历信息的解析）

最新推荐文章于 2024-07-16 05:03:42 发布

Wmenghu

最新推荐文章于 2024-07-16 05:03:42 发布

阅读量2.3k

点赞数 4

分类专栏：后端文章标签： java servlet jvm

本文链接：https://blog.csdn.net/w13966597931/article/details/126405332

版权

后端专栏收录该内容

24 篇文章 0 订阅

订阅专栏

效果应该是这样的（图片借鉴某网站的简历上传）

代码很多，这里记录一下，回头有空了再研究研究

controller

    /**
     * Receives one or more uploaded resume files, stores them in a timestamped
     * temporary folder and hands the stored files to {@code processService} for parsing.
     *
     * <p>Files are written below {@code <resources>/temp/rckfile/<yyyyMMddHHmmss>/}, where
     * {@code <resources>} comes from {@code fileConfigProperties.getResources()}.
     * Only files that were copied completely are passed on to the parser.
     *
     * <p>NOTE(review): the folder name has one-second resolution, so two requests arriving in
     * the same second share a folder and same-named files overwrite each other. The stored
     * files are also not deleted here after parsing — confirm whether that is intended.
     *
     * @param file    uploaded multipart files; may be {@code null} or empty
     * @param request current HTTP request, forwarded unchanged to the service
     * @return a {@code JsonSuccessObject} wrapping the list returned by the service
     * @throws BusinessException when an {@link IOException} or {@link NullPointerException}
     *                           occurs while storing or parsing the files
     * @throws IOException       declared for interface compatibility
     */
    @ApiOperation(value = "简历解析", notes = "简历解析")
    @PostMapping(Urls.ResumeSelection.uploadFileAddA01)
    @DataLog(operationName = "简历解析", logType = LogTypeEnum.DATA_LOG, methodType = MethodTypeEnum.UPLOAD_TYPE)
    @ApiOperationSupport(order = 29)
    public JsonObject<Object> uploadFileAddA01(@RequestParam(value = "file") MultipartFile[] file, HttpServletRequest request) throws IOException {
        try {
            // TODO: the base directory is read from configuration; the property must exist in the config file.
            userDir = fileConfigProperties.getResources();
            String folderSources = userDir + File.separator + "temp" + File.separator + "rckfile";
            // The current timestamp is used as the folder name.
            SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
            String folderPath = folderSources + File.separator + df.format(new Date());
            File rmbFile = new File(folderPath);
            // mkdirs() returns false both on failure and when the folder already exists,
            // so re-check with isDirectory() before giving up.
            if (!rmbFile.mkdirs() && !rmbFile.isDirectory()) {
                throw new IOException("Unable to create upload directory: " + folderPath);
            }
            List<File> fileList = new ArrayList<>();
            for (int i = 0; file != null && i < file.length; i++) {
                // The original filename is client-controlled: keep only its last path segment
                // so a name such as "../../x.jsp" cannot escape the upload folder.
                String name = safeFileName(file[i].getOriginalFilename());
                if (name == null) {
                    logger.error("Skipping uploaded part without a usable file name");
                    continue;
                }
                File newPhotoFile = new File(rmbFile, name);
                // try-with-resources closes both streams even when the copy fails.
                try (InputStream streamList = file[i].getInputStream();
                     OutputStream out = new FileOutputStream(newPhotoFile)) {
                    // Only completely written files are queued for parsing.
                    if (this.write(streamList, out)) {
                        fileList.add(newPhotoFile);
                    }
                } catch (FileNotFoundException e) {
                    logger.error(e.getMessage(), e);
                }
            }
            // Parse the stored files.
            List<Map<String, Object>> result = processService.uploadFileAddA01(fileList, request);
            return new JsonSuccessObject<>(result);
        } catch (IOException | NullPointerException e) {
            // Pass the exception itself so the stack trace is not lost.
            logger.error("文件解析失败：" + e.getMessage(), e);
            throw new BusinessException("文件解析失败！");
        }
    }

    /**
     * Reduces a client-supplied file name to its last path segment.
     *
     * @param originalName name as sent by the client; may be {@code null}
     * @return the bare file name, or {@code null} when nothing usable remains
     *         (null, blank, {@code "."} or {@code ".."})
     */
    private static String safeFileName(String originalName) {
        if (originalName == null) {
            return null;
        }
        // Clients may send either separator regardless of the server platform.
        int cut = Math.max(originalName.lastIndexOf('/'), originalName.lastIndexOf('\\'));
        String base = originalName.substring(cut + 1).trim();
        if (base.isEmpty() || ".".equals(base) || "..".equals(base)) {
            return null;
        }
        return base;
    }


    /**
     * Copies everything readable from {@code in} to {@code out} using a 64 KiB buffer.
     * Neither stream is flushed or closed here; that is left to the caller.
     *
     * @param in  source stream
     * @param out destination stream
     * @return {@code true} when the copy finished, {@code false} when an
     *         {@link IOException} occurred (the message is logged)
     */
    public boolean write(InputStream in, OutputStream out) {
        final int chunkSize = 65536;
        try {
            final byte[] chunk = new byte[chunkSize];
            for (int read = in.read(chunk); read > -1; read = in.read(chunk)) {
                out.write(chunk, 0, read);
            }
            return true;
        } catch (IOException e) {
            logger.error("异常信息：" + e.getMessage());
            return false;
        }
    }

Service

/**
 * Parses the given resume files and returns the extracted data as a list of key/value maps.
 *
 * @param fileList files to parse
 * @param request  current HTTP request
 * @return list of maps holding the extracted fields (presumably one map per parsed file;
 *         the exact keys are defined by the implementation)
 * @throws IOException if reading a file fails
 */
List<Map<String, Object>> uploadFileAddA01(List<File> fileList, HttpServletRequest request) throws IOException;

impl

    /**
     * Extracts plain text from every given resume file and pulls basic candidate fields
     * out of that text with regular expressions.
     *
     * <p>Supported extensions (matched case-insensitively): {@code doc}, {@code docx},
     * {@code pdf} (at most 50 pages, longer documents yield no text), {@code htm}/{@code html},
     * {@code png}/{@code jpg}/{@code jpeg} (sent to the OCR endpoint returned by
     * {@code getocrUrl()}) and {@code ppt}/{@code pptx}. Files of any other type, and files
     * from which no text could be extracted, contribute nothing to the result.
     *
     * <p>Keys written into each result map:
     * {@code r0101} name, {@code r0102} sex code (code table {@code gb22611}),
     * {@code r0103} nation code ({@code gb3304}), {@code r0104} birthday,
     * {@code r0105} political status code ({@code gb4762}), {@code r0110} current address,
     * {@code r0114} phone digits, {@code r0115} e-mail.
     *
     * @param fileList files to parse
     * @param request  current HTTP request (not used by this implementation)
     * @return one map per file from which non-blank text was extracted
     * @throws IOException if an HTML or PowerPoint file cannot be read/closed, or closing a PDF fails
     */
    @Override
    public List<Map<String, Object>> uploadFileAddA01(List<File> fileList, HttpServletRequest request) throws IOException {
        List<Map<String, Object>> result = new ArrayList<>();
        for (File f : fileList) {
            String fileName = f.getName();
            // Lower-case the extension so that e.g. "CV.PDF" is recognised as a PDF.
            String suffix = fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase(java.util.Locale.ROOT);
            String text = "";
            if (("doc").equals(suffix) || ("docx").equals(suffix)) {
                try {
                    if (("doc").equals(suffix)) {
                        // The file stream is closed even when the extractor cannot be created.
                        try (InputStream is = new FileInputStream(f)) {
                            WordExtractor re = new WordExtractor(is);
                            try {
                                text = re.getText();
                            } finally {
                                re.close();
                            }
                        }
                    } else {
                        OPCPackage opcPackage = POIXMLDocument.openPackage(f.getPath());
                        POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                        try {
                            text = extractor.getText();
                        } finally {
                            // Closing the extractor also releases the underlying package.
                            extractor.close();
                        }
                    }
                } catch (Exception e1) {
                    logger.error("异常信息：" + e1.getMessage(), e1);
                }
            } else if (("pdf").equals(suffix)) {
                PDDocument document = null;
                try {
                    document = PDDocument.load(f);
                    int pageSize = document.getNumberOfPages();
                    // Documents with more than 50 pages are deliberately not read.
                    if (pageSize <= 50) {
                        // Read page by page.
                        for (int i = 0; i < pageSize; i++) {
                            PDFTextStripper stripper = new PDFTextStripper();
                            // Emit text in visual (position) order.
                            stripper.setSortByPosition(true);
                            stripper.setStartPage(i + 1);
                            stripper.setEndPage(i + 1);
                            text = text + stripper.getText(document);
                        }
                    }
                } catch (IOException e) {
                    logger.error(e.getMessage(), e);
                } finally {
                    if (document != null) {
                        document.close();
                    }
                }
            } else if (("htm").equals(suffix) || ("html").equals(suffix)) {
                StringBuilder htmlSb = new StringBuilder();
                try (BufferedReader br = new BufferedReader(new InputStreamReader(
                        new FileInputStream(f), "utf-8"))) {
                    // readLine() == null is the reliable end-of-file signal; ready() only says
                    // whether a read would block and may report false before the end.
                    String line;
                    while ((line = br.readLine()) != null) {
                        htmlSb.append(line);
                    }
                } catch (FileNotFoundException e) {
                    logger.error("异常信息：" + e.getMessage(), e);
                }
                // The raw HTML (lines concatenated without separators) is used as text.
                text = htmlSb.toString();
            } else if (("png").equals(suffix) || ("jpg").equals(suffix) || ("jpeg").equals(suffix)) {
                // Images are POSTed to the OCR service, e.g. http://192.168.4.188:8869/paddle_ocr
                HttpClient client = new HttpClient();
                PostMethod postMethod = new PostMethod(getocrUrl());
                try {
                    // FilePart wraps the file to upload.
                    FilePart fp = new FilePart("file", f);
                    Part[] parts = {fp};
                    MultipartRequestEntity mre = new MultipartRequestEntity(parts, postMethod.getParams());
                    postMethod.setRequestEntity(mre);
                    // Uploaded files may be large, so allow a generous connection timeout.
                    client.getHttpConnectionManager().getParams().setConnectionTimeout(50000);
                    int status = client.executeMethod(postMethod);
                    if (status == HttpStatus.SC_OK) {
                        String json;
                        try (InputStream inputStream = postMethod.getResponseBodyAsStream();
                             ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
                            byte[] buffer = new byte[1024];
                            int len;
                            while ((len = inputStream.read(buffer)) != -1) {
                                baos.write(buffer, 0, len);
                            }
                            // Decode explicitly instead of relying on the platform charset.
                            // NOTE(review): assumes the OCR service answers in UTF-8 (the JSON default) — confirm.
                            json = baos.toString("UTF-8");
                        }
                        // Collect what sits between "text": " and ", "text_region"
                        // e.g. "text": "女，17302731183", "text_region":
                        String regEx = "(\"text\": \")(.*?)(\", \"text_region\")";
                        Pattern p = Pattern.compile(regEx);
                        Matcher m = p.matcher(json);
                        StringBuilder ocrText = new StringBuilder();
                        while (m.find()) {
                            ocrText.append(m.group(2));
                        }
                        text = ocrText.toString();
                    }
                } catch (Exception e) {
                    logger.error("异常信息：" + e.getMessage(), e);
                } finally {
                    // Release the HTTP connection.
                    postMethod.releaseConnection();
                }
            } else if (("ppt").equals(suffix) || ("pptx").equals(suffix)) {
                // NOTE(review): XMLSlideShow reads the OOXML (.pptx) format; a legacy binary
                // .ppt presumably fails here and is only logged — confirm.
                StringBuilder sb = new StringBuilder();
                try (FileInputStream in = new FileInputStream(f.getPath())) {
                    try {
                        XMLSlideShow xmlSlideShow = new XMLSlideShow(in);
                        try {
                            List<XSLFSlide> slides = xmlSlideShow.getSlides();
                            for (XSLFSlide slide : slides) {
                                CTSlide rawSlide = slide.getXmlObject();
                                CTGroupShape gs = rawSlide.getCSld().getSpTree();
                                CTShape[] shapes = gs.getSpArray();
                                for (CTShape shape : shapes) {
                                    CTTextBody tb = shape.getTxBody();
                                    if (null == tb) {
                                        continue;
                                    }
                                    CTTextParagraph[] paras = tb.getPArray();
                                    for (CTTextParagraph textParagraph : paras) {
                                        CTRegularTextRun[] textRuns = textParagraph.getRArray();
                                        for (CTRegularTextRun textRun : textRuns) {
                                            sb.append(textRun.getT());
                                        }
                                    }
                                }
                            }
                            text = sb.toString();
                        } finally {
                            // Close the slide show even when walking the slides failed.
                            xmlSlideShow.close();
                        }
                    } catch (Exception e) {
                        logger.error("异常信息：" + e.getMessage(), e);
                    }
                }
            }
            Map<String, Object> saveCandidateVo = new HashMap<>();

            if (StrUtil.isNotBlank(text)) {
                // Remove whitespace inside (and directly after) the field labels so the
                // label-based patterns below can match them.
                text = text.replaceFirst("姓[\\s]*名[\\s]*", "姓名").replaceFirst("性[\\s]*别[\\s]*", "性别")
                        .replaceFirst("民[\\s]*族[\\s]*", "民族").replaceFirst("现[\\s]*居[\\s]*住[\\s]*地[\\s]*", "现居住地")
                        .replaceFirst("现[\\s]*居[\\s]*地[\\s]*", "现居地").replaceFirst("居[\\s]*住[\\s]*地[\\s]*", "居住地")
                        .replaceFirst("所[\\s]*在[\\s]*地[\\s]*", "所在地").replaceFirst("现[\\s]*所[\\s]*在[\\s]*地[\\s]*", "现所在地")
                        .replaceFirst("生[\\s]*日[\\s]*", "生日");

                // Name
                String name = RecA01ServiceImpl.getName(text);
                saveCandidateVo.put("r0101", name);
                // Sex, mapped through code table gb22611.
                // NOTE(review): the unqualified getName(...) is this class's sex extractor,
                // despite its name; RecA01ServiceImpl.getName above is presumably a different method — confirm.
                String sex = getName(text);
                if (StrUtil.isNotBlank(sex)) {
                    String code = processMapper.getDmCodeByDmCpt("gb22611", sex);
                    saveCandidateVo.put("r0102", code);
                } else {
                    saveCandidateVo.put("r0102", null);
                }
                // Nation, mapped through code table gb3304.
                String nation = getNation(text);
                if (StrUtil.isNotBlank(nation)) {
                    String code = processMapper.getDmCodeByDmCpt("gb3304", nation);
                    saveCandidateVo.put("r0103", code);
                } else {
                    saveCandidateVo.put("r0103", null);
                }
                // Birthday, normalised by dateFormat.
                String birthday = getBirthday(text);
                if (StrUtil.isNotBlank(birthday)) {
                    birthday = dateFormat(birthday);
                }
                saveCandidateVo.put("r0104", birthday);
                // Political status, mapped through code table gb4762.
                String politicalStatus = getpoliticalStatus(text);
                if (StrUtil.isNotBlank(politicalStatus)) {
                    String code = processMapper.getDmCodeByDmCpt("gb4762", politicalStatus);
                    saveCandidateVo.put("r0105", code);
                } else {
                    saveCandidateVo.put("r0105", null);
                }
                // Phone number: keep digits only.
                String phone = RecA01ServiceImpl.getPhoneNo(text);
                if (StrUtil.isNotBlank(phone)) {
                    phone = phone.replaceAll("\\D", "");
                }
                saveCandidateVo.put("r0114", phone);
                // E-mail: strip embedded whitespace.
                String email = RecA01ServiceImpl.getEmail(text);
                if (StrUtil.isNotBlank(email)) {
                    email = email.replaceAll("\\s", "");
                }
                saveCandidateVo.put("r0115", email);
                // Current address
                saveCandidateVo.put("r0110", getCurrentAddress(text));
                result.add(saveCandidateVo);
            }
            // Duplicate detection by name + phone (see checkUserDuplicate) is not performed here.
        }
        return result;
    }

    /**
     * Looks up duplicate candidates by name and phone number.
     * The query itself is delegated to {@code processMapper.checkUserDuplicate}.
     *
     * @param name       candidate name
     * @param phone      candidate phone number
     * @param flowId     flow id forwarded to the mapper
     * @param positionId position id forwarded to the mapper
     * @return whatever the mapper returns for these arguments
     */
    public List<RepeatVo> checkUserDuplicate(String name, String phone, String flowId, String positionId) {
        final List<RepeatVo> duplicates = processMapper.checkUserDuplicate(name, phone, flowId, positionId);
        return duplicates;
    }

    /**
     * Matches a sex character (男 or 女) that is not surrounded by CJK ideographs
     * (U+4E00–U+9FA5) on both sides; the start and end of the text count as "not CJK".
     */
    private static final Pattern SEX_CHAR_PATTERN =
            Pattern.compile("(?<![\u4E00-\u9FA5])[男女]|[男女](?![\u4E00-\u9FA5])");

    /**
     * Extracts the sex from resume text. Despite its name this method does NOT
     * return a person's name; the name is kept because callers depend on it.
     *
     * <p>The previous pattern {@code [^CJK]男|女[^CJK]} was asymmetric because alternation binds
     * weaker than concatenation: 男 was only found with a non-CJK character before it and 女
     * only with one after it. "性别：女" at the very end of the text and "性别男 " were therefore
     * both missed. Both characters are now treated alike.
     *
     * @param text resume text; must not be {@code null}
     * @return {@code "男"} or {@code "女"} for the first match, or {@code null} when none is found
     */
    public static String getName(String text) {
        Matcher sexMatcher = SEX_CHAR_PATTERN.matcher(text);
        // Lookarounds consume nothing, so the match is exactly the sex character.
        return sexMatcher.find() ? sexMatcher.group() : null;
    }

    /** One to nine CJK ideographs following the label 民族, an optional non-space character and up to six whitespace characters. */
    private static final Pattern NATION_PATTERN =
            Pattern.compile("(?<=民族[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{1,9}");

    /**
     * Extracts the nation (ethnic group) that follows the label 民族 in the text.
     *
     * @param text resume text; must not be {@code null}
     * @return the first match, or {@code null} when the label is not followed by CJK text
     */
    public static String getNation(String text) {
        final Matcher nationMatcher = NATION_PATTERN.matcher(text);
        if (nationMatcher.find()) {
            return nationMatcher.group();
        }
        return null;
    }

    /**
     * A date (four-digit year, one- or two-digit month, optional day, with optional
     * separators) following one of the labels 出生日期, 出生年月 or 生日.
     */
    private static final Pattern BIRTHDAY_PATTERN = Pattern.compile(
            "(?<=(出生日期|出生年月|生日)[\\S]?[\\s]{0,6})\\d{4}(\\s)*(\\-|\\/|\\.|年)?(\\s)*\\d{1,2}(\\s)*(\\-|\\/|\\.|月|⽉)?((\\s)*\\d{1,2}(\\s)*(日|⽇)?)?");

    /**
     * Extracts the raw birthday string that follows a birthday label in the text.
     * The value is returned exactly as matched (separators and whitespace included).
     *
     * @param text resume text; must not be {@code null}
     * @return the first match, or {@code null} when no labelled date is found
     */
    public static String getBirthday(String text) {
        final Matcher birthdayMatcher = BIRTHDAY_PATTERN.matcher(text);
        if (birthdayMatcher.find()) {
            return birthdayMatcher.group();
        }
        return null;
    }

    /** One to nine CJK ideographs following the political-status label (either glyph variant of the label). */
    private static final Pattern POLITICAL_STATUS_PATTERN =
            Pattern.compile("(?<=(政治⾯貌|政治面貌)[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{1,9}");

    /**
     * Extracts the political status that follows the label 政治面貌 in the text.
     *
     * @param text resume text; must not be {@code null}
     * @return the first match, or {@code null} when the label is not followed by CJK text
     */
    public static String getpoliticalStatus(String text) {
        final Matcher statusMatcher = POLITICAL_STATUS_PATTERN.matcher(text);
        if (statusMatcher.find()) {
            return statusMatcher.group();
        }
        return null;
    }

    //时间日期
/*    public static List<String> getDate(String text) {
        String eg = "\\d{4}(\\-|\\/|\\D)\\d{1,2}(\\-|\\/|\\D)(\\d{1,2}(\\D))?";
        Pattern datePattern = Pattern.compile(eg);
        Matcher matcher = datePattern.matcher(text);
        List<String> dateList = new ArrayList<>();
        while (matcher.find()) {
            dateList.add(matcher.group());
        }
        return dateList;
    }*/

    /**
     * Normalises a loosely formatted date to {@code yyyy-MM} or {@code yyyy-MM-dd}.
     *
     * <p>Examples: {@code "19900115" -> "1990-01-15"}, {@code "199001" -> "1990-01"},
     * {@code "1990年1月5日" -> "1990-01-05"}, {@code "1990/1" -> "1990-01"}.
     *
     * <p>Changes from the previous version: the day is zero-padded like the month
     * (it used to yield {@code "1990-01-5"}), runs of separators no longer create empty
     * components, and input with no month part or no digits no longer throws
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param date raw date text; may be {@code null} or blank
     * @return the normalised date; the bare digits when there is no month part;
     *         {@code null} when the input is blank or contains no digits
     */
    private String dateFormat(String date) {
        if (!StrUtil.isNotBlank(date)) {
            return null;
        }
        date = date.trim();
        // Eight plain digits: yyyyMMdd
        if (date.matches("[0-9]{8}")) {
            StringBuilder sb = new StringBuilder(date);
            sb.insert(4, '-');
            sb.insert(7, '-');
            return sb.toString();
        }
        // Drop spaces, turn every run of non-digits into one '-', trim stray dashes at the ends.
        date = date.replaceAll(" ", "").replaceAll("\\D+", "-").replaceAll("^-|-$", "");
        if (date.isEmpty()) {
            return null;
        }
        String[] split = date.split("-");
        // Digits run together after the year, e.g. 202201 => 2022-01
        if (split[0].length() > 4) {
            StringBuilder sb = new StringBuilder(date);
            sb.insert(4, '-');
            date = sb.toString();
            split = date.split("-");
        }
        if (split.length < 2) {
            // Year only: nothing to pad.
            return date;
        }
        // Zero-pad single-digit month and day.
        for (int i = 1; i < split.length; i++) {
            if (split[i].length() == 1) {
                split[i] = "0" + split[i];
            }
        }
        return Joiner.on("-").join(split);
    }

    /** Two to nine CJK ideographs following one of the residence labels. */
    private static final Pattern CURRENT_ADDRESS_PATTERN =
            Pattern.compile("(?<=(现居住地|现居地|居住地|所在地|现所在地)[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{2,9}");

    /**
     * Extracts the current place of residence that follows one of the labels
     * 现居住地, 现居地, 居住地, 所在地 or 现所在地 in the text.
     *
     * @param text resume text; must not be {@code null}
     * @return the first match, or {@code null} when no labelled address is found
     */
    public static String getCurrentAddress(String text) {
        final Matcher addressMatcher = CURRENT_ADDRESS_PATTERN.matcher(text);
        if (addressMatcher.find()) {
            return addressMatcher.group();
        }
        return null;
    }