Java解析上传文件的内容(比如简历信息的解析)

效果应该是这样的(图片借鉴某网站的简历上传)

代码很多,这里记录一下,回头有空了再研究研究

 controller

    /**
     * Stores the uploaded resume files in a per-request temporary folder and hands the
     * stored files to {@code processService.uploadFileAddA01} for parsing.
     *
     * <p>Only the last path segment of each client-supplied file name is used, so an
     * upload cannot escape the temporary folder. Files that could not be stored
     * completely are logged and not passed on to the parser.
     *
     * @param file    uploaded files; {@code null} is treated as "no files"
     * @param request current HTTP request, forwarded to the parsing service
     * @return success wrapper around the list returned by the parsing service
     * @throws BusinessException if storing or parsing fails with an I/O error (or a
     *                           {@code NullPointerException}, kept for compatibility)
     * @throws IOException       declared for interface compatibility; I/O failures inside
     *                           the method are reported as {@code BusinessException}
     */
    @ApiOperation(value = "简历解析", notes = "简历解析")
    @PostMapping(Urls.ResumeSelection.uploadFileAddA01)
    @DataLog(operationName = "简历解析", logType = LogTypeEnum.DATA_LOG, methodType = MethodTypeEnum.UPLOAD_TYPE)
    @ApiOperationSupport(order = 29)
    public JsonObject<Object> uploadFileAddA01(@RequestParam(value = "file") MultipartFile[] file, HttpServletRequest request) throws IOException {
        try {
            userDir = fileConfigProperties.getResources();
            // Base directory for temporary resume uploads.
            //TODO the value is read from the configuration; the property must be added to the config file
            String folderSources = userDir + File.separator + "temp" + File.separator + "rckfile";
            // One sub-folder per request, named after the current timestamp.
            // NOTE(review): the name has second resolution, so two requests arriving within
            // the same second share a folder - confirm whether that is acceptable.
            SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
            String folderPath = folderSources + File.separator + df.format(new Date());
            File rmbFile = new File(folderPath);
            // Create the folder if needed; a failure is surfaced instead of being ignored.
            if (!rmbFile.exists() && !rmbFile.mkdirs() && !rmbFile.isDirectory()) {
                throw new IOException("cannot create upload folder: " + folderPath);
            }

            List<File> fileList = new ArrayList<>();
            for (int i = 0; file != null && i < file.length; i++) {
                String originalName = file[i].getOriginalFilename();
                if (originalName == null) {
                    logger.error("uploaded file #" + i + " has no file name, skipped");
                    continue;
                }
                // The name comes from the client: keep only the last path segment so that
                // values such as "../../x" cannot leave the upload folder.
                int lastSeparator = Math.max(originalName.lastIndexOf('/'), originalName.lastIndexOf('\\'));
                String safeName = originalName.substring(lastSeparator + 1);
                if (safeName.isEmpty() || ".".equals(safeName) || "..".equals(safeName)) {
                    logger.error("uploaded file #" + i + " has an unusable file name, skipped");
                    continue;
                }

                File newPhotoFile = new File(folderPath + File.separator + safeName);
                boolean written = false;
                // try-with-resources closes both streams on every path (success or failure).
                try (InputStream in = file[i].getInputStream();
                     OutputStream out = new FileOutputStream(newPhotoFile)) {
                    written = this.write(in, out);
                } catch (FileNotFoundException e) {
                    logger.error(e.getMessage(), e);
                }
                if (written) {
                    fileList.add(newPhotoFile);
                } else {
                    logger.error("could not store uploaded file " + safeName + ", skipped");
                }
            }

            // Parse the stored files.
            List<Map<String, Object>> result = processService.uploadFileAddA01(fileList, request);
            return new JsonSuccessObject<>(result);
        } catch (IOException | NullPointerException e) {
            // NullPointerException is still caught because callers so far received a
            // BusinessException for it; the cause is now logged with its stack trace.
            logger.error("文件解析失败:" + e.getMessage(), e);
            throw new BusinessException("文件解析失败!");
        }
    }


    /**
     * Copies all bytes from {@code in} to {@code out}.
     *
     * <p>Neither stream is flushed or closed here; that is left to the caller.
     *
     * @param in  source stream
     * @param out destination stream
     * @return {@code true} if the copy completed, {@code false} if an
     *         {@link IOException} occurred (the exception is logged, not rethrown)
     */
    public boolean write(InputStream in, OutputStream out) {
        final int bufferSize = 65536;
        byte[] buffer = new byte[bufferSize];
        try {
            int read;
            while ((read = in.read(buffer)) > -1) {
                out.write(buffer, 0, read);
            }
            return true;
        } catch (IOException e) {
            // Pass the exception itself so the stack trace is not lost.
            logger.error("异常信息:" + e.getMessage(), e);
            return false;
        }
    }

 Service

/**
 * Parses the given resume files into candidate field maps.
 *
 * @param fileList resume files to parse
 * @param request  current HTTP request
 * @return the parsed candidate field maps
 * @throws IOException if reading a file fails
 */
List<Map<String, Object>> uploadFileAddA01(List<File> fileList, HttpServletRequest request) throws IOException;

impl

    /**
     * Extracts the text of every given resume file and derives the candidate's basic
     * fields from it.
     *
     * <p>Supported types, chosen by the (case-insensitive) file extension: doc, docx,
     * pdf, htm/html, png/jpg/jpeg (sent to the OCR service at {@code getocrUrl()}),
     * ppt/pptx. Files of any other type, or files that yield no text, contribute no entry.
     *
     * <p>The duplicate check by name and phone ({@code checkUserDuplicate}) is not
     * applied here.
     *
     * @param fileList resume files to parse
     * @param request  current HTTP request; not used by this implementation
     * @return one field map per file whose extracted text is not blank
     * @throws IOException if a PDF, HTML or PPT file cannot be opened or closed
     */
    @Override
    public List<Map<String, Object>> uploadFileAddA01(List<File> fileList, HttpServletRequest request) throws IOException {
        List<Map<String, Object>> result = new ArrayList<>();
        for (File f : fileList) {
            String text = extractResumeText(f);
            if (StrUtil.isNotBlank(text)) {
                result.add(buildCandidateFields(text));
            }
        }
        return result;
    }

    /** Picks the text extractor that matches the file extension; returns "" for unknown types. */
    private String extractResumeText(File f) throws IOException {
        String fileName = f.getName();
        // Lower-cased so that "CV.PDF" or "Resume.DOCX" are recognised as well.
        String suffix = fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase(java.util.Locale.ROOT);
        switch (suffix) {
            case "doc":
                return readDocText(f);
            case "docx":
                return readDocxText(f);
            case "pdf":
                return readPdfText(f);
            case "htm":
            case "html":
                return readHtmlText(f);
            case "png":
            case "jpg":
            case "jpeg":
                return readImageTextViaOcr(f);
            case "ppt":
            case "pptx":
                return readPptText(f);
            default:
                return "";
        }
    }

    /** Reads the text of a binary Word (.doc) file; failures are logged and yield "". */
    private String readDocText(File f) {
        String text = "";
        // The file stream is closed even when the extractor cannot be created.
        try (InputStream is = new FileInputStream(f)) {
            WordExtractor extractor = new WordExtractor(is);
            try {
                text = extractor.getText();
            } finally {
                extractor.close();
            }
        } catch (Exception e) {
            logger.error("异常信息:" + e.getMessage(), e);
        }
        return text;
    }

    /** Reads the text of a Word (.docx) file; failures are logged and yield "". */
    private String readDocxText(File f) {
        String text = "";
        try {
            OPCPackage opcPackage = POIXMLDocument.openPackage(f.getPath());
            POIXMLTextExtractor extractor = null;
            try {
                extractor = new XWPFWordExtractor(opcPackage);
                text = extractor.getText();
            } finally {
                if (extractor != null) {
                    extractor.close();
                } else {
                    // The extractor could not be created: release the package without
                    // writing anything back to the file.
                    opcPackage.revert();
                }
            }
        } catch (Exception e) {
            logger.error("异常信息:" + e.getMessage(), e);
        }
        return text;
    }

    /**
     * Reads a PDF page by page. Documents with more than 50 pages are not read and
     * yield "". Read errors are logged; text collected up to that point is kept.
     */
    private String readPdfText(File f) throws IOException {
        StringBuilder text = new StringBuilder();
        PDDocument document = null;
        try {
            document = PDDocument.load(f);
            int pageSize = document.getNumberOfPages();
            if (pageSize <= 50) {
                for (int page = 1; page <= pageSize; page++) {
                    PDFTextStripper stripper = new PDFTextStripper();
                    // Emit the text in positional order.
                    stripper.setSortByPosition(true);
                    stripper.setStartPage(page);
                    stripper.setEndPage(page);
                    text.append(stripper.getText(document));
                }
            }
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        } finally {
            if (document != null) {
                document.close();
            }
        }
        return text.toString();
    }

    /**
     * Reads an HTML file as UTF-8 and returns its lines concatenated without separators.
     * A missing file is logged and yields ""; other I/O errors propagate.
     */
    private String readHtmlText(File f) throws IOException {
        StringBuilder html = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(f), "utf-8"))) {
            String line;
            // readLine() == null is the end-of-file signal; ready() only says whether a
            // read would block.
            while ((line = br.readLine()) != null) {
                html.append(line);
            }
        } catch (FileNotFoundException e) {
            logger.error("异常信息:" + e.getMessage(), e);
        }
        return html.toString();
    }

    /**
     * Posts the image to the OCR service and concatenates every recognised "text" value
     * of the response. Failures are logged and yield the text collected so far.
     */
    private String readImageTextViaOcr(File f) {
        StringBuilder text = new StringBuilder();
        HttpClient client = new HttpClient();
        PostMethod postMethod = new PostMethod(getocrUrl());
        try {
            // FilePart wraps the file to upload; multipart requests are wrapped in a
            // MultipartRequestEntity.
            Part[] parts = {new FilePart("file", f)};
            postMethod.setRequestEntity(new MultipartRequestEntity(parts, postMethod.getParams()));
            // The file may be large, so the connection timeout is raised.
            client.getHttpConnectionManager().getParams().setConnectionTimeout(50000);
            int status = client.executeMethod(postMethod);
            if (status == HttpStatus.SC_OK) {
                try (InputStream inputStream = postMethod.getResponseBodyAsStream()) {
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    if (inputStream != null) {
                        byte[] buffer = new byte[1024];
                        int len;
                        while ((len = inputStream.read(buffer)) != -1) {
                            baos.write(buffer, 0, len);
                        }
                    }
                    // Decode explicitly instead of using the platform default charset.
                    // NOTE(review): assumes the OCR service answers with UTF-8 JSON - confirm.
                    String json = baos.toString("UTF-8");
                    // Collect what sits between "text": " and ", "text_region".
                    Matcher m = Pattern.compile("(\"text\": \")(.*?)(\", \"text_region\")").matcher(json);
                    while (m.find()) {
                        text.append(m.group(2));
                    }
                }
            }
        } catch (Exception e) {
            logger.error("异常信息:" + e.getMessage(), e);
        } finally {
            // Release the connection.
            postMethod.releaseConnection();
        }
        return text.toString();
    }

    /**
     * Concatenates the text runs of all shapes on all slides. Parse failures are logged
     * and yield "".
     */
    private String readPptText(File f) throws IOException {
        String text = "";
        try (FileInputStream in = new FileInputStream(f.getPath())) {
            try {
                // NOTE(review): XMLSlideShow looks like the OOXML (.pptx) reader; legacy
                // binary .ppt files presumably fail here and end up with "" - confirm.
                XMLSlideShow xmlSlideShow = new XMLSlideShow(in);
                try {
                    StringBuilder sb = new StringBuilder();
                    for (XSLFSlide slide : xmlSlideShow.getSlides()) {
                        CTSlide rawSlide = slide.getXmlObject();
                        CTGroupShape gs = rawSlide.getCSld().getSpTree();
                        for (CTShape shape : gs.getSpArray()) {
                            CTTextBody tb = shape.getTxBody();
                            if (null == tb) {
                                continue;
                            }
                            for (CTTextParagraph textParagraph : tb.getPArray()) {
                                for (CTRegularTextRun textRun : textParagraph.getRArray()) {
                                    sb.append(textRun.getT());
                                }
                            }
                        }
                    }
                    text = sb.toString();
                } finally {
                    xmlSlideShow.close();
                }
            } catch (Exception e) {
                logger.error("异常信息:" + e.getMessage(), e);
            }
        }
        return text;
    }

    /** Derives the candidate field map (keys r0101 ... r0115) from the resume text. */
    private Map<String, Object> buildCandidateFields(String text) {
        Map<String, Object> saveCandidateVo = new HashMap<>();

        // Remove whitespace inside (and directly after) the first occurrence of each keyword.
        text = text.replaceFirst("姓[\\s]*名[\\s]*", "姓名").replaceFirst("性[\\s]*别[\\s]*", "性别")
                .replaceFirst("民[\\s]*族[\\s]*", "民族").replaceFirst("现[\\s]*居[\\s]*住[\\s]*地[\\s]*", "现居住地")
                .replaceFirst("现[\\s]*居[\\s]*地[\\s]*", "现居地").replaceFirst("居[\\s]*住[\\s]*地[\\s]*", "居住地")
                .replaceFirst("所[\\s]*在[\\s]*地[\\s]*", "所在地").replaceFirst("现[\\s]*所[\\s]*在[\\s]*地[\\s]*", "现所在地")
                .replaceFirst("生[\\s]*日[\\s]*", "生日");

        // Name.
        saveCandidateVo.put("r0101", RecA01ServiceImpl.getName(text));
        // Sex, mapped through code table gb22611.
        // NOTE(review): this class's own getName(...) extracts the sex, while the name above
        // comes from RecA01ServiceImpl.getName(...) - confirm these are different methods.
        saveCandidateVo.put("r0102", lookupDmCode("gb22611", getName(text)));
        // Nation, mapped through code table gb3304.
        saveCandidateVo.put("r0103", lookupDmCode("gb3304", getNation(text)));
        // Birthday, normalised by dateFormat.
        String birthday = getBirthday(text);
        if (StrUtil.isNotBlank(birthday)) {
            birthday = dateFormat(birthday);
        }
        saveCandidateVo.put("r0104", birthday);
        // Political status, mapped through code table gb4762.
        saveCandidateVo.put("r0105", lookupDmCode("gb4762", getpoliticalStatus(text)));
        // Phone number, digits only.
        String phone = RecA01ServiceImpl.getPhoneNo(text);
        if (StrUtil.isNotBlank(phone)) {
            phone = phone.replaceAll("\\D", "");
        }
        saveCandidateVo.put("r0114", phone);
        // E-mail, whitespace removed.
        String email = RecA01ServiceImpl.getEmail(text);
        if (StrUtil.isNotBlank(email)) {
            email = email.replaceAll("\\s", "");
        }
        saveCandidateVo.put("r0115", email);
        // Current address.
        saveCandidateVo.put("r0110", getCurrentAddress(text));
        return saveCandidateVo;
    }

    /** Returns the code for {@code caption} in the given code table, or null when the caption is blank. */
    private String lookupDmCode(String codeTable, String caption) {
        if (StrUtil.isNotBlank(caption)) {
            return processMapper.getDmCodeByDmCpt(codeTable, caption);
        }
        return null;
    }

    /**
     * Duplicate check by name and phone number: delegates to
     * {@code processMapper.checkUserDuplicate} with the arguments unchanged.
     *
     * @param name       candidate name
     * @param phone      candidate phone number
     * @param flowId     passed through to the mapper
     * @param positionId passed through to the mapper
     * @return whatever the mapper returns for these arguments
     */
    public List<RepeatVo> checkUserDuplicate(String name, String phone, String flowId, String positionId) {
        List<RepeatVo> duplicates = processMapper.checkUserDuplicate(name, phone, flowId, positionId);
        return duplicates;
    }

    /**
     * Extracts the sex marker from resume text. Despite its name, this method does not
     * extract a person's name.
     *
     * <p>Returns the first "男" that is not preceded by a CJK ideograph, or the first "女"
     * that is not followed by one, whichever comes first. Lookarounds are used so that
     * the start and the end of the text count as boundaries too.
     *
     * @param text resume text; may be {@code null}
     * @return "男" or "女", or {@code null} when no marker is found
     */
    public static String getName(String text) {
        if (text == null) {
            return null;
        }
        Matcher sexMatcher = Pattern
                .compile("(?<![\u4E00-\u9FA5])男|女(?![\u4E00-\u9FA5])")
                .matcher(text);
        return sexMatcher.find() ? sexMatcher.group() : null;
    }

    /**
     * Extracts the nation: the first run of 1-9 CJK ideographs that follows "民族",
     * optionally separated by one non-space character and up to six whitespace characters.
     *
     * @param text resume text
     * @return the matched ideographs, or {@code null} when nothing matches
     */
    public static String getNation(String text) {
        final String nationRegex = "(?<=民族[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{1,9}";
        Matcher nationMatcher = Pattern.compile(nationRegex).matcher(text);
        if (!nationMatcher.find()) {
            return null;
        }
        return nationMatcher.group();
    }

    /**
     * Extracts the birthday: the first date-like token (four-digit year, month, optional
     * day, with optional separators) that follows "出生日期", "出生年月" or "生日".
     *
     * @param text resume text
     * @return the raw matched date text, or {@code null} when nothing matches
     */
    public static String getBirthday(String text) {
        final String birthdayRegex = "(?<=(出生日期|出生年月|生日)[\\S]?[\\s]{0,6})\\d{4}(\\s)*(\\-|\\/|\\.|年)?(\\s)*\\d{1,2}(\\s)*(\\-|\\/|\\.|月|⽉)?((\\s)*\\d{1,2}(\\s)*(日|⽇)?)?";
        Matcher birthdayMatcher = Pattern.compile(birthdayRegex).matcher(text);
        return birthdayMatcher.find() ? birthdayMatcher.group() : null;
    }

    /**
     * Extracts the political status: the first run of 1-9 CJK ideographs that follows
     * the keyword (either spelling listed in the pattern), optionally separated by one
     * non-space character and up to six whitespace characters.
     *
     * @param text resume text
     * @return the matched ideographs, or {@code null} when nothing matches
     */
    public static String getpoliticalStatus(String text) {
        final String statusRegex = "(?<=(政治⾯貌|政治面貌)[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{1,9}";
        Matcher statusMatcher = Pattern.compile(statusRegex).matcher(text);
        if (statusMatcher.find()) {
            return statusMatcher.group();
        }
        return null;
    }

    //时间日期
/*    public static List<String> getDate(String text) {
        String eg = "\\d{4}(\\-|\\/|\\D)\\d{1,2}(\\-|\\/|\\D)(\\d{1,2}(\\D))?";
        Pattern datePattern = Pattern.compile(eg);
        Matcher matcher = datePattern.matcher(text);
        List<String> dateList = new ArrayList<>();
        while (matcher.find()) {
            dateList.add(matcher.group());
        }
        return dateList;
    }*/

    /**
     * Normalises a loosely formatted date to {@code yyyy-MM} or {@code yyyy-MM-dd}.
     *
     * <p>Handled inputs include {@code 19900115}, {@code 199001}, {@code 1990/1/5},
     * {@code 1990.12.5} and {@code 1990年5月15日}. Month and day are both zero-padded
     * to two digits; runs of separators and any trailing non-digits are ignored.
     *
     * @param date raw date text; may be {@code null} or blank
     * @return the normalised date, or {@code null} when the input is blank or contains
     *         no digits
     */
    private String dateFormat(String date) {
        if (!StrUtil.isNotBlank(date)) {
            return null;
        }
        date = date.trim();
        // Eight plain digits: yyyyMMdd -> yyyy-MM-dd.
        if (date.matches("[0-9]{8}")) {
            return date.substring(0, 4) + "-" + date.substring(4, 6) + "-" + date.substring(6);
        }
        // Drop spaces, turn every run of non-digits into a single '-', and strip a leading
        // or trailing '-' so that no empty segment survives.
        String normalized = date.replaceAll(" ", "").replaceAll("\\D+", "-").replaceAll("^-|-$", "");
        if (normalized.isEmpty()) {
            return null;
        }
        String[] parts = normalized.split("-");
        // Year and month written without a separator: 202201 -> 2022-01.
        if (parts[0].length() > 4) {
            normalized = normalized.substring(0, 4) + "-" + normalized.substring(4);
            parts = normalized.split("-");
        }
        // Zero-pad single-digit month and day.
        StringBuilder sb = new StringBuilder(parts[0]);
        for (int i = 1; i < parts.length; i++) {
            sb.append('-');
            if (parts[i].length() == 1) {
                sb.append('0');
            }
            sb.append(parts[i]);
        }
        return sb.toString();
    }

    /**
     * Extracts the current address: the first run of 2-9 CJK ideographs that follows one
     * of the residence keywords listed in the pattern, optionally separated by one
     * non-space character and up to six whitespace characters.
     *
     * @param text resume text
     * @return the matched ideographs, or {@code null} when nothing matches
     */
    public static String getCurrentAddress(String text) {
        final String addressRegex = "(?<=(现居住地|现居地|居住地|所在地|现所在地)[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{2,9}";
        Matcher addressMatcher = Pattern.compile(addressRegex).matcher(text);
        if (!addressMatcher.find()) {
            return null;
        }
        return addressMatcher.group();
    }

 

Java代码实现智能简历解析系统,需要用到自然语言处理(NLP)和机器学习技术,主要步骤如下:

1. 数据采集和预处理:从各种渠道收集大量的简历数据,并进行数据清洗和预处理,去除无用信息,提取关键信息,例如姓名、联系方式、教育经历、工作经历等。
2. 特征提取:根据预处理后的数据,提取出重要的特征,例如学历、专业、工作年限、技能等,并将其转化为数值型特征,以便于机器学习算法处理。
3. 模型训练和选择:选择合适的机器学习算法,例如决策树、随机森林、逻辑回归等,根据提取的特征对模型进行训练,以实现从简历中提取关键信息的目的。
4. 简历解析:使用训练好的模型对新的简历进行解析,提取出相应的信息,例如姓名、联系方式、教育经历、工作经历等,并将其存储到数据库中。
5. 智能推荐:根据解析后的信息,结合职位需求,进行匹配和推荐合适的候选人,以提高招聘效率和准确性。

下面是一个简单的Java实现示例:

```java
// 数据预处理
public class ResumePreprocessor {
    public static Resume preprocess(String resumeText) {
        // 去除非文字信息,例如图片、表格等
        String text = removeNonTextInformation(resumeText);
        // 提取关键信息,例如姓名、联系方式、教育经历、工作经历等
        String name = extractName(text);
        String email = extractEmail(text);
        List<Education> educationList = extractEducation(text);
        List<Experience> experienceList = extractExperience(text);
        // 构建简历对象
        Resume resume = new Resume(name, email, educationList, experienceList);
        return resume;
    }
}

// 特征提取
public class ResumeFeatureExtractor {
    public static Map<String, Double> extractFeatures(Resume resume) {
        Map<String, Double> featureMap = new HashMap<>();
        // 提取学历、专业、工作年限、技能等特征,并转化为数值型特征
        double educationLevel = extractEducationLevel(resume);
        double workYears = extractWorkYears(resume);
        double skillLevel = extractSkillLevel(resume);
        featureMap.put("educationLevel", educationLevel);
        featureMap.put("workYears", workYears);
        featureMap.put("skillLevel", skillLevel);
        return featureMap;
    }
}

// 模型训练和选择
public class ResumeClassifier {
    private static final String MODEL_FILE = "resume_classifier.model";
    private static final List<String> FEATURE_LIST = Arrays.asList("educationLevel", "workYears", "skillLevel");
    private DecisionTreeModel model;

    public ResumeClassifier() {
        // 加载训练好的模型
        model = DecisionTreeModel.load(MODEL_FILE);
    }

    public boolean classify(Resume resume) {
        // 提取特征
        Map<String, Double> featureMap = ResumeFeatureExtractor.extractFeatures(resume);
        List<LabeledPoint> data = new ArrayList<>();
        // 将特征转化为LabeledPoint类型,以便于模型预测
        LabeledPoint labeledPoint = new LabeledPoint(1.0,
                Vectors.dense(
                        featureMap.get("educationLevel"),
                        featureMap.get("workYears"),
                        featureMap.get("skillLevel")
                ));
        data.add(labeledPoint);
        Dataset<Row> testData = SparkUtils.spark().createDataFrame(data, LabeledPoint.class).toDF();
        // 使用模型进行预测
        double prediction = model.predict(testData.head().features());
        return prediction == 1.0;
    }
}

// 简历解析
public class ResumeParser {
    private static final String RESUME_FOLDER = "resumes";
    private static final String DATABASE_URL = "jdbc:mysql://localhost:3306/resume_db";
    private static final String USERNAME = "root";
    private static final String PASSWORD = "password";
    private static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";
    private static final String INSERT_SQL = "INSERT INTO resume (name, email, education, experience) VALUES (?, ?, ?, ?)";

    public static void main(String[] args) {
        // 获取简历文件列表
        List<File> resumeFiles = getResumeFiles(RESUME_FOLDER);
        // 初始化数据库连接
        try (Connection conn = DriverManager.getConnection(DATABASE_URL, USERNAME, PASSWORD)) {
            // 解析每个简历文件,并将其存储到数据库中
            for (File resumeFile : resumeFiles) {
                String resumeText = FileUtils.readFileToString(resumeFile, StandardCharsets.UTF_8);
                Resume resume = ResumePreprocessor.preprocess(resumeText);
                if (resume != null) {
                    boolean isQualified = new ResumeClassifier().classify(resume);
                    if (isQualified) {
                        PreparedStatement stmt = conn.prepareStatement(INSERT_SQL);
                        stmt.setString(1, resume.getName());
                        stmt.setString(2, resume.getEmail());
                        stmt.setString(3, gson.toJson(resume.getEducationList()));
                        stmt.setString(4, gson.toJson(resume.getExperienceList()));
                        stmt.executeUpdate();
                    }
                }
            }
        } catch (SQLException | IOException e) {
            e.printStackTrace();
        }
    }
}

// 智能推荐
public class CandidateMatcher {
    private static final String DATABASE_URL = "jdbc:mysql://localhost:3306/resume_db";
    private static final String USERNAME = "root";
    private static final String PASSWORD = "password";
    private static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";
    private static final String SELECT_SQL = "SELECT * FROM resume WHERE education LIKE ? AND experience LIKE ?";

    public static List<Resume> findMatches(String education, String experience) {
        List<Resume> matches = new ArrayList<>();
        // 初始化数据库连接
        try (Connection conn = DriverManager.getConnection(DATABASE_URL, USERNAME, PASSWORD)) {
            PreparedStatement stmt = conn.prepareStatement(SELECT_SQL);
            stmt.setString(1, "%" + education + "%");
            stmt.setString(2, "%" + experience + "%");
            ResultSet rs = stmt.executeQuery();
            // 将匹配的简历转化为对象
            while (rs.next()) {
                String name = rs.getString("name");
                String email = rs.getString("email");
                String educationJson = rs.getString("education");
                String experienceJson = rs.getString("experience");
                List<Education> educationList = gson.fromJson(educationJson, new TypeToken<List<Education>>() {}.getType());
                List<Experience> experienceList = gson.fromJson(experienceJson, new TypeToken<List<Experience>>() {}.getType());
                Resume resume = new Resume(name, email, educationList, experienceList);
                matches.add(resume);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return matches;
    }
}
```

以上代码仅供参考,实际实现需要根据具体需求进行调整和优化。
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值