效果应该是这样的(图片借鉴某网站的简历上传)
代码很多,这里记录一下,回头有空了再研究研究
controller
/**
 * Resume-parsing upload endpoint.
 *
 * <p>Stores every uploaded part in a fresh, timestamp-named folder below
 * {@code <resources>/temp/rckfile}, then hands the stored files to
 * {@code processService.uploadFileAddA01} and wraps its result in a
 * {@code JsonSuccessObject}.
 *
 * <p>Only files that were copied to disk completely are forwarded to the parser.
 *
 * @param file    uploaded multipart files; {@code null} or empty means nothing is parsed
 * @param request current HTTP request, passed through to the service
 * @return a success wrapper around the list of extracted candidate field maps
 * @throws IOException declared for interface compatibility; I/O failures inside the method
 *                     are logged and rethrown as {@code BusinessException}
 */
@ApiOperation(value = "简历解析", notes = "简历解析")
@PostMapping(Urls.ResumeSelection.uploadFileAddA01)
@DataLog(operationName = "简历解析", logType = LogTypeEnum.DATA_LOG, methodType = MethodTypeEnum.UPLOAD_TYPE)
@ApiOperationSupport(order = 29)
public JsonObject<Object> uploadFileAddA01(@RequestParam(value = "file") MultipartFile[] file, HttpServletRequest request) throws IOException {
    try {
        userDir = fileConfigProperties.getResources();
        // Directory that receives the uploads.
        // TODO: the value comes from the configuration file; the property has to be added there.
        String folderSources = userDir + File.separator + "temp" + File.separator + "rckfile";
        // The current timestamp is used as the folder name.
        SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
        String folderName = df.format(new Date());
        String folderPath = folderSources + File.separator + folderName;
        File rmbFile = new File(folderPath);
        // mkdirs() returns false both on failure and when the folder already exists
        // (e.g. a second request within the same second), hence the isDirectory() check.
        if (!rmbFile.mkdirs() && !rmbFile.isDirectory()) {
            throw new IOException("无法创建上传目录:" + folderPath);
        }
        List<File> fileList = new ArrayList<>();
        for (int i = 0; file != null && i < file.length; i++) {
            String name = file[i].getOriginalFilename();
            if (name == null) {
                continue;
            }
            // The original file name is client-controlled: keep only the last path segment
            // so that values such as "../../x" cannot escape the upload folder.
            int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
            name = name.substring(lastSeparator + 1);
            if (name.isEmpty() || ".".equals(name) || "..".equals(name)) {
                continue;
            }
            File newPhotoFile = new File(rmbFile, name);
            boolean written = false;
            // try-with-resources closes both streams on every path, including a failed copy.
            try (InputStream in = file[i].getInputStream();
                 OutputStream out = new FileOutputStream(newPhotoFile)) {
                written = this.write(in, out);
            } catch (FileNotFoundException e) {
                logger.error(e.getMessage(), e);
            }
            // A truncated file would only produce garbage downstream, so it is not parsed.
            if (written) {
                fileList.add(newPhotoFile);
            }
        }
        // Parse the stored files.
        List<Map<String, Object>> result = processService.uploadFileAddA01(fileList, request);
        return new JsonSuccessObject<>(result);
    } catch (IOException | NullPointerException e) {
        logger.error("文件解析失败:" + e.getMessage(), e);
        throw new BusinessException("文件解析失败!");
    }
}
/**
 * Copies everything readable from {@code in} to {@code out} using a 64 KiB buffer.
 *
 * <p>Neither stream is closed here; that is left to the caller.
 *
 * @param in  source stream
 * @param out destination stream
 * @return {@code true} when the whole stream was copied, {@code false} when an
 *         {@link IOException} interrupted the copy (the exception is logged, not rethrown)
 */
public boolean write(InputStream in, OutputStream out) {
    final int bufferSize = 65536;
    byte[] buffer = new byte[bufferSize];
    try {
        int read;
        while ((read = in.read(buffer)) != -1) {
            out.write(buffer, 0, read);
        }
        return true;
    } catch (IOException e) {
        // Pass the exception itself so the stack trace is not lost.
        logger.error("异常信息:" + e.getMessage(), e);
        return false;
    }
}
Service
/**
 * Parses the given resume files and returns the candidate fields extracted from them.
 *
 * @param fileList resume files already stored on disk
 * @param request  current HTTP request
 * @return one map of extracted fields per parsed file
 * @throws IOException if reading a file fails
 */
List<Map<String, Object>> uploadFileAddA01(List<File> fileList, HttpServletRequest request) throws IOException;
impl
/**
 * Extracts plain text from every resume file and derives candidate fields from it.
 *
 * <p>Supported suffixes (matched case-insensitively): doc, docx, pdf, htm, html, png, jpg,
 * jpeg, ppt, pptx. Files whose text is blank (unsupported type, extraction failure, PDF with
 * more than 50 pages) contribute no entry to the result.
 *
 * <p>Keys of each result map: {@code r0101} name, {@code r0102} gender code (table gb22611),
 * {@code r0103} nation code (gb3304), {@code r0104} birthday, {@code r0105} political status
 * code (gb4762), {@code r0110} current address, {@code r0114} phone, {@code r0115} email.
 *
 * @param fileList resume files stored on disk; {@code null} yields an empty result
 * @param request  current HTTP request (not used by this implementation)
 * @return one field map per file from which text could be extracted
 * @throws IOException if opening or closing an HTML, PDF or PPT file fails
 */
@Override
public List<Map<String, Object>> uploadFileAddA01(List<File> fileList, HttpServletRequest request) throws IOException {
    List<Map<String, Object>> result = new ArrayList<>();
    if (fileList == null) {
        return result;
    }
    for (File f : fileList) {
        String text = extractResumeText(f);
        if (StrUtil.isNotBlank(text)) {
            result.add(buildCandidateFields(text));
        }
    }
    return result;
}

/** Dispatches on the (lower-cased) file suffix and returns the extracted text, or "" if none. */
private String extractResumeText(File f) throws IOException {
    String fileName = f.getName();
    // Lower-case so that "CV.PDF" or "resume.DOCX" are recognised as well.
    String suffix = fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase(java.util.Locale.ROOT);
    if ("doc".equals(suffix) || "docx".equals(suffix)) {
        return readWordText(f, suffix);
    }
    if ("pdf".equals(suffix)) {
        return readPdfText(f);
    }
    if ("htm".equals(suffix) || "html".equals(suffix)) {
        return readHtmlText(f);
    }
    if ("png".equals(suffix) || "jpg".equals(suffix) || "jpeg".equals(suffix)) {
        return readImageTextByOcr(f);
    }
    if ("ppt".equals(suffix) || "pptx".equals(suffix)) {
        return readPptText(f);
    }
    return "";
}

/** Reads the text of a Word document; any failure is logged and yields "". */
private String readWordText(File f, String suffix) {
    try {
        if ("doc".equals(suffix)) {
            // The stream is closed on every path, even when the extractor cannot be created.
            try (InputStream is = new FileInputStream(f)) {
                WordExtractor re = new WordExtractor(is);
                try {
                    return re.getText();
                } finally {
                    re.close();
                }
            }
        }
        OPCPackage opcPackage = POIXMLDocument.openPackage(f.getPath());
        POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
        try {
            return extractor.getText();
        } finally {
            extractor.close();
        }
    } catch (Exception e1) {
        logger.error("异常信息:" + e1.getMessage(), e1);
        return "";
    }
}

/** Reads a PDF page by page; documents with more than 50 pages are skipped and yield "". */
private String readPdfText(File f) throws IOException {
    StringBuilder pdfText = new StringBuilder();
    PDDocument document = null;
    try {
        document = PDDocument.load(f);
        int pageSize = document.getNumberOfPages();
        if (pageSize <= 50) {
            for (int i = 0; i < pageSize; i++) {
                PDFTextStripper stripper = new PDFTextStripper();
                // Emit the text in reading order.
                stripper.setSortByPosition(true);
                stripper.setStartPage(i + 1);
                stripper.setEndPage(i + 1);
                pdfText.append(stripper.getText(document));
            }
        }
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    } finally {
        if (document != null) {
            document.close();
        }
    }
    return pdfText.toString();
}

/** Reads an HTML file as UTF-8 and returns its lines concatenated without separators. */
private String readHtmlText(File f) throws IOException {
    StringBuilder htmlSb = new StringBuilder();
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "utf-8"))) {
        String line;
        // readLine() == null is the reliable end-of-file signal; ready() is only a hint.
        while ((line = br.readLine()) != null) {
            htmlSb.append(line);
        }
    } catch (FileNotFoundException e) {
        logger.error("异常信息:" + e.getMessage(), e);
    }
    return htmlSb.toString();
}

/** Posts an image to the OCR service and concatenates the recognised "text" values. */
private String readImageTextByOcr(File f) {
    StringBuilder ocrText = new StringBuilder();
    HttpClient client = new HttpClient();
    PostMethod postMethod = new PostMethod(getocrUrl());
    try {
        // FilePart uploads the file under the form field name "file".
        FilePart fp = new FilePart("file", f);
        Part[] parts = {fp};
        MultipartRequestEntity mre = new MultipartRequestEntity(parts, postMethod.getParams());
        postMethod.setRequestEntity(mre);
        // The upload may be large, so allow a generous connection timeout.
        client.getHttpConnectionManager().getParams().setConnectionTimeout(50000);
        int status = client.executeMethod(postMethod);
        if (status == HttpStatus.SC_OK) {
            String json = "";
            try (InputStream inputStream = postMethod.getResponseBodyAsStream();
                 ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
                if (inputStream != null) {
                    byte[] buffer = new byte[1024];
                    int len;
                    while ((len = inputStream.read(buffer)) != -1) {
                        baos.write(buffer, 0, len);
                    }
                    // Decode explicitly as UTF-8 instead of the platform default charset,
                    // which would garble Chinese text on non-UTF-8 hosts.
                    json = baos.toString("UTF-8");
                }
            }
            // Pick what sits between "text": " and ", "text_region" in the response.
            String regEx = "(\"text\": \")(.*?)(\", \"text_region\")";
            Matcher m = Pattern.compile(regEx).matcher(json);
            while (m.find()) {
                ocrText.append(m.group(2));
            }
        }
    } catch (Exception e) {
        logger.error("异常信息:" + e.getMessage(), e);
    } finally {
        // Release the connection.
        postMethod.releaseConnection();
    }
    return ocrText.toString();
}

/**
 * Reads the text runs of all shapes on all slides; parsing failures are logged and yield "".
 * NOTE(review): XMLSlideShow is used for both suffixes; a legacy binary .ppt presumably fails
 * here and ends up in the error log - confirm.
 */
private String readPptText(File f) throws IOException {
    String pptText = "";
    try (FileInputStream in = new FileInputStream(f.getPath())) {
        try {
            XMLSlideShow xmlSlideShow = new XMLSlideShow(in);
            try {
                StringBuilder sb = new StringBuilder();
                for (XSLFSlide slide : xmlSlideShow.getSlides()) {
                    CTSlide rawSlide = slide.getXmlObject();
                    CTGroupShape gs = rawSlide.getCSld().getSpTree();
                    for (CTShape shape : gs.getSpArray()) {
                        CTTextBody tb = shape.getTxBody();
                        if (null == tb) {
                            continue;
                        }
                        for (CTTextParagraph textParagraph : tb.getPArray()) {
                            for (CTRegularTextRun textRun : textParagraph.getRArray()) {
                                sb.append(textRun.getT());
                            }
                        }
                    }
                }
                pptText = sb.toString();
            } finally {
                xmlSlideShow.close();
            }
        } catch (Exception e) {
            logger.error("异常信息:" + e.getMessage(), e);
        }
    }
    return pptText;
}

/** Derives the candidate field map (keys r0101 ... r0115) from the extracted resume text. */
private Map<String, Object> buildCandidateFields(String rawText) {
    Map<String, Object> saveCandidateVo = new HashMap<>();
    // Remove whitespace inside (and right after) the field labels so the extractors can find them.
    String text = rawText.replaceFirst("姓[\\s]*名[\\s]*", "姓名").replaceFirst("性[\\s]*别[\\s]*", "性别")
            .replaceFirst("民[\\s]*族[\\s]*", "民族").replaceFirst("现[\\s]*居[\\s]*住[\\s]*地[\\s]*", "现居住地")
            .replaceFirst("现[\\s]*居[\\s]*地[\\s]*", "现居地").replaceFirst("居[\\s]*住[\\s]*地[\\s]*", "居住地")
            .replaceFirst("所[\\s]*在[\\s]*地[\\s]*", "所在地").replaceFirst("现[\\s]*所[\\s]*在[\\s]*地[\\s]*", "现所在地")
            .replaceFirst("生[\\s]*日[\\s]*", "生日");
    // Name.
    saveCandidateVo.put("r0101", RecA01ServiceImpl.getName(text));
    // Gender, mapped through code table gb22611. Despite its name, this class's own
    // static getName(String) extracts the gender character.
    saveCandidateVo.put("r0102", lookupDmCode("gb22611", getName(text)));
    // Nation, mapped through code table gb3304.
    saveCandidateVo.put("r0103", lookupDmCode("gb3304", getNation(text)));
    // Birthday, normalised by dateFormat.
    String birthday = getBirthday(text);
    if (StrUtil.isNotBlank(birthday)) {
        birthday = dateFormat(birthday);
    }
    saveCandidateVo.put("r0104", birthday);
    // Political status, mapped through code table gb4762.
    saveCandidateVo.put("r0105", lookupDmCode("gb4762", getpoliticalStatus(text)));
    // Phone number, digits only.
    String phone = RecA01ServiceImpl.getPhoneNo(text);
    if (StrUtil.isNotBlank(phone)) {
        phone = phone.replaceAll("\\D", "");
    }
    saveCandidateVo.put("r0114", phone);
    // E-mail address without whitespace.
    String email = RecA01ServiceImpl.getEmail(text);
    if (StrUtil.isNotBlank(email)) {
        email = email.replaceAll("\\s", "");
    }
    saveCandidateVo.put("r0115", email);
    // Current address.
    saveCandidateVo.put("r0110", getCurrentAddress(text));
    return saveCandidateVo;
}

/** Returns the code for {@code caption} in the given code table, or null when caption is blank. */
private String lookupDmCode(String codeTable, String caption) {
    if (StrUtil.isNotBlank(caption)) {
        return processMapper.getDmCodeByDmCpt(codeTable, caption);
    }
    return null;
}
/**
 * Duplicate check by name plus phone number; delegates to
 * {@code processMapper.checkUserDuplicate} with the same four arguments.
 *
 * @param name       candidate name
 * @param phone      candidate phone number
 * @param flowId     flow id passed through to the mapper
 * @param positionId position id passed through to the mapper
 * @return whatever the mapper returns for these arguments
 */
public List<RepeatVo> checkUserDuplicate(String name, String phone, String flowId, String positionId) {
    List<RepeatVo> duplicates = processMapper.checkUserDuplicate(name, phone, flowId, positionId);
    return duplicates;
}
/**
 * Extracts the gender character from resume text. (The method name is historical:
 * it returns the gender, not the person's name.)
 *
 * <p>The first occurrence of either of the following wins:
 * <ul>
 *   <li>"男" that is not preceded by a CJK ideograph (U+4E00..U+9FA5), or</li>
 *   <li>"女" that is not followed by a CJK ideograph.</li>
 * </ul>
 * Start and end of the text count as "no ideograph", so a gender that is the very first
 * or very last character of the text is recognised as well.
 *
 * @param text resume text; may be {@code null}
 * @return "男" or "女", or {@code null} if nothing matched
 */
public static String getName(String text) {
    if (text == null || text.isEmpty()) {
        return null;
    }
    // Look-arounds instead of consuming the neighbour: this also matches at the text
    // boundaries and makes the match itself the wanted character.
    String eg = "(?<![\u4E00-\u9FA5])男|女(?![\u4E00-\u9FA5])";
    Matcher sexMatcher = Pattern.compile(eg).matcher(text);
    if (sexMatcher.find()) {
        return sexMatcher.group();
    }
    return null;
}
/**
 * Extracts the nation (ethnic group) from resume text.
 *
 * <p>Returns the first run of 1 to 9 CJK ideographs that directly follows the label
 * "民族", an optional single non-whitespace character (e.g. a colon) and up to six
 * whitespace characters.
 *
 * @param text resume text
 * @return the matched run, or {@code null} if the label is not followed by such a run
 */
public static String getNation(String text) {
    Pattern nationPattern = Pattern.compile("(?<=民族[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{1,9}");
    Matcher nationMatcher = nationPattern.matcher(text);
    return nationMatcher.find() ? nationMatcher.group() : null;
}
/**
 * Extracts the raw birthday string from resume text.
 *
 * <p>The date must follow one of the labels "出生日期", "出生年月" or "生日", an optional
 * single non-whitespace character and up to six whitespace characters. It consists of a
 * four-digit year, a one- or two-digit month and an optional one- or two-digit day, with
 * optional separators (-, /, ., 年, 月, 日) and whitespace in between.
 *
 * @param text resume text
 * @return the first match exactly as it appears in the text, or {@code null} if none
 */
public static String getBirthday(String text) {
    Pattern birthdayPattern = Pattern.compile("(?<=(出生日期|出生年月|生日)[\\S]?[\\s]{0,6})\\d{4}(\\s)*(\\-|\\/|\\.|年)?(\\s)*\\d{1,2}(\\s)*(\\-|\\/|\\.|月|⽉)?((\\s)*\\d{1,2}(\\s)*(日|⽇)?)?");
    Matcher birthdayMatcher = birthdayPattern.matcher(text);
    if (!birthdayMatcher.find()) {
        return null;
    }
    return birthdayMatcher.group();
}
/**
 * Extracts the political status from resume text.
 *
 * <p>Returns the first run of 1 to 9 CJK ideographs that directly follows the label
 * "政治面貌" (either of the two spellings in the pattern), an optional single
 * non-whitespace character and up to six whitespace characters.
 *
 * @param text resume text
 * @return the matched run, or {@code null} if none
 */
public static String getpoliticalStatus(String text) {
    Pattern statusPattern = Pattern.compile("(?<=(政治⾯貌|政治面貌)[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{1,9}");
    Matcher statusMatcher = statusPattern.matcher(text);
    return statusMatcher.find() ? statusMatcher.group() : null;
}
//时间日期
/* public static List<String> getDate(String text) {
String eg = "\\d{4}(\\-|\\/|\\D)\\d{1,2}(\\-|\\/|\\D)(\\d{1,2}(\\D))?";
Pattern datePattern = Pattern.compile(eg);
Matcher matcher = datePattern.matcher(text);
List<String> dateList = new ArrayList<>();
while (matcher.find()) {
dateList.add(matcher.group());
}
return dateList;
}*/
/**
 * Normalises a loosely formatted date to {@code yyyy-MM} or {@code yyyy-MM-dd}.
 *
 * <p>The input is reduced to its digit groups; anything that is not an ASCII digit
 * (separators, 年/月/日, any kind of whitespace) only acts as a delimiter. A leading group
 * longer than four digits is treated as a compact date ({@code 199001} or {@code 19900115})
 * and split after the year and after the month. Month and day are zero-padded to two
 * digits; groups after the day (e.g. a time of day) are ignored.
 *
 * <p>Examples: {@code "1990年1月5日" -> "1990-01-05"}, {@code "1990/1" -> "1990-01"},
 * {@code "19900115" -> "1990-01-15"}, {@code "1990" -> "1990"}.
 *
 * @param date raw date text; may be {@code null}
 * @return the normalised date, or {@code null} when the input is {@code null} or contains no digits
 */
private String dateFormat(String date) {
    if (date == null) {
        return null;
    }
    List<String> parts = new ArrayList<>();
    for (String piece : date.split("\\D+")) {
        if (!piece.isEmpty()) {
            parts.add(piece);
        }
    }
    if (parts.isEmpty()) {
        return null;
    }
    // Compact forms: 202201 => 2022-01, 20220115 => 2022-01-15.
    String head = parts.get(0);
    if (head.length() > 4) {
        parts.remove(0);
        String rest = head.substring(4);
        if (rest.length() > 2) {
            parts.add(0, rest.substring(2));
            parts.add(0, rest.substring(0, 2));
        } else {
            parts.add(0, rest);
        }
        parts.add(0, head.substring(0, 4));
    }
    StringBuilder normalized = new StringBuilder(parts.get(0));
    // Append month and, if present, day; pad single digits.
    for (int i = 1; i < parts.size() && i < 3; i++) {
        String part = parts.get(i);
        normalized.append('-');
        if (part.length() == 1) {
            normalized.append('0');
        }
        normalized.append(part);
    }
    return normalized.toString();
}
/**
 * Extracts the current place of residence from resume text.
 *
 * <p>Returns the first run of 2 to 9 CJK ideographs that directly follows one of the
 * labels "现居住地", "现居地", "居住地", "所在地" or "现所在地", an optional single
 * non-whitespace character and up to six whitespace characters.
 *
 * @param text resume text
 * @return the matched run, or {@code null} if none
 */
public static String getCurrentAddress(String text) {
    Pattern addressPattern = Pattern.compile("(?<=(现居住地|现居地|居住地|所在地|现所在地)[\\S]?[\\s]{0,6})[\u4E00-\u9FA5]{2,9}");
    Matcher addressMatcher = addressPattern.matcher(text);
    if (addressMatcher.find()) {
        return addressMatcher.group();
    }
    return null;
}