依赖:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.12</version>
</dependency>
controller:
/**
 * Receives a batch of uploaded résumé files, stores them in a fresh timestamped
 * folder under {@code <resources>/temp/rckfile}, and hands that folder to
 * {@code recA01Service.uploadFileAddA01} for parsing and import.
 *
 * <p>Files whose original name is missing or empty are skipped. Only the last path
 * segment of the client-supplied name is used, so a name such as
 * {@code ../../x.doc} cannot escape the upload folder.
 *
 * <p>NOTE(review): the folder name has one-second resolution, so two requests arriving
 * within the same second share a folder — confirm whether that can happen in practice.
 *
 * @param file    the uploaded files; may be {@code null} or empty
 * @param state   talent-pool type, forwarded to the service under the key {@code "state"}
 * @param request the current HTTP request, forwarded to the service
 * @return a success wrapper carrying the message returned by the service
 * @throws IOException if the upload folder cannot be created or a stream cannot be opened/closed
 */
@DataLog(operationName = "批量新增人才", operationDesc = "上传文件", methodType = MethodTypeEnum.ADD_TYPE)
@ApiOperation(value = "批量新增人才", notes = "批量新增人才")
@PostMapping(Urls.RecA01.uploadFileAddA01)
@ApiImplicitParams({
        @ApiImplicitParam(name = "state", value = "人才库类型", required = true)
})
public JsonObject<Object> uploadFileAddA01(@RequestParam(value = "file") MultipartFile[] file, String state, HttpServletRequest request) throws IOException {
    Map<String, String> param = new HashMap<>();
    param.put("state", state);

    // TODO the base directory comes from configuration; the property must exist in the config file.
    userDir = fileConfigProperties.getResources();
    String folderSources = userDir + File.separator + "temp" + File.separator + "rckfile";

    // One folder per upload, named after the current timestamp.
    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
    String folderPath = folderSources + File.separator + df.format(new Date());
    File uploadDir = new File(folderPath);
    // Re-check isDirectory() after mkdirs() so a concurrent creator does not count as a failure.
    if (!uploadDir.isDirectory() && !uploadDir.mkdirs() && !uploadDir.isDirectory()) {
        throw new IOException("Cannot create upload directory: " + folderPath);
    }

    for (int i = 0; file != null && i < file.length; i++) {
        String name = file[i].getOriginalFilename();
        if (name == null) {
            continue;
        }
        // The original filename is client-controlled: keep only the last path segment.
        int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
        String safeName = name.substring(lastSeparator + 1);
        if (safeName.isEmpty() || ".".equals(safeName) || "..".equals(safeName)) {
            continue;
        }
        File target = new File(uploadDir, safeName);
        // try-with-resources closes both streams even when the copy fails.
        try (InputStream in = file[i].getInputStream();
             OutputStream out = new FileOutputStream(target)) {
            // write() logs its own failures; the batch keeps going either way.
            this.write(in, out);
        }
    }

    // Parse and import everything that was stored in the folder.
    String result = recA01Service.uploadFileAddA01(folderPath, param, request);
    return new JsonSuccessObject<>(result);
}
/**
 * Copies everything readable from {@code in} to {@code out} using a 64 KiB buffer.
 *
 * <p>Neither stream is flushed or closed here; that is left to the caller.
 *
 * @param in  the stream to read from
 * @param out the stream to write to
 * @return {@code true} if the copy completed, {@code false} if an {@link IOException}
 *         occurred (the exception is logged, not rethrown)
 */
public boolean write(InputStream in, OutputStream out) {
    final int bufferSize = 65536;
    byte[] buf = new byte[bufferSize];
    try {
        int read;
        while ((read = in.read(buf)) > -1) {
            out.write(buf, 0, read);
        }
        return true;
    } catch (IOException e) {
        // Pass the exception to the logger so the stack trace lands in the log
        // rather than on stderr via printStackTrace().
        logger.error("异常信息:" + e.getMessage(), e);
        return false;
    }
}
impl:
/** URL injected from the {@code ocrServe.url} configuration property. */
@Value("${ocrServe.url}")
private String ocrUrl;

/**
 * Returns the value injected from {@code ocrServe.url}.
 *
 * @return the configured URL
 */
public String getocrUrl() {
    return this.ocrUrl;
}
/** Matches one recognised text fragment in the JSON returned by the OCR endpoint (group 2 is the text). */
private static final Pattern OCR_TEXT_PATTERN = Pattern.compile("(\"text\": \")(.*?)(\", \"text_region\")");

/**
 * Parses every résumé file in {@code path}, extracts name, phone number and e-mail
 * from its text, and stores a talent-pool record for each via {@code saveRecA01}.
 *
 * <p>Supported extensions (case-insensitive): doc, docx, pdf, htm, html, png, jpg, jpeg,
 * ppt, pptx. Any other file yields empty text. Processing stops at the first file from
 * which no name can be extracted; records saved before that point are kept.
 *
 * @param path    directory containing the uploaded files
 * @param param   request parameters; the value under {@code "state"} is the talent-pool type
 * @param request the current HTTP request, forwarded to {@code saveRecA01}
 * @return {@code "导入成功"} when every file produced a name, otherwise
 *         {@code "该简历中不存在用户姓名!"}
 * @throws IOException if the directory cannot be listed or a file cannot be read
 */
@Override
public String uploadFileAddA01(String path, Map<String, String> param, HttpServletRequest request) throws IOException {
    File[] files = new File(path).listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        throw new IOException("Cannot list files in directory: " + path);
    }
    // Talent-pool type, identical for every file in the batch.
    String state = param.get("state");

    for (File f : files) {
        if (!f.isFile()) {
            continue;
        }
        String fileName = f.getName();
        // Lower-case the extension so "CV.PDF" is handled like "cv.pdf".
        String suffix = fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase(java.util.Locale.ROOT);

        String text = extractResumeText(f, suffix);

        String name = getName(text);
        String phone = getPhoneNo(text);
        String email = getEmail(text);
        if (name == null || name.isEmpty()) {
            return "该简历中不存在用户姓名!";
        }
        this.saveRecA01(name, phone, email, null, state, request);
    }
    return "导入成功";
}

/** Dispatches to the extractor matching the (lower-cased) file extension; never returns null. */
private String extractResumeText(File f, String suffix) throws IOException {
    String text;
    switch (suffix) {
        case "doc":
            text = readDocText(f);
            break;
        case "docx":
            text = readDocxText(f);
            break;
        case "pdf":
            text = readPdfText(f);
            break;
        case "htm":
        case "html":
            text = readHtmlText(f);
            break;
        case "png":
        case "jpg":
        case "jpeg":
            text = readImageText(f);
            break;
        case "ppt":
        case "pptx":
            text = readPptText(f);
            break;
        default:
            text = "";
            break;
    }
    return text == null ? "" : text;
}

/** Extracts the text of a binary Word (.doc) file; returns "" if the file cannot be parsed. */
private String readDocText(File f) {
    try (InputStream is = new FileInputStream(f);
         WordExtractor re = new WordExtractor(is)) {
        return re.getText();
    } catch (Exception e1) {
        e1.printStackTrace();
        return "";
    }
}

/** Extracts the text of an OOXML Word (.docx) file; returns "" if the file cannot be parsed. */
private String readDocxText(File f) {
    try {
        OPCPackage opcPackage = POIXMLDocument.openPackage(f.getPath());
        POIXMLTextExtractor extractor;
        try {
            extractor = new XWPFWordExtractor(opcPackage);
        } catch (Exception e) {
            // The extractor never took ownership of the package: release it without saving.
            opcPackage.revert();
            throw e;
        }
        try {
            return extractor.getText();
        } finally {
            extractor.close();
        }
    } catch (Exception e1) {
        e1.printStackTrace();
        return "";
    }
}

/** Extracts the text of a PDF page by page, sorted by position; keeps whatever was read before a failure. */
private String readPdfText(File f) {
    StringBuilder pdfText = new StringBuilder();
    try (PDDocument document = PDDocument.load(f)) {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        int pageCount = document.getNumberOfPages();
        for (int page = 1; page <= pageCount; page++) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
            pdfText.append(stripper.getText(document));
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return pdfText.toString();
}

/** Reads an HTML file as UTF-8 text (markup is not stripped); returns "" if the file is missing. */
private String readHtmlText(File f) throws IOException {
    StringBuilder html = new StringBuilder();
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
            new FileInputStream(f), "utf-8"))) {
        String line;
        // readLine() returning null is the end-of-file signal; ready() is not.
        while ((line = br.readLine()) != null) {
            // Keep the line break so text on adjacent lines is not glued together.
            html.append(line).append('\n');
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
    return html.toString();
}

/** Posts an image to the configured OCR endpoint and concatenates the recognised text fragments. */
private String readImageText(File f) {
    StringBuilder recognised = new StringBuilder();
    HttpClient client = new HttpClient();
    PostMethod postMethod = new PostMethod(getocrUrl());
    try {
        // Multipart upload with a single part named "file".
        Part[] parts = {new FilePart("file", f)};
        postMethod.setRequestEntity(new MultipartRequestEntity(parts, postMethod.getParams()));
        // Uploaded files may be large, so allow a generous connection timeout.
        client.getHttpConnectionManager().getParams().setConnectionTimeout(50000);
        int status = client.executeMethod(postMethod);
        if (status == HttpStatus.SC_OK) {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            try (InputStream inputStream = postMethod.getResponseBodyAsStream()) {
                if (inputStream != null) {
                    byte[] buffer = new byte[1024];
                    int len;
                    while ((len = inputStream.read(buffer)) != -1) {
                        baos.write(buffer, 0, len);
                    }
                }
            }
            // Decode explicitly as UTF-8 instead of relying on the platform default charset.
            // NOTE(review): assumes the OCR service answers in UTF-8 JSON — confirm.
            String json = baos.toString("UTF-8");
            Matcher m = OCR_TEXT_PATTERN.matcher(json);
            while (m.find()) {
                recognised.append(m.group(2));
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        postMethod.releaseConnection();
    }
    return recognised.toString();
}

/**
 * Extracts the text runs of every shape on every slide of a presentation; returns "" on parse failure.
 * NOTE(review): XMLSlideShow targets the OOXML format, so a legacy binary .ppt presumably
 * fails here and yields "" — confirm whether .ppt needs separate handling.
 */
private String readPptText(File f) throws IOException {
    try (FileInputStream in = new FileInputStream(f.getPath())) {
        try (XMLSlideShow xmlSlideShow = new XMLSlideShow(in)) {
            StringBuilder sb = new StringBuilder();
            List<XSLFSlide> slides = xmlSlideShow.getSlides();
            for (XSLFSlide slide : slides) {
                CTSlide rawSlide = slide.getXmlObject();
                CTGroupShape gs = rawSlide.getCSld().getSpTree();
                for (CTShape shape : gs.getSpArray()) {
                    CTTextBody tb = shape.getTxBody();
                    if (null == tb) {
                        continue;
                    }
                    for (CTTextParagraph textParagraph : tb.getPArray()) {
                        for (CTRegularTextRun textRun : textParagraph.getRArray()) {
                            sb.append(textRun.getT());
                        }
                    }
                }
            }
            return sb.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }
}
application.yml(需配置 ocrServe.url,即图片简历 OCR 服务的请求地址,例如 http://192.168.4.188:8869/paddle_ocr)
提取姓名的方法(名字可能是一个字,也可能是两个字,两种情况都会尝试匹配)
/** Label that must precede the name in the résumé text. */
private static final String NAME_LABEL = "姓名";

/** Alternation of the surnames recognised at the start of a name. */
private static final String SURNAME_ALTERNATION =
        "(王|李|张|刘|陈|杨|黄|赵|吴|周|徐|孙|马|朱|胡|郭|何|高|林|罗|郑|梁|谢|宋|唐|许|韩|冯|邓|曹|彭|曾" +
        "|养|须|丰|巢|蒯|相|后|红|权逯|盖益|桓|公|万俟|司马|上官|夏侯|诸葛|闻人|东方|赫连|皇甫|尉迟|公羊|澹台" +
        "|公冶|宗政|濮阳|淳于|单于|太叔|申屠|公孙|仲孙|轩辕|令狐|钟离|宇文|长孙|慕容|鲜于|闾丘|司徒|司空|亓官" +
        "|司寇|仉|督|子车|颛孙|端木|巫马|公西|漆雕|乐正|壤驷|公良|拓跋|夹谷|宰父|谷粱|法|汝|钦|段干|百里|东郭" +
        "|南门|呼延|归海|羊舌|微生|帅|缑|亢|况|郈|琴|梁丘|左丘|东门|西门|佘|佴|伯|赏|南宫|墨|哈|谯" +
        "|肖|田|董|袁|潘|于|蒋|蔡|余|杜|叶|程|苏|魏|吕|丁|任|沈|姚|卢|姜|崔|钟|谭|陆|汪|范|金|石|廖|贾|夏|韦|傅" +
        "|方|白|邹|孟|熊|秦|邱|江|尹|薛|闫|段|雷|侯|龙|史|黎|贺|顾|毛|郝|龚|邵|万|钱|覃|武|戴|孔|汤|庞|樊|兰|殷" +
        "|施|陶|洪|翟|安|颜|倪|严|牛|温|芦|季|俞|章|鲁|葛|伍|申|尤|毕|聂|柴|焦|向|柳|邢|岳|齐|沿|梅|莫|庄|辛|管" +
        "|祝|左|涂|谷|祁|时|舒|耿|牟|卜|路|詹|关|苗|凌|费|纪|靳|盛|童|欧|甄|项|曲|成|游|欧阳|裴|席|卫|查|屈|鲍|位" +
        "|覃|霍|翁|隋|植|甘|景|薄|单|包|司|柏|宁|柯|阮|桂|闵|阳|解|强|丛|华|车|冉|房|边|辜|吉|饶|刁|瞿|戚|丘" +
        "|古|米|池|滕|晋|苑|邬|臧|畅|宫|来|嵺|苟|全|褚|廉|简|娄|盖|符|奚|木|穆|党|燕|郎|邸|冀|谈|姬|屠|连|郜|晏" +
        "|栾|郁|商|蒙|计|喻|揭|窦|迟|宇|敖|糜|鄢|冷|卓|花|艾|蓝|都|巩|稽|井|练|仲|乐|虞|卞|封|竺|冼|原|官|衣|楚" +
        "|佟|栗|匡|宗|应|台|巫|鞠|僧|桑|荆|谌|银|扬|明|沙|薄|伏|岑|习|胥|保|和|蔺|水|云|昌|凤|酆|常|皮|康|元|平" +
        "|萧|湛|禹|无|贝|茅|麻|危|骆|支|咎|经|裘|缪|干|宣|贲|杭|诸|钮|嵇|滑|荣|荀|羊|於|惠|家|芮|羿|储|汲|邴|松" +
        "|富|乌|巴|弓|牧|隗|山|宓|蓬|郗|班|仰|秋|伊|仇|暴|钭|厉|戎|祖|束|幸|韶|蓟|印|宿|怀|蒲|鄂|索|咸|籍|赖|乔" +
        "|阴|能|苍|双|闻|莘|贡|逢|扶|堵|宰|郦|雍|却|璩|濮|寿|通|扈|郏|浦|尚|农|别|阎|充|慕|茹|宦|鱼|容|易|慎|戈" +
        "|庚|终|暨|居|衡|步|满|弘|国|文|寇|广|禄|阙|东|殴|殳|沃|利|蔚|越|夔|隆|师|厍|晃|勾|融|訾|阚|那|空|毋|乜" +
        "|笪|年|爱|仝|代)";

/** A surname followed by one or two CJK characters (U+4E00–U+9FA5); compiled once because the alternation is large. */
private static final Pattern NAME_PATTERN =
        Pattern.compile(SURNAME_ALTERNATION + "[\u4E00-\u9FA5]{1,2}");

/**
 * Extracts a person's name from résumé text.
 *
 * <p>The text must contain the label {@code 姓名}. The result is the first sequence after
 * that label that starts with a known surname and continues with one or two Chinese
 * characters (two are taken when available).
 *
 * <p>The earlier implementation searched the whole document for a surname plus two
 * characters before it considered a surname plus one, so a name like {@code 张三} next to
 * the label lost to any unrelated triple further down (e.g. {@code 经历很}). The candidate
 * closest to the label now wins.
 *
 * @param str the résumé text; may be {@code null}
 * @return the extracted name, or {@code null} if the label or a name is not found
 */
public static String getName(String str) {
    if (str == null) {
        return null;
    }
    int labelIndex = str.indexOf(NAME_LABEL);
    if (labelIndex < 0) {
        return null;
    }
    Matcher nameMatcher = NAME_PATTERN.matcher(str);
    // Start searching right after the label; only text following "姓名" is eligible.
    return nameMatcher.find(labelIndex + NAME_LABEL.length()) ? nameMatcher.group() : null;
}
提取手机号
/**
 * An 11-digit mainland-China mobile number (1, then 3–9, then nine digits) that is not
 * part of a longer digit run. An optional country prefix (86, +86 or 0086) directly in
 * front of the number is tolerated and excluded from group 1.
 */
private static final Pattern PHONE_PATTERN =
        Pattern.compile("(?<!\\d)(?:(?:\\+|00)?86)?(1[3-9]\\d{9})(?!\\d)");

/**
 * Extracts the first mobile phone number from the text.
 *
 * <p>Digit boundaries are enforced so that 11 digits embedded in a longer number
 * (for example an 18-digit ID number) are not reported as a phone number.
 *
 * @param text the text to search; may be {@code null}
 * @return the 11-digit number without any country prefix, or {@code null} if none is found
 */
public static String getPhoneNo(String text) {
    if (text == null) {
        return null;
    }
    Matcher phoneMatcher = PHONE_PATTERN.matcher(text);
    return phoneMatcher.find() ? phoneMatcher.group(1) : null;
}
提取邮箱
/**
 * E-mail address: a local part of letters, digits and {@code . _ % + -}, an {@code @},
 * one or more dot-separated domain labels, and an alphabetic top-level domain of at
 * least two letters.
 */
private static final Pattern EMAIL_PATTERN =
        Pattern.compile("[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\\.[A-Za-z0-9-]+)*\\.[A-Za-z]{2,}");

/**
 * Extracts the first e-mail address from the text.
 *
 * <p>The previous pattern ({@code [a-za-z0-9_-]+@\w+\.[a-z]+(]+)?}) was garbled: it
 * rejected upper-case letters and dots in the local part and stopped after the first
 * domain label, so addresses such as {@code ZhangSan@qq.com} or
 * {@code a.b@mail.example.com.cn} were returned truncated.
 *
 * @param text the text to search; may be {@code null}
 * @return the first e-mail address found, or {@code null} if there is none
 */
public static String getEmail(String text) {
    if (text == null) {
        return null;
    }
    Matcher emailMatcher = EMAIL_PATTERN.matcher(text);
    return emailMatcher.find() ? emailMatcher.group() : null;
}