####实现类:
package com.beagledata.gaea.securitydoc.service.impl;
import com.alibaba.fastjson.JSONObject;
import com.beagledata.gaea.securitydoc.common.ResourceResolver;
import com.beagledata.gaea.securitydoc.common.Result;
import com.beagledata.gaea.securitydoc.entity.ReadText;
import com.beagledata.gaea.securitydoc.service.ReadTextService;
import com.beagledata.utils.EncodeUtil;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * {@link ReadTextService} implementation that reads a Word document and an
 * Excel sheet and combines them into a JSON text file.
 *
 * Created by mahongfei on 2019/8/22.
 */
@Service
public class ReadTextServiceImpl implements ReadTextService{
// Logger named after the runtime class of this instance.
private Logger logger = LoggerFactory.getLogger(this.getClass());
// Supplies the directory that holds the input documents and receives the output file.
@Autowired
private ResourceResolver resourceResolver;
/**
 * Reads the Word document and the Excel sheet, converts them to JSON and
 * writes the result to a .txt file.
 *
 * @author mahongfei
 */
@Override
public Result<List> readText() {
    // Step 1: pull the raw text out of the Word document.
    String wordText = getWordText();
    // Step 2: match that text against the Excel rows (this also writes the JSON file).
    List parsedDocuments = parseText(wordText);
    // Step 3: hand the matched rows back inside a success result.
    return Result.newSuccess().withData(parsedDocuments);
}
/**
- @Author: mahongfei
- @description: 读取docx文档
*/
public String getWordText() {
String text = “”;
String filePath = resourceResolver.getCSRCPath() + “/” + “证监会9.docx”; //word文档地址
try {
if (filePath.endsWith(".docx")) { //docx为后缀的
XWPFWordExtractor docx = new XWPFWordExtractor(POIXMLDocument.openPackage(filePath));
text = docx.getText();
} else {
logger.info(“该文档不是docx结尾的”);
}
} catch (Exception e) {
logger.info(“读取docx文档错误”);
logger.error(e.getLocalizedMessage(), e);
}
return text;
}
/**
-
@Author: mahongfei
-
@description: 解析word和execl文档内容
*/
public List parseText(String text){
List list = new ArrayList<>();
List list1 = readExecl();
List list2 = new ArrayList<>();
try {
String categoryTexts[] = text.split(“◎”);
for (int i = 0; i< categoryTexts.length; i++) {
// List list1 = new ArrayList<>(); //分类获取文档
String categoryText = categoryTexts[i];
String documents[] = categoryText.split("#conend#");
for (int j = 0; j < documents.length; j++) {
String document = documents[j];
if (StringUtils.isNotBlank(document)) {
if (document.contains("#title#")) {
String title = document.substring(document.indexOf(".") + 1, document.indexOf("#title#"));
title = title.replaceAll("\n", “”);
String content = document.substring(document.indexOf("#constart#")+ 10);
content = content.replaceAll("\t", “”);
content = parseBr(content, “\n”);
content = content.replaceAll("\n", “
”);
for (ReadText readText : list1) {
if (title.trim().equals(readText.getTitle().trim())) {
readText.setContent(content);
if (list2.contains(title.trim())){
System.out.println(“重复的docx文章:” + title.trim());
} else {
list2.add(title.trim());
list.add(readText);
}
}
}
// list1.add(readText);
}
}
}/* StringBuilder sb = new StringBuilder(); readExecl(list1); sb.append("["); for (int k = 0; k < list1.size(); k++) { ReadText readText = list1.get(k); JSONObject jsonObject = new JSONObject(); if (k == list1.size()-1) { sb.append(jsonObject.toJSONString(readText)); } else { sb.append(jsonObject.toJSONString(readText)).append(","); } } sb.append("]"); String filePath = resourceResolver.getCSRCPath() + i + ".txt";//输出的json地址 File file = new File(filePath); if (!file.exists()) { file.createNewFile(); } FileOutputStream fos = new FileOutputStream(filePath); fos.write(sb.toString().getBytes()); fos.close();*/ } StringBuilder sb = new StringBuilder(); System.out.println("文档数量:" + list.size()); sb.append("["); for (int k = 0; k < list.size(); k++) { ReadText readText = list.get(k); JSONObject jsonObject = new JSONObject(); if (k == list.size()-1) { sb.append(jsonObject.toJSONString(readText)); } else { sb.append(jsonObject.toJSONString(readText)).append(","); } } sb.append("]"); String filePath = resourceResolver.getCSRCPath() + "/" + "证监会.txt";//输出的json地址 File file = new File(filePath); if (!file.exists()) { file.createNewFile(); } FileOutputStream fos = new FileOutputStream(filePath); fos.write(sb.toString().getBytes()); fos.close();
} catch (Exception e) {
logger.info(“对获取到的文档信息解析错误”);
logger.error(e.getLocalizedMessage(), e);
}
return list;
}
/**
- @Author: mahongfei
- @description: 消除正文前后字符
/
public String parseBr(String srcStr, String splitter) {
String regex = “^” + splitter + "|" + splitter + “*$”;
return srcStr.replaceAll(regex, “”);
}
/**
-
@Author: mahongfei
-
@description: 根据word文档标题获取execl文档中的其他信息
*/
public List readExecl() {
List list = new ArrayList<>();
List list1 = new ArrayList<>();
try {
String filePath = resourceResolver.getCSRCPath() + “/” + “证监会.xlsx”; //execl文件地址
File file = new File(filePath);
InputStream input = new FileInputStream(file);
boolean isE2007 = false;
//判断是否是excel2007格式
if(filePath.endsWith(“xlsx”)){
isE2007 = true;
}Workbook wb; //根据文件格式(2003或者2007)来初始化 if(isE2007){ wb = new XSSFWorkbook(input); }else{ wb = new HSSFWorkbook(input); } Sheet sheet = wb.getSheetAt(0); //获得第一个表单 int rowCount = sheet.getLastRowNum()+1; for(int i = 0; i < rowCount;i++){ Row row ; row = sheet.getRow(i); ReadText readText = new ReadText(); readText.setTag1(String.valueOf(row.getCell(0))); readText.setTag2(String.valueOf(row.getCell(1))); readText.setTag3(String.valueOf(row.getCell(2))); readText.setTag4(String.valueOf(row.getCell(3))); readText.setTitle(String.valueOf(row.getCell(4)).trim()); readText.setPublishOrg(String.valueOf(row.getCell(5))); readText.setDocNum(String.valueOf(row.getCell(6))); readText.setPublishDate(subDate(String.valueOf(row.getCell(7)))); readText.setLevel(String.valueOf(row.getCell(8))); readText.setPublishDateMillis(parseTimestamp(readText.getPublishDate())); readText.setLevelOrder(parseLevelOrder(readText.getLevel())); readText.setDocId(parseDocId(readText.getTitle(), readText.getDocNum())); if (!list1.contains(readText.getTitle().trim())) { list1.add(readText.getTitle().trim()); list.add(readText); } }
} catch (Exception e) {
logger.info(“获取execl文档信息错误”);
logger.error(e.getLocalizedMessage(), e);
}
return list;
}
/**
- @Author: mahongfei
- @description: 拼接替换的日期
*/
public String subDate (String date) {
if (date.contains(“月”) && date.contains("-")) {
date = parseDate(date);
String day = date.substring(0, date.indexOf("-"));
String month = date.substring(date.indexOf("-")+1, date.lastIndexOf("-"));
String year = date.substring(date.lastIndexOf("-")+1);
date = year + “-” + month + “-” + day;
} else if (date.contains(“月”) && date.contains("/")) {
date = parseDate(date);
String day = date.substring(0, date.indexOf("/"));
String month = date.substring(date.indexOf("/")+1, date.lastIndexOf("/"));
String year = date.substring(date.lastIndexOf("/")+1);
date = year + “/” + month + “/” + day;
}
return date;
}
/**
-
@Author: mahongfei
-
@description: 替换日期汉字为数字
*/
public String parseDate (String date) {
if (date.contains(“十二月”)) {
date = date.replaceAll(“十二月”, “12”);
} else if (date.contains(“十一月”)) {
date = date.replaceAll(“十一月”, “11”);
} else if (date.contains(“十月”)) {
date = date.replaceAll(“十月”, “10”);
} else if (date.contains(“九月”)) {
date = date.replaceAll(“九月”, “09”);
} else if (date.contains(“八月”)) {
date = date.replaceAll(“八月”, “08”);
} else if (date.contains(“七月”)) {
date = date.replaceAll(“七月”, “07”);
} else if (date.contains(“六月”)) {
date = date.replaceAll(“六月”, “06”);
} else if (date.contains(“五月”)) {
date = date.replaceAll(“五月”, “05”);
} else if (date.contains(“四月”)) {
date = date.replaceAll(“四月”, “04”);
} else if (date.contains(“三月”)) {
date = date.replaceAll(“三月”, “03”);
} else if (date.contains(“二月”)) {
date = date.replaceAll(“二月”, “02”);
} else if (date.contains(“一月”)) {
date = date.replaceAll(“一月”, “01”);
}return date;
}
/**
-
@Author: mahongfei
-
@description: 获取时间戳
*/
public long parseTimestamp(String text) {
try {
if (text.contains("/")) {
if (text.contains(“发布”) && text.contains(“修订”)) {
text = text.substring(text.indexOf(",") + 1, text.indexOf(“修订”));
return new SimpleDateFormat(“yyyy/MM/dd”).parse(text).getTime();} else if ((!text.contains("发布") && text.contains("修订")) ||(!text.contains("发布") && text.contains("修改"))) { if (text.contains("修订")) { text = text.substring(0, text.indexOf("修订")); } else if (text.contains("修改")){ text = text.substring(0, text.indexOf("修改")); } return new SimpleDateFormat("yyyy/MM/dd").parse(text).getTime(); } else { return new SimpleDateFormat("yyyy/MM/dd").parse(text).getTime(); } } else if (text.contains("-")) { if (text.contains("发布") && text.contains("修订")) { text = text.substring(text.indexOf(",") + 1, text.indexOf("修订")); return new SimpleDateFormat("yyyy-MM-dd").parse(text).getTime(); } else if ((!text.contains("发布") && text.contains("修订")) ||(!text.contains("发布") && text.contains("修改"))) { text = text.substring(0, text.indexOf("修订")); return new SimpleDateFormat("yyyy-MM-dd").parse(text).getTime(); } else { return new SimpleDateFormat("yyyy-MM-dd").parse(text).getTime(); } }
} catch (Exception e) {
logger.info(“获取时间戳错误” + text);
}
return 0;
}
/**
- @Author: mahongfei
- @description: 获取levelOrder
*/
public int parseLevelOrder(String level) {
List levels = Arrays.asList((“部门规章,部门通知、函等,窗口指导文件,法律,规范性文件,行政法规,其他,司法解释,司法文件,自律规则”).split(","));
int levelOrder = levels.size() - levels.indexOf(level);
return levelOrder;
}
/**
 * Builds the document id from the title and the document number: the two
 * values are concatenated, passed to {@code EncodeUtil.encodeMD5}, and the
 * result is cut to its first 16 characters.
 *
 * @param title  document title
 * @param docNum document number
 * @return at most the first 16 characters of the encoded value
 * @author mahongfei
 */
public String parseDocId(String title, String docNum) {
    final String digest = EncodeUtil.encodeMD5(title + docNum);
    // Values shorter than 16 characters are passed through untouched.
    return digest.length() >= 16 ? digest.substring(0, 16) : digest;
}
}
####实体类:
package com.beagledata.gaea.securitydoc.entity;
/**
 * One document record: classification tags, metadata, body text and the
 * values derived from them.
 *
 * Created by mahongfei on 2019/8/22.
 */
public class ReadText implements java.io.Serializable {
    private static final long serialVersionUID = -7076621449427697937L;

    /** First-level category. */
    private String tag1;
    /** Second-level category. */
    private String tag2;
    /** Third-level category. */
    private String tag3;
    /** Fourth-level category. */
    private String tag4;
    /** Document name. */
    private String title;
    /** Publishing organisation. */
    private String publishOrg;
    /** Document number. */
    private String docNum;
    /** Publish date. */
    private String publishDate;
    /** Level of legal effect. */
    private String level;
    /** Body text. */
    private String content;
    /** Numeric order value of the level. */
    private int levelOrder;
    /** Publish date as a timestamp. */
    private long publishDateMillis;
    /** Document id: the first 16 characters of md5(title + docNum). */
    private String docId;

    public String getTag1() {
        return tag1;
    }

    public void setTag1(String tag1) {
        this.tag1 = tag1;
    }

    public String getTag2() {
        return tag2;
    }

    public void setTag2(String tag2) {
        this.tag2 = tag2;
    }

    public String getTag3() {
        return tag3;
    }

    public void setTag3(String tag3) {
        this.tag3 = tag3;
    }

    public String getTag4() {
        return tag4;
    }

    public void setTag4(String tag4) {
        this.tag4 = tag4;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getPublishOrg() {
        return publishOrg;
    }

    public void setPublishOrg(String publishOrg) {
        this.publishOrg = publishOrg;
    }

    public String getDocNum() {
        return docNum;
    }

    public void setDocNum(String docNum) {
        this.docNum = docNum;
    }

    public String getPublishDate() {
        return publishDate;
    }

    public void setPublishDate(String publishDate) {
        this.publishDate = publishDate;
    }

    public String getLevel() {
        return level;
    }

    public void setLevel(String level) {
        this.level = level;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public int getLevelOrder() {
        return levelOrder;
    }

    public void setLevelOrder(int levelOrder) {
        this.levelOrder = levelOrder;
    }

    public long getPublishDateMillis() {
        return publishDateMillis;
    }

    public void setPublishDateMillis(long publishDateMillis) {
        this.publishDateMillis = publishDateMillis;
    }

    public String getDocId() {
        return docId;
    }

    public void setDocId(String docId) {
        this.docId = docId;
    }

    @Override
    public String toString() {
        return "ReadText{" +
                "tag1='" + tag1 + '\'' +
                ", tag2='" + tag2 + '\'' +
                ", tag3='" + tag3 + '\'' +
                ", tag4='" + tag4 + '\'' +
                ", title='" + title + '\'' +
                ", publishOrg='" + publishOrg + '\'' +
                ", docNum='" + docNum + '\'' +
                ", publishDate='" + publishDate + '\'' +
                ", level='" + level + '\'' +
                ", content='" + content + '\'' +
                ", levelOrder=" + levelOrder +
                ", publishDateMillis=" + publishDateMillis +
                ", docId='" + docId + '\'' +
                '}';
    }
}
#####获取文档地址类:
package com.beagledata.gaea.securitydoc.common;
import com.beagledata.gaea.securitydoc.config.DefaultConfigs;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import javax.annotation.PostConstruct;
import java.io.File;
/**
-
Created by liulu on 2019/7/23.
*/
@Component
public class ResourceResolver {
@Autowired
private DefaultConfigs configs;@PostConstruct
public void init() {
File scrcDir = new File(getCSRCPath());
if (!scrcDir.exists()) {
scrcDir.mkdirs();
}
}/**
- @return 获取证监会文档地址
*/
public String getCSRCPath() {
return new File(configs.getAppHome(), “csrc”).getAbsolutePath();
}
}
- @return 获取证监会文档地址
#######工具类:
package com.beagledata.gaea.securitydoc.config;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
/**
-
Created by liulu on 2019/8/12.
/
@Configuration
@ConfigurationProperties(prefix = “config”)
public class DefaultConfigs {
/*- 项目home
*/
private String appHome;
public String getAppHome() {
return appHome;
}public void setAppHome(String appHome) {
this.appHome = appHome;
}@Override
public String toString() {
StringBuilder sb = new StringBuilder(“DefaultConfigs{”);
sb.append(“appHome=’”).append(appHome).append(’’’);
sb.append(’}’);
return sb.toString();
}
} - 项目home
###### application.yml 配置
config:
  app-home: ${SECURITYDOC_HOME:E:\fagui}  # 配置自己的地址
#####相关依赖:
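<dependency>
    <groupId>com.thetransactioncompany</groupId>
    <artifactId>cors-filter</artifactId>
    <version>2.6</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.8</version>
</dependency>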