根据 Excel 中的标题信息匹配 Word 文档内容,将结果转换为 JSON 并输出到 txt 文件

#### 实现类:
package com.beagledata.gaea.securitydoc.service.impl;

import com.alibaba.fastjson.JSONObject;
import com.beagledata.gaea.securitydoc.common.ResourceResolver;
import com.beagledata.gaea.securitydoc.common.Result;
import com.beagledata.gaea.securitydoc.entity.ReadText;
import com.beagledata.gaea.securitydoc.service.ReadTextService;
import com.beagledata.utils.EncodeUtil;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**

  • Created by mahongfei on 2019/8/22.
    */
    @Service
    public class ReadTextServiceImpl implements ReadTextService{
    private Logger logger = LoggerFactory.getLogger(this.getClass());
    @Autowired
    private ResourceResolver resourceResolver;

    /**

    • @Author: mahongfei
    • @description: 读取word文档和execl文档为json并输出到txt
      */
      @Override
      public Result<List> readText() {
      return Result.newSuccess().withData(parseText(getWordText()));
      }

    /**

    • @Author: mahongfei
    • @description: 读取docx文档
      */
      public String getWordText() {
      String text = “”;
      String filePath = resourceResolver.getCSRCPath() + “/” + “证监会9.docx”; //word文档地址
      try {
      if (filePath.endsWith(".docx")) { //docx为后缀的
      XWPFWordExtractor docx = new XWPFWordExtractor(POIXMLDocument.openPackage(filePath));
      text = docx.getText();
      } else {
      logger.info(“该文档不是docx结尾的”);
      }
      } catch (Exception e) {
      logger.info(“读取docx文档错误”);
      logger.error(e.getLocalizedMessage(), e);
      }
      return text;
      }

    /**

    • @Author: mahongfei

    • @description: 解析word和execl文档内容
      */
      public List parseText(String text){
      List list = new ArrayList<>();
      List list1 = readExecl();
      List list2 = new ArrayList<>();
      try {
      String categoryTexts[] = text.split(“◎”);
      for (int i = 0; i< categoryTexts.length; i++) {
      // List list1 = new ArrayList<>(); //分类获取文档
      String categoryText = categoryTexts[i];
      String documents[] = categoryText.split("#conend#");
      for (int j = 0; j < documents.length; j++) {
      String document = documents[j];
      if (StringUtils.isNotBlank(document)) {
      if (document.contains("#title#")) {
      String title = document.substring(document.indexOf(".") + 1, document.indexOf("#title#"));
      title = title.replaceAll("\n", “”);
      String content = document.substring(document.indexOf("#constart#")+ 10);
      content = content.replaceAll("\t", “”);
      content = parseBr(content, “\n”);
      content = content.replaceAll("\n", “
      ”);
      for (ReadText readText : list1) {
      if (title.trim().equals(readText.getTitle().trim())) {
      readText.setContent(content);
      if (list2.contains(title.trim())){
      System.out.println(“重复的docx文章:” + title.trim());
      } else {
      list2.add(title.trim());
      list.add(readText);
      }
      }
      }
      // list1.add(readText);
      }
      }
      }

          /* StringBuilder sb = new StringBuilder();
           readExecl(list1);
           sb.append("[");
           for (int k = 0; k < list1.size(); k++) {
               ReadText readText = list1.get(k);
               JSONObject jsonObject = new JSONObject();
               if (k == list1.size()-1) {
                   sb.append(jsonObject.toJSONString(readText));
               } else {
                   sb.append(jsonObject.toJSONString(readText)).append(",");
               }
           }
      
           sb.append("]");
           String filePath = resourceResolver.getCSRCPath() + i +  ".txt";//输出的json地址
           File file = new File(filePath);
           if (!file.exists()) {
               file.createNewFile();
           }
           FileOutputStream fos = new FileOutputStream(filePath);
           fos.write(sb.toString().getBytes());
           fos.close();*/
       }
       StringBuilder sb = new StringBuilder();
       System.out.println("文档数量:" + list.size());
       sb.append("[");
       for (int k = 0; k < list.size(); k++) {
           ReadText readText = list.get(k);
           JSONObject jsonObject = new JSONObject();
           if (k == list.size()-1) {
               sb.append(jsonObject.toJSONString(readText));
           } else {
               sb.append(jsonObject.toJSONString(readText)).append(",");
           }
       }
      
       sb.append("]");
       String filePath = resourceResolver.getCSRCPath() + "/" +  "证监会.txt";//输出的json地址
       File file = new File(filePath);
       if (!file.exists()) {
           file.createNewFile();
       }
       FileOutputStream fos = new FileOutputStream(filePath);
       fos.write(sb.toString().getBytes());
       fos.close();
      

      } catch (Exception e) {
      logger.info(“对获取到的文档信息解析错误”);
      logger.error(e.getLocalizedMessage(), e);
      }
      return list;
      }

    /**

    • @Author: mahongfei
    • @description: 消除正文前后字符
      /
      public String parseBr(String srcStr, String splitter) {
      String regex = “^” + splitter + "
      |" + splitter + “*$”;
      return srcStr.replaceAll(regex, “”);
      }

    /**

    • @Author: mahongfei

    • @description: 根据word文档标题获取execl文档中的其他信息
      */
      public List readExecl() {
      List list = new ArrayList<>();
      List list1 = new ArrayList<>();
      try {
      String filePath = resourceResolver.getCSRCPath() + “/” + “证监会.xlsx”; //execl文件地址
      File file = new File(filePath);
      InputStream input = new FileInputStream(file);
      boolean isE2007 = false;
      //判断是否是excel2007格式
      if(filePath.endsWith(“xlsx”)){
      isE2007 = true;
      }

       Workbook wb;
       //根据文件格式(2003或者2007)来初始化
       if(isE2007){
           wb = new XSSFWorkbook(input);
       }else{
           wb = new HSSFWorkbook(input);
       }
       Sheet sheet = wb.getSheetAt(0);    //获得第一个表单
       int rowCount = sheet.getLastRowNum()+1;
       for(int i = 0; i < rowCount;i++){
           Row row ;
           row = sheet.getRow(i);
           ReadText readText = new ReadText();
           readText.setTag1(String.valueOf(row.getCell(0)));
           readText.setTag2(String.valueOf(row.getCell(1)));
           readText.setTag3(String.valueOf(row.getCell(2)));
           readText.setTag4(String.valueOf(row.getCell(3)));
           readText.setTitle(String.valueOf(row.getCell(4)).trim());
           readText.setPublishOrg(String.valueOf(row.getCell(5)));
           readText.setDocNum(String.valueOf(row.getCell(6)));
           readText.setPublishDate(subDate(String.valueOf(row.getCell(7))));
           readText.setLevel(String.valueOf(row.getCell(8)));
           readText.setPublishDateMillis(parseTimestamp(readText.getPublishDate()));
           readText.setLevelOrder(parseLevelOrder(readText.getLevel()));
           readText.setDocId(parseDocId(readText.getTitle(), readText.getDocNum()));
           if (!list1.contains(readText.getTitle().trim())) {
               list1.add(readText.getTitle().trim());
               list.add(readText);
           }
       }
      

      } catch (Exception e) {
      logger.info(“获取execl文档信息错误”);
      logger.error(e.getLocalizedMessage(), e);
      }
      return list;
      }

    /**

    • @Author: mahongfei
    • @description: 拼接替换的日期
      */
      public String subDate (String date) {
      if (date.contains(“月”) && date.contains("-")) {
      date = parseDate(date);
      String day = date.substring(0, date.indexOf("-"));
      String month = date.substring(date.indexOf("-")+1, date.lastIndexOf("-"));
      String year = date.substring(date.lastIndexOf("-")+1);
      date = year + “-” + month + “-” + day;
      } else if (date.contains(“月”) && date.contains("/")) {
      date = parseDate(date);
      String day = date.substring(0, date.indexOf("/"));
      String month = date.substring(date.indexOf("/")+1, date.lastIndexOf("/"));
      String year = date.substring(date.lastIndexOf("/")+1);
      date = year + “/” + month + “/” + day;
      }
      return date;
      }

    /**

    • @Author: mahongfei

    • @description: 替换日期汉字为数字
      */
      public String parseDate (String date) {
      if (date.contains(“十二月”)) {
      date = date.replaceAll(“十二月”, “12”);
      } else if (date.contains(“十一月”)) {
      date = date.replaceAll(“十一月”, “11”);
      } else if (date.contains(“十月”)) {
      date = date.replaceAll(“十月”, “10”);
      } else if (date.contains(“九月”)) {
      date = date.replaceAll(“九月”, “09”);
      } else if (date.contains(“八月”)) {
      date = date.replaceAll(“八月”, “08”);
      } else if (date.contains(“七月”)) {
      date = date.replaceAll(“七月”, “07”);
      } else if (date.contains(“六月”)) {
      date = date.replaceAll(“六月”, “06”);
      } else if (date.contains(“五月”)) {
      date = date.replaceAll(“五月”, “05”);
      } else if (date.contains(“四月”)) {
      date = date.replaceAll(“四月”, “04”);
      } else if (date.contains(“三月”)) {
      date = date.replaceAll(“三月”, “03”);
      } else if (date.contains(“二月”)) {
      date = date.replaceAll(“二月”, “02”);
      } else if (date.contains(“一月”)) {
      date = date.replaceAll(“一月”, “01”);
      }

      return date;
      }

    /**

    • @Author: mahongfei

    • @description: 获取时间戳
      */
      public long parseTimestamp(String text) {
      try {
      if (text.contains("/")) {
      if (text.contains(“发布”) && text.contains(“修订”)) {
      text = text.substring(text.indexOf(",") + 1, text.indexOf(“修订”));
      return new SimpleDateFormat(“yyyy/MM/dd”).parse(text).getTime();

           } else if ((!text.contains("发布") && text.contains("修订")) ||(!text.contains("发布") && text.contains("修改"))) {
               if (text.contains("修订")) {
                   text = text.substring(0, text.indexOf("修订"));
               } else if (text.contains("修改")){
                   text = text.substring(0, text.indexOf("修改"));
               }
               return new SimpleDateFormat("yyyy/MM/dd").parse(text).getTime();
      
           } else {
               return new SimpleDateFormat("yyyy/MM/dd").parse(text).getTime();
           }
      
       } else if (text.contains("-")) {
           if (text.contains("发布") && text.contains("修订")) {
               text = text.substring(text.indexOf(",") + 1, text.indexOf("修订"));
               return new SimpleDateFormat("yyyy-MM-dd").parse(text).getTime();
      
           } else if ((!text.contains("发布") && text.contains("修订")) ||(!text.contains("发布") && text.contains("修改"))) {
               text = text.substring(0, text.indexOf("修订"));
               return new SimpleDateFormat("yyyy-MM-dd").parse(text).getTime();
      
           } else {
               return new SimpleDateFormat("yyyy-MM-dd").parse(text).getTime();
           }
       }
      

      } catch (Exception e) {
      logger.info(“获取时间戳错误” + text);
      }
      return 0;
      }

    /**

    • @Author: mahongfei
    • @description: 获取levelOrder
      */
      public int parseLevelOrder(String level) {
      List levels = Arrays.asList((“部门规章,部门通知、函等,窗口指导文件,法律,规范性文件,行政法规,其他,司法解释,司法文件,自律规则”).split(","));
      int levelOrder = levels.size() - levels.indexOf(level);
      return levelOrder;
      }

    /**

    • @Author: mahongfei
    • @description: 获取docId
      */
      public String parseDocId(String title, String docNum) {
      String docId = EncodeUtil.encodeMD5(title + docNum);
      if (docId.length() >15) {
      docId = docId.substring(0, 16);
      }
      return docId;
      }
      }

#### 实体类:
package com.beagledata.gaea.securitydoc.entity;

/**
 * One regulation document: the metadata columns of a sheet row plus the article body.
 *
 * <p>Created by mahongfei on 2019/8/22.
 */
public class ReadText {
    // NOTE(review): the class does not implement Serializable, so this constant has no effect as
    // written; kept unchanged from the original.
    private static final long serialVersionUID = -7076621449427697937L;

    /** First-level category. */
    private String tag1;
    /** Second-level category. */
    private String tag2;
    /** Third-level category. */
    private String tag3;
    /** Fourth-level category. */
    private String tag4;
    /** Title. */
    private String title;
    /** Publishing organization. */
    private String publishOrg;
    /** Document number. */
    private String docNum;
    /** Publish date. */
    private String publishDate;
    /** Effect level. */
    private String level;
    /** Body text. */
    private String content;
    /** Level order value. */
    private int levelOrder;
    /** Publish date as a timestamp. */
    private long publishDateMillis;
    /** Document id: md5(title + docNum), 16 characters. */
    private String docId;

    public String getTag1() {
        return tag1;
    }

    public void setTag1(String tag1) {
        this.tag1 = tag1;
    }

    public String getTag2() {
        return tag2;
    }

    public void setTag2(String tag2) {
        this.tag2 = tag2;
    }

    public String getTag3() {
        return tag3;
    }

    public void setTag3(String tag3) {
        this.tag3 = tag3;
    }

    public String getTag4() {
        return tag4;
    }

    public void setTag4(String tag4) {
        this.tag4 = tag4;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getPublishOrg() {
        return publishOrg;
    }

    public void setPublishOrg(String publishOrg) {
        this.publishOrg = publishOrg;
    }

    public String getDocNum() {
        return docNum;
    }

    public void setDocNum(String docNum) {
        this.docNum = docNum;
    }

    public String getPublishDate() {
        return publishDate;
    }

    public void setPublishDate(String publishDate) {
        this.publishDate = publishDate;
    }

    public String getLevel() {
        return level;
    }

    public void setLevel(String level) {
        this.level = level;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public int getLevelOrder() {
        return levelOrder;
    }

    public void setLevelOrder(int levelOrder) {
        this.levelOrder = levelOrder;
    }

    public long getPublishDateMillis() {
        return publishDateMillis;
    }

    public void setPublishDateMillis(long publishDateMillis) {
        this.publishDateMillis = publishDateMillis;
    }

    public String getDocId() {
        return docId;
    }

    public void setDocId(String docId) {
        this.docId = docId;
    }

    /** Lists every field as {@code name='value'}; numeric fields are printed without quotes. */
    @Override
    public String toString() {
        return "ReadText{" +
                "tag1='" + tag1 + '\'' +
                ", tag2='" + tag2 + '\'' +
                ", tag3='" + tag3 + '\'' +
                ", tag4='" + tag4 + '\'' +
                ", title='" + title + '\'' +
                ", publishOrg='" + publishOrg + '\'' +
                ", docNum='" + docNum + '\'' +
                ", publishDate='" + publishDate + '\'' +
                ", level='" + level + '\'' +
                ", content='" + content + '\'' +
                ", levelOrder=" + levelOrder +
                ", publishDateMillis=" + publishDateMillis +
                ", docId='" + docId + '\'' +
                '}';
    }
}
#### 获取文档地址类:
    package com.beagledata.gaea.securitydoc.common;

import com.beagledata.gaea.securitydoc.config.DefaultConfigs;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;
import java.io.File;

/**

  • Created by liulu on 2019/7/23.
    */
    @Component
    public class ResourceResolver {
    @Autowired
    private DefaultConfigs configs;

    @PostConstruct
    public void init() {
    File scrcDir = new File(getCSRCPath());
    if (!scrcDir.exists()) {
    scrcDir.mkdirs();
    }
    }

    /**

    • @return 获取证监会文档地址
      */
      public String getCSRCPath() {
      return new File(configs.getAppHome(), “csrc”).getAbsolutePath();
      }
      }

#### 配置类:
package com.beagledata.gaea.securitydoc.config;

import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;

/**

  • Created by liulu on 2019/8/12.
    /
    @Configuration
    @ConfigurationProperties(prefix = “config”)
    public class DefaultConfigs {
    /
    *

    • 项目home
      */
      private String appHome;

    public String getAppHome() {
    return appHome;
    }

    public void setAppHome(String appHome) {
    this.appHome = appHome;
    }

    @Override
    public String toString() {
    StringBuilder sb = new StringBuilder(“DefaultConfigs{”);
    sb.append(“appHome=’”).append(appHome).append(’’’);
    sb.append(’}’);
    return sb.toString();
    }
    }

#### application.yml 配置
config:
  app-home: ${SECURITYDOC_HOME:E:\fagui}  # 请改为自己的目录

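#### 相关依赖:

<dependency>
    <groupId>com.thetransactioncompany</groupId>
    <artifactId>cors-filter</artifactId>
    <version>2.6</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>

<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.8</version>
</dependency>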

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值