Java 读取 Word 文本内容(不带格式)




 <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
 <dependency>
     <groupId>org.apache.poi</groupId>
     <artifactId>poi</artifactId>
     <version>3.16</version>
 </dependency>
   
   <!--读取word所需依赖  -->
   <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
 <dependency>
     <groupId>org.apache.poi</groupId>
     <artifactId>poi-ooxml</artifactId>
     <version>3.16</version>
 </dependency>
   
   <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
 <dependency>
     <groupId>org.apache.poi</groupId>
     <artifactId>poi-scratchpad</artifactId>
     <version>3.16</version>
 </dependency>
   <!--读取word所需依赖  -->


代码:

package springTimer;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

import springTimer.util.CharIndexUtil;

public class testReadDocx {

 
  /**
     * 读取word文件内容
     * 
     * @param path
     * @return buffer
     */ 
 
    public static String readWord(String path) { 
        String buffer = ""; 
        try { 
            if (path.endsWith(".doc")) { 
                InputStream is = new FileInputStream(new File(path)); 
                WordExtractor ex = new WordExtractor(is); 
                buffer = ex.getText(); 
                ex.close(); 
            } else if (path.endsWith("docx")) { 
                OPCPackage opcPackage = POIXMLDocument.openPackage(path); 
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage); 
                buffer = extractor.getText(); 
                extractor.close(); 
            } else { 
                System.out.println("此文件不是word文件!"); 
            } 
 
        } catch (Exception e) { 
            e.printStackTrace(); 
        } 
 
        return buffer; 
    } 
   
   
   
    public static void main(String[] args) {
  
     List<File> files = DocumentUtil.getFiles("E:\\zhenglilunbotu\\testdocx\\");
     for (File file : files) {
      
      String path = file.getPath();
      String name = file.getName();
      int indexOf = name.indexOf(".");
      String substring = name.substring(indexOf);
      if(".docx".equals(substring)){
       System.out.println(path);
       
       String content = testReadDocx.readWord(path);
       System.out.println("content===="+content);
            
            
             int index1 = CharIndexUtil.getCharacterPosition(content, ":", 5);
             int index2 = CharIndexUtil.getCharacterPosition(content, ":", 6);
             int index3 = CharIndexUtil.getCharacterPosition(content, ":", 7);

             String sc = content.substring(index1+1, index2-4);
             String jl = content.substring(index2+1, index3-4);
             String ry = content.substring(index3+1);
             System.out.println(sc);
             System.out.println(jl);
             System.out.println(ry);
      }else if(".png".equals(substring)){
       System.out.println(path);
      }
      
  }
 }
}


package springTimer;

import java.io.File;
import java.util.ArrayList;
import java.util.List;


//遍历文件夹下的所有文件
public class DocumentUtil {

 //path:文件的地址  例如:D:\\videoUpload
 public static List<File> getFiles(String path){
     File root = new File(path);
     List<File> files = new ArrayList<File>();
     if(!root.isDirectory()){
         files.add(root);
     }else{
         File[] subFiles = root.listFiles();
         for(File f : subFiles){
             files.addAll(getFiles(f.getAbsolutePath()));
         }   
     }
     return files;
 }
}


package springTimer;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

import springTimer.util.CharIndexUtil;

public class testReadDocx {

 
  /**
     * 读取word文件内容
     * 
     * @param path
     * @return buffer
     */ 
 
    public static String readWord(String path) { 
        String buffer = ""; 
        try { 
            if (path.endsWith(".doc")) { 
                InputStream is = new FileInputStream(new File(path)); 
                WordExtractor ex = new WordExtractor(is); 
                buffer = ex.getText(); 
                ex.close(); 
            } else if (path.endsWith("docx")) { 
                OPCPackage opcPackage = POIXMLDocument.openPackage(path); 
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage); 
                buffer = extractor.getText(); 
                extractor.close(); 
            } else { 
                System.out.println("此文件不是word文件!"); 
            } 
 
        } catch (Exception e) { 
            e.printStackTrace(); 
        } 
 
        return buffer; 
    } 
   
   
   
    public static void main(String[] args) {
  
     List<File> files = DocumentUtil.getFiles("E:\\zhenglilunbotu\\testdocx\\");
     for (File file : files) {
      
      String path = file.getPath();
      String name = file.getName();
      int indexOf = name.indexOf(".");
      String substring = name.substring(indexOf);
      if(".docx".equals(substring)){
       System.out.println(path);
       
       String content = testReadDocx.readWord(path);
       System.out.println("content===="+content);
            
            
             int index1 = CharIndexUtil.getCharacterPosition(content, ":", 5);
             int index2 = CharIndexUtil.getCharacterPosition(content, ":", 6);
             int index3 = CharIndexUtil.getCharacterPosition(content, ":", 7);

             String sc = content.substring(index1+1, index2-4);
             String jl = content.substring(index2+1, index3-4);
             String ry = content.substring(index3+1);
             System.out.println(sc);
             System.out.println(jl);
             System.out.println(ry);
      }
      
  }
 }
}



word文件

   名:张三

   院:*医院

   室:*内科

   称:*医师

擅长领域:治疗内科常见病、多发病,对心脑血管、胃肠、妇科等病中医药治疗有专长。

执业经历:教授,*************,河北*人。19*年至今在*****医院中医科从事临床医疗、教学、科研工作。

所获荣誉:20*年开展的《*实验研究》获*一等奖(*主研人),发表*5篇,撰写著作6部。








  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值