这篇word文档都是正规的文本文字,有一定的格式,其中没有图片等难以处理的内容
我也是刚开始学习对Word文档的处理,其中也有很多不懂的地方
Apache POI是Apache软件基金会的开放源码函式库,POI提供API给Java程序对Microsoft Office格式档案读和写的功能。
1、首先我下载了POI的包,下载网址:http://poi.apache.org/download.html
2、然后就是利用函数对文档的处理
读取doc文档
public static String contextOfDoc(File file) { String str = ""; try { FileInputStream fis = new FileInputStream(file); HWPFDocument doc = new HWPFDocument(fis); str = doc.getDocumentText(); doc.close(); fis.close(); } catch (Exception e) { e.printStackTrace(); // TODO: handle exception } return str; }
测试
/**
 * Manual smoke test: reads {@code src/1.doc}, splits its text on carriage
 * returns and prints the pieces at indexes 9 (inclusive) to 284 (exclusive).
 *
 * <p>The upper bound is clamped to the number of pieces actually present, so a
 * shorter document prints what it has instead of throwing
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    File file = new File("src/1.doc");
    String str = contextOfDoc(file);
    String[] arr = str.split("\r");

    // Never read past the end of the array.
    int end = Math.min(arr.length, 284);
    for (int i = 9; i < end; i++) {
        System.out.println(arr[i]);
    }
}
先切分文档,分为目录和内容
/**
 * Loads the text of {@code src/1.doc} and cuts it at every occurrence of the
 * marker "第五篇".
 *
 * @return the pieces of the document text produced by splitting on the marker
 */
public static String[] cataAndContext() {
    String wholeText = docIo.contextOfDoc(new File("src/1.doc"));
    return wholeText.split("第五篇");
}
对目录和内容分别切分
public static List<String> typePart(String str) { //File file = new File("src/1.doc"); //all //String textAll = docIo.contextOfDoc(file); String[] partOne = str.split("新技术篇"); //第一篇到目录 String partOneCatalog = partOne[1].split("网络安全篇")[0]; String partNest = partOne[1].split("网络安全篇")[1]; //第二篇目录 String partTowCatalog = partNest.split("基础篇")[0]; partNest = partNest.split("基础篇")[1]; //第三篇目录 String partThreeCatalog = partNest.split("国家信息化政策规划篇")[0]; partNest = partNest.split("国家信息化政策规划篇")[1]; //第四篇目录 String partForeCatalog = partNest.split("附录")[0]; List<String> strList = new ArrayList<>(); strList.add(partOneCatalog); strList.add(partTowCatalog); strList.add(partThreeCatalog); strList.add(partForeCatalog); return strList; }
对内容的处理
/**
 * Reads {@code src/3.doc}, then for every pair of consecutive catalog titles
 * stored in the database collects the paragraphs lying between them and stores
 * that text under the first title.
 *
 * <p>The input stream is opened in a try-with-resources statement so the file
 * handle is released once the paragraphs have been extracted.
 *
 * <p>NOTE(review): the loop stops at {@code size() - 1}, so the last catalog
 * title never gets content inserted. Presumably it only serves as the end
 * marker of the previous one — confirm.
 *
 * @param args unused
 * @throws Exception if the document cannot be read
 */
public static void main(String[] args) throws Exception {
    String[] paragraphs;
    try (FileInputStream fis = new FileInputStream("src/3.doc")) {
        WordExtractor wordExtractor = new WordExtractor(fis);
        paragraphs = wordExtractor.getParagraphText();
    }
    List<String> lists = getParas(paragraphs);

    CRUD c = new CRUD();
    List<String> catas = c.getCatalogs();
    for (int i = 0; i < catas.size() - 1; i++) {
        String context = getContext(catas.get(i), catas.get(i + 1), lists);
        c.insertContext(catas.get(i), context);
    }
}

/**
 * Concatenates the paragraphs that follow a paragraph equal to {@code start},
 * up to (not including) the first later paragraph equal to {@code end}.
 *
 * <p>If {@code end} is not found after a match of {@code start}, everything to
 * the end of the list is appended and the scan for {@code start} continues
 * with the following paragraphs.
 *
 * @param start paragraph text that marks the beginning (exclusive)
 * @param end   paragraph text that marks the end (exclusive)
 * @param paras paragraphs to scan, in document order
 * @return the concatenated text, or an empty string if {@code start} never occurs
 */
public static String getContext(String start, String end, List<String> paras) {
    // StringBuilder avoids re-copying the accumulated text on every append.
    StringBuilder context = new StringBuilder();
    for (int i = 0; i < paras.size(); i++) {
        if (!paras.get(i).equals(start)) {
            continue;
        }
        for (int j = i + 1; j < paras.size(); j++) {
            if (paras.get(j).equals(end)) {
                return context.toString();
            }
            context.append(paras.get(j));
        }
    }
    return context.toString();
}

/**
 * Returns the trimmed paragraphs starting at index 289.
 *
 * <p>NOTE(review): 289 looks like the number of leading paragraphs to skip in
 * this particular document (e.g. its table of contents) — confirm.
 *
 * @param paras all paragraphs of the document
 * @return trimmed paragraphs from index 289 onward; empty if there are fewer
 */
public static List<String> getParas(String[] paras) {
    return getParas(paras, 289);
}

/**
 * Returns the trimmed paragraphs starting at {@code firstIndex}.
 *
 * @param paras      all paragraphs of the document
 * @param firstIndex index of the first paragraph to keep
 * @return trimmed paragraphs from {@code firstIndex} onward; empty if
 *         {@code firstIndex} is at or beyond the end of the array
 */
public static List<String> getParas(String[] paras, int firstIndex) {
    List<String> paraList = new ArrayList<>();
    for (int i = firstIndex; i < paras.length; i++) {
        paraList.add(paras[i].trim());
    }
    return paraList;
}
数据库的crud
public List<String> getCatalogs(){ List<String> lists = new ArrayList<>(); Connection connection = Dbuitl.getConnection(); String sql = "select catalog from catalogs"; PreparedStatement preparedStatement = null; ResultSet resultSet = null; try { Statement statement = connection.createStatement(); resultSet = statement.executeQuery(sql); while (resultSet.next()) { lists.add(resultSet.getString("catalog")); } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { Dbuitl.close(preparedStatement); Dbuitl.close(connection); } return lists; } public void insert(String type,String cata) { Connection connection = Dbuitl.getConnection(); String sql = "insert into catalogs(type,catalog) value(?,?)"; PreparedStatement preparedStatement = null; ResultSet resultSet = null; try { preparedStatement = connection.prepareStatement(sql); preparedStatement.setString(1, type); preparedStatement.setString(2, cata); preparedStatement.executeUpdate(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { Dbuitl.close(preparedStatement); Dbuitl.close(connection); } } public void insertContext(String catalog,String context) { Connection connection = Dbuitl.getConnection(); String sql = "insert into context(catalog,context) value(?,?)"; PreparedStatement preparedStatement = null; ResultSet resultSet = null; try { preparedStatement = connection.prepareStatement(sql); preparedStatement.setString(1, catalog); preparedStatement.setString(2, context); preparedStatement.executeUpdate(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { Dbuitl.close(preparedStatement); Dbuitl.close(connection); } } public void insertSheet(String sheet,String type) { Connection connection = Dbuitl.getConnection(); String sql = "insert into sheet(sheet,type) value(?,?)"; PreparedStatement preparedStatement = null; ResultSet resultSet = null; try { preparedStatement = 
connection.prepareStatement(sql); preparedStatement.setString(1, sheet); preparedStatement.setString(2, type); preparedStatement.executeUpdate(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { Dbuitl.close(preparedStatement); Dbuitl.close(connection); } }