学习目标:
Java解析word文档,将word文档题库导入
学会word文档的解析,以及各种题型的导入
学习内容:
解析word文档
获取正文文件内容(doc 和 docx 两种格式分别解析)
- 解析word文档
/**
 * Extracts the plain text of a Word document.
 *
 * <p>Paths ending in {@code .doc} are read through {@code WordExtractor}; paths ending in
 * {@code .docx} are read through {@code XWPFWordExtractor}. The extension check ignores case.
 *
 * @param path path of the Word file to read
 * @return the extracted text; an empty string when the path has no Word extension;
 *         {@code null} when {@code path} is {@code null} or the file cannot be read
 */
public static String readWord(String path) {
    if (path == null) {
        return null;
    }
    String lowerPath = path.toLowerCase(java.util.Locale.ROOT);
    try {
        if (lowerPath.endsWith(".doc")) {
            InputStream is = new FileInputStream(new File(path));
            try {
                WordExtractor ex = new WordExtractor(is);
                return ex.getText();
            } finally {
                // Always release the file handle, even when extraction fails.
                is.close();
            }
        }
        if (lowerPath.endsWith(".docx")) {
            OPCPackage opcPackage = POIXMLDocument.openPackage(path);
            try {
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                return extractor.getText();
            } finally {
                // revert() releases the package without saving anything back to the file;
                // this method only reads, so nothing must be written.
                opcPackage.revert();
            }
        }
        System.out.println("此文件不是word文件!");
        // No content for unsupported files (previously a placeholder file name leaked out).
        return "";
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
- 获取正文文件内容docx方法
/**
 * Reads the body text of a .docx file paragraph by paragraph.
 *
 * <p>A paragraph that does not start with a space is trimmed and followed by {@code "\r\n"};
 * a paragraph that starts with a space is appended unchanged.
 *
 * @param file the .docx file to read
 * @return a map with two entries: {@code "result"} is {@code "0"} when reading succeeded and
 *         {@code "1"} when an exception occurred; {@code "content"} is the text collected
 *         before any failure
 */
public static Map<String, String> getContentDocx(File file) {
    Map<String, String> map = new HashMap<String, String>();
    StringBuilder content = new StringBuilder();
    String result = "0"; // "0" = read succeeded, "1" = an exception occurred
    InputStream is = null;
    // A real logger instance: the handlers below used to dereference a null logger,
    // which turned every parse failure into a NullPointerException.
    java.util.logging.Logger logger =
            java.util.logging.Logger.getLogger("ImportWordPoiServiceImpl");
    try {
        // The parameter could equally be a path; wrap it in new File(path) for the stream.
        is = new FileInputStream(file);
        // Word 2007 format: XWPFDocument handles .docx files only.
        XWPFDocument xwpf = new XWPFDocument(is);
        List<XWPFParagraph> paragraphs = xwpf.getParagraphs();
        if (paragraphs != null) {
            for (XWPFParagraph paragraph : paragraphs) {
                String text = paragraph.getParagraphText();
                if (!text.startsWith(" ")) {
                    content.append(text.trim()).append("\r\n");
                } else {
                    // NOTE(review): no line break is added for space-indented paragraphs,
                    // so they run into the following paragraph - confirm this is intended.
                    content.append(text);
                }
            }
        }
    } catch (Exception e) {
        logger.info("docx解析正文异常:" + e);
        result = "1"; // report the failure to the caller through the map
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                logger.info("" + e);
            }
        }
        map.put("result", result);
        map.put("content", content.toString());
    }
    return map;
}
- 获取正文文件内容doc方法
/**
 * Reads the body text of a .doc file paragraph by paragraph.
 *
 * <p>A paragraph that does not start with a space is trimmed and followed by {@code "\r\n"};
 * a paragraph that starts with a space is appended unchanged.
 *
 * @param path path of the .doc file to read
 * @return a map with two entries: {@code "result"} is {@code "0"} when reading succeeded and
 *         {@code "1"} when an exception occurred; {@code "content"} is the text collected
 *         before any failure
 */
public static Map<String, String> getContentDoc(String path) {
    Map<String, String> map = new HashMap<String, String>();
    StringBuilder content = new StringBuilder();
    String result = "0"; // "0" = read succeeded, "1" = an exception occurred
    InputStream is = null;
    // A real logger instance: the close handler used to dereference a null logger.
    java.util.logging.Logger logger =
            java.util.logging.Logger.getLogger("ImportWordPoiServiceImpl");
    try {
        is = new FileInputStream(new File(path));
        // Word 2003 format: WordExtractor handles .doc files only, not .docx.
        WordExtractor extractor = new WordExtractor(is);
        // Paragraph indentation is not available here; leading spaces can stand in for it.
        String[] paragraphText = extractor.getParagraphText();
        if (paragraphText != null) {
            for (String paragraph : paragraphText) {
                if (!paragraph.startsWith(" ")) {
                    content.append(paragraph.trim()).append("\r\n");
                } else {
                    content.append(paragraph);
                }
            }
        }
    } catch (Exception e) {
        // Previously swallowed silently; log it so a "1" result can be diagnosed.
        logger.info("doc解析正文异常:" + e);
        result = "1";
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                logger.info("" + e);
            }
        }
        map.put("result", result);
        map.put("content", content.toString());
    }
    return map;
}
- 利用正则表达式,解析选择题和多选题
/**
 * Parses choice questions out of a Word 2003 (.doc) question bank.
 *
 * <p>The first paragraph acts as the section heading: when it contains "选择题" every
 * paragraph longer than 6 characters is parsed as single-choice questions, and when it
 * contains "多选题" every paragraph longer than 8 characters is parsed as multiple-choice
 * questions. Within a paragraph, questions are separated by a digit followed by a dot, and
 * the stem and option lines of one question are separated by vertical tabs (U+000B).
 *
 * <p>Chunks with no answer marker are skipped, and a question with fewer than four options
 * keeps only the options that were found, so one malformed entry no longer aborts the import.
 *
 * @param stream input stream of the .doc file; it is not closed by this method
 * @return the parsed questions in document order; empty when the document has no paragraphs
 * @throws IOException if the document cannot be read from {@code stream}
 */
public static List<Questions> readWord2003(InputStream stream) throws IOException {
    HWPFDocument doc = new HWPFDocument(stream);
    Range range = doc.getRange(); // reading range of the whole document
    List<Questions> questions = new ArrayList<Questions>();
    int paragraphCount = range.numParagraphs();
    if (paragraphCount == 0) {
        return questions;
    }
    // The heading is the same for every paragraph, so read it once instead of per iteration.
    String heading = range.getParagraph(0).text();
    boolean singleChoice = heading.contains("选择题");
    boolean multipleChoice = heading.contains("多选题");
    for (int i = 0; i < paragraphCount; i++) {
        Paragraph p = range.getParagraph(i);
        String text = p.text();
        if (singleChoice && text.length() > 6) {
            collectChoiceQuestions(text, false, questions);
        }
        if (multipleChoice && text.length() > 8) {
            collectChoiceQuestions(text, true, questions);
        }
    }
    return questions;
}

/** Splits one paragraph into question chunks and appends every parsable question to {@code out}. */
private static void collectChoiceQuestions(String paragraphText, boolean multiple,
        List<Questions> out) {
    String[] chunks = paragraphText.split("\\d\\.");
    for (int k = 0; k < chunks.length; k++) {
        String chunk = chunks[k];
        if (chunk.length() <= 6) {
            continue;
        }
        String[] lines = chunk.split("\u000B");
        if (lines.length == 0) {
            // The chunk consisted of vertical tabs only.
            continue;
        }
        // Blank out the bracketed answer in the stem. The last alternative used to be an
        // empty-matching group that inserted "()" between every character of the stem.
        // NOTE(review): full-width brackets are presumably what was intended - confirm.
        String stem = lines[0].replaceAll("\\(.*?\\)|\\{.*?}|\\[.*?]|（.*?）", "()");
        List<String> marked = extractMessageByRegular(lines[0]);
        if (marked == null || marked.isEmpty()) {
            // No answer marker found: this chunk is not a question.
            continue;
        }
        String answer = marked.get(0).trim();
        String key = multiple ? toMultipleChoiceKey(answer) : toOptionNumber(answer);

        // Option texts follow the option letter and a dot on the lines after the stem.
        List<String> options = new ArrayList<String>();
        for (int h = 1; h < lines.length; h++) {
            String[] parts = lines[h].split("\\D\\.");
            for (int f = 1; f < parts.length; f++) {
                options.add(parts[f]);
            }
        }

        Questions question = new Questions();
        // NOTE(review): multiple-choice questions get the same type value as single-choice
        // ones, exactly as before - confirm whether they need a distinct type.
        question.setQtType(String.valueOf(1));
        question.setQtContent(stem);
        if (options.size() > 0) {
            question.setQtOp1(options.get(0));
        }
        if (options.size() > 1) {
            question.setQtOp2(options.get(1));
        }
        if (options.size() > 2) {
            question.setQtOp3(options.get(2));
        }
        if (options.size() > 3) {
            question.setQtOp4(options.get(3));
        }
        question.setQtAnsw(key);
        out.add(question);
    }
}

/** Maps the option letters A-D to "1"-"4"; any other value is returned unchanged. */
private static String toOptionNumber(String letter) {
    if (letter.equals("A")) {
        return "1";
    }
    if (letter.equals("B")) {
        return "2";
    }
    if (letter.equals("C")) {
        return "3";
    }
    if (letter.equals("D")) {
        return "4";
    }
    return letter;
}

/** Converts an answer such as "ACD" into the key "1|3|4", one entry per character. */
private static String toMultipleChoiceKey(String solution) {
    List<String> numbers = new ArrayList<String>();
    for (int n = 0; n < solution.length(); n++) {
        numbers.add(toOptionNumber(solution.substring(n, n + 1)));
    }
    return Joiner.on("|").join(numbers);
}
- main方法调用测试
public class ImportWordPoiServiceImpl {
/**
 * Manual smoke test for the Word readers.
 *
 * <p>Uses the first command-line argument as the file path when one is given; otherwise
 * falls back to the original hard-coded location. The results are printed so the run shows
 * what was actually parsed.
 *
 * @param args optional: {@code args[0]} is the path of the Word file to read
 * @throws FileNotFoundException kept for signature compatibility
 */
public static void main(String[] args) throws FileNotFoundException {
    String path = (args != null && args.length > 0) ? args[0] : "/Users/liukun/Downloads/";
    System.out.println(readWord(path));
    System.out.println(getContentDoc(path));
}