package com.daren.poi.word;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
/**
 * Reads text and embedded pictures from a binary Word document using the
 * Apache POI HWPF classes.
 *
 * <p>Typical usage: call {@link #initHttpExtractor(String)} or
 * {@link #initLocalExtractor(String)} to load a document, call the read
 * methods, then call {@link #destory()} to release the underlying stream and
 * connection. Loading failures are reported on standard error via
 * {@code printStackTrace()} and leave the extractor without a document.
 *
 * <p>The class keeps mutable state in plain fields and performs no
 * synchronization.
 */
public class MSWordExtractor
{
    /** The currently loaded document, or {@code null} when nothing is loaded. */
    private HWPFDocument msWord;
    /** The HTTP connection the document was fetched over, if any. */
    private HttpURLConnection connection;
    /** The stream the document was read from; closed by {@link #destory()}. */
    private InputStream inputStream;

    /**
     * Loads a Word document from an HTTP URL.
     *
     * <p>Any stream or connection left over from a previous load is released
     * first. On failure the exception is printed, the partially opened
     * resources are released, and no document is loaded.
     *
     * @param fileurl the URL of the Word file
     */
    public void initHttpExtractor(String fileurl)
    {
        // Drop whatever a previous init left behind so handles are not leaked
        // and a failed load cannot leave a stale document in place.
        releaseResources();
        msWord = null;
        try
        {
            URL url = new URL(fileurl);
            connection = (HttpURLConnection) url.openConnection();
            connection.connect();
            inputStream = connection.getInputStream();
            msWord = new HWPFDocument(inputStream);
        }
        catch (Exception e)
        {
            e.printStackTrace();
            releaseResources();
        }
    }

    /**
     * Loads a Word document from the local file system.
     *
     * <p>Any stream or connection left over from a previous load is released
     * first. On failure the exception is printed, the partially opened
     * resources are released, and no document is loaded.
     *
     * @param filepath the path of the Word file
     */
    public void initLocalExtractor(String filepath)
    {
        releaseResources();
        msWord = null;
        try
        {
            inputStream = new FileInputStream(filepath);
            msWord = new HWPFDocument(inputStream);
        }
        catch (Exception e)
        {
            e.printStackTrace();
            releaseResources();
        }
    }

    /**
     * Releases the input stream and HTTP connection after the document has
     * been read. Safe to call more than once.
     *
     * <p>The method name keeps its original spelling so existing callers
     * continue to compile.
     */
    public void destory()
    {
        releaseResources();
    }

    /**
     * Returns the text of every paragraph in the document.
     *
     * @return one array element per paragraph, in document order
     * @throws IllegalStateException if no document has been loaded
     */
    public String[] getParagraphTexts()
    {
        Range range = requireDocument().getRange();
        int numParagraph = range.numParagraphs();
        String[] paragraphs = new String[numParagraph];
        for (int i = 0; i < numParagraph; i++)
        {
            Paragraph p = range.getParagraph(i);
            paragraphs[i] = p.text();
        }
        return paragraphs;
    }

    /**
     * Returns the full text of the document.
     *
     * @return the text of the document's overall range
     * @throws IllegalStateException if no document has been loaded
     */
    public String getMSWordText()
    {
        return requireDocument().getRange().text();
    }

    /**
     * Saves every picture found in the document into the given directory and
     * replaces the text of the character run holding each picture with the
     * picture's file name.
     *
     * <p>Any exception raised while extracting is printed and ends the
     * extraction; pictures written before the failure remain on disk.
     *
     * @param directory the directory the picture files are written to
     */
    public void extractImages(String directory)
    {
        try
        {
            HWPFDocument doc = requireDocument();
            PicturesTable pTable = doc.getPicturesTable();
            int numCharacterRuns = doc.getRange().numCharacterRuns();
            for (int i = 0; i < numCharacterRuns; i++)
            {
                // The range is fetched again on every pass, as the original
                // code did, because replaceText below edits the document.
                CharacterRun characterRun = doc.getRange().getCharacterRun(i);
                if (pTable.hasPicture(characterRun))
                {
                    Picture pic = pTable.extractPicture(characterRun, false);
                    String fileName = pic.suggestFullFileName();
                    OutputStream out = new FileOutputStream(new File(directory, fileName));
                    try
                    {
                        pic.writeImageContent(out);
                    }
                    finally
                    {
                        // Close each image file as soon as it is written;
                        // otherwise one handle leaks per extracted picture.
                        out.close();
                    }
                    characterRun.replaceText(characterRun.text(), fileName);
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Returns the loaded document or fails with a descriptive error instead
     * of a bare {@code NullPointerException}.
     */
    private HWPFDocument requireDocument()
    {
        if (msWord == null)
        {
            throw new IllegalStateException(
                    "No Word document is loaded; call initHttpExtractor or initLocalExtractor first");
        }
        return msWord;
    }

    /**
     * Closes the stream and disconnects the connection independently, so a
     * failure in one does not skip the other, then clears both fields.
     */
    private void releaseResources()
    {
        if (inputStream != null)
        {
            try
            {
                inputStream.close();
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
            inputStream = null;
        }
        if (connection != null)
        {
            try
            {
                connection.disconnect();
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
            connection = null;
        }
    }

    /**
     * Demo entry point: loads a document over HTTP and another from disk,
     * printing the text before and after picture extraction.
     */
    public static void main(String args[])
    {
        String httpfile = "http://bus.vodone.com:8080/ids/test.doc";
        MSWordExtractor mshttp = new MSWordExtractor();
        try
        {
            mshttp.initHttpExtractor(httpfile);
            System.out.println("[===]\n" + mshttp.getMSWordText());
            mshttp.extractImages("C:\\");
            System.out.println("[===]\n" + mshttp.getMSWordText());
        }
        finally
        {
            mshttp.destory();
        }

        String localfile = "c:\\test.doc";
        MSWordExtractor lochttp = new MSWordExtractor();
        try
        {
            lochttp.initLocalExtractor(localfile);
            System.out.println("[===]\n" + lochttp.getMSWordText());
            lochttp.extractImages("C:\\");
            System.out.println("[===]\n" + lochttp.getMSWordText());
        }
        finally
        {
            // The original released mshttp a second time here and never
            // closed the local extractor's file stream.
            lochttp.destory();
        }
    }
}