Java 抽取文本:使用 Java 基于数据流直接抽取 Word 文本

以下代码直接基于数据流进行文本抽取,支持 Word 97 至 Word 2003 版本(.doc)。之后的版本(.docx)实际上都是 XML,抽取文本非常简单,因此此处不再说明。代码仅供研究学习使用,禁止用于商业用途。

public class WordExtractor {

public static StringBuilder logBytes = new StringBuilder();

public static String bytesToString(byte[] ogiBytes, int start, int length, int fc)

{

StringBuilder content = new StringBuilder();

byte[] bytes = new byte[length];

System.arraycopy(ogiBytes, start, bytes, 0, length);

if(fc == 0)

{

for(int i=0;i

{

if(i == bytes.length - 1)

{

return content.toString();

}

String a = Integer.toHexString(bytes[i+1] & 0xFF);

String b = Integer.toHexString(bytes[i] & 0xFF);

if(a.length() == 1)

{

a = "0"+ a;

}

if(b.length() == 1)

{

b = "0"+ b;

}

String hexStr = a + b;

int ch = Integer.valueOf(hexStr, 16);

content.append( (char)ch );

i++;

}

}

else

{

for(int i=0;i

{

int ch = bytes[i] & 0xFF;

content.append( (char)ch );

}

}

return content.toString();

}

public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc)

{

content.append( bytesToString(ogiBytes, start, length, fc) );

}

public static void printLogBytes(List legaled) throws Exception

{

logBytes = new StringBuilder();

logBytes.append(" ========================================================");

for(int a=0;a

{

if(a % 16 == 0)

{

logBytes.append(" ");

}

logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF) +" ");

}

logBytes.append(" ========================================================");

FileUtil.writeAscFile("E:ytes.txt", logBytes.toString());

}

public static int getOneTable(byte[] ogiBytes, Stream stream, int dirSect1)

{

for(int i=0;i<8;i++)

{

int offsetEntry = (dirSect1 + 1)*512 + i*128;

StringBuilder content = new StringBuilder();

bytesToString(ogiBytes, content, offsetEntry, 64, 0);

if(content.toString().indexOf("1Table") > -1)

{

return offsetEntry;

}

}

return 0;

}

public static void main(String[] args) throws Exception

{

byte[] ogiBytes = FileUtil.readBinFile("D: oolsoletest est-old.doc");

System.out.println("Total bytes: "+ ogiBytes.length);

if(

ogiBytes.length < 8         ||

(ogiBytes[0] & 0xFF) != 208 ||

(ogiBytes[1] & 0xFF) != 207 ||

(ogiBytes[2] & 0xFF) != 17     ||

(ogiBytes[3] & 0xFF) != 224 ||

(ogiBytes[4] & 0xFF) != 161 ||

(ogiBytes[5] & 0xFF) != 177 ||

(ogiBytes[6] & 0xFF) != 26     ||

(ogiBytes[7] & 0xFF) != 225

){

System.out.println("Not the doc file!");

return;

}

StringBuilder content = new StringBuilder();

Stream stream = new Stream(ogiBytes);

int[] offset = new int[1];

offset[0] = 48;

int dirSect1 = stream.getInteger(offset);

int oneTable = getOneTable(ogiBytes, stream, dirSect1);

offset[0] = oneTable + 116;

int startSect = stream.getInteger(offset);

int tableStream = (startSect + 1)*512;

offset[0] = 930;

int fcClx = stream.getInteger(offset);

if(fcClx == -1)

{

System.out.println("This version of doc can not be parsed!");

return;

}

int offsetClx = tableStream + fcClx;

offset[0] = offsetClx + 1;

int lcb = stream.getInteger(offset);

int countPcd = (lcb - 4)/12;

int countCp = (lcb - countPcd*8)/4;

int offsetPlcpcd = offsetClx + 5;

for(int i=0;i

{

int offsetPcd = offsetPlcpcd + countCp*4 + i*8;

offset[0] = offsetPcd + 2;

int start = stream.getInteger(offset);

int fc = start >> 30;

start = (start << 2) >> 2;

offset[0] = offsetPlcpcd + i*4;

int cpPre = stream.getInteger(offset);

int cpNext = stream.getInteger(offset);

int length = cpNext - cpPre -1;

if(fc == 0)

{

length *= 2;

}

else

{

start = start/2;

}

start += 512;

bytesToString(ogiBytes, content, start, length, fc);

System.out.println(start +", "+ length);

}

FileUtil.writeAscFile("E:output.txt", content.toString(), false);

System.out.println("Done!");

}

}

参考资料:

http://blog.csdn.net/hu0406/article/details/3157192

http://msdn.microsoft.com/zh-cn/library/gg615596.aspx

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值