java读取ppt数据_使用Java基于数据流直接抽取ppt文本

该博客介绍了一个Java程序，用于直接从PPT文件中通过数据流读取和抽取文本内容。程序首先检查文件头来确认是否为PPT文件，然后定位到PowerPoint文档的起始位置，接着查找并解析文本记录，最终将提取的文本写入输出文件。
摘要由CSDN通过智能技术生成

1.[代码][Java]代码

public class PPTExtractor {

public static StringBuilder logBytes = new StringBuilder();

public static int getPPTDcoument(byte[] ogiBytes, Stream stream, int dirSect1)

{

for(int i=0;i<8;i++)

{

int offsetEntry = (dirSect1 + 1)*512 + i*128;

StringBuilder content = new StringBuilder();

bytesToString(ogiBytes, content, offsetEntry, 64, 0);

if(content.toString().indexOf("PowerPoint Document") > -1)

{

return offsetEntry;

}

}

return 0;

}

public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc)

{

byte[] bytes = new byte[length];

System.arraycopy(ogiBytes, start, bytes, 0, length);

if(fc == 0)

{

for(int i=0;i

{

if(i == bytes.length - 1)

{

return;

}

String hexStr = Integer.toHexString(bytes[i+1] & 0xFF) + Integer.toHexString(bytes[i] & 0xFF);

int ch = Integer.valueOf(hexStr, 16);

content.append( (char)ch );

i++;

}

}

else

{

for(int i=0;i

{

int ch = bytes[i] & 0xFF;

content.append( (char)ch );

}

}

}

public static void printLogBytes(List legaled) throws Exception

{

logBytes = new StringBuilder();

logBytes.append("\n========================================================");

for(int a=0;a

{

if(a % 16 == 0)

{

logBytes.append("\n");

}

logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF) +" ");

}

logBytes.append("\n========================================================");

FileUtil.writeAscFile("E:\\bytes.txt", logBytes.toString());

}

public static int findTextRecords(Stream stream, byte[] bytes, int start, StringBuilder content, int[] offset)

{

byte opt = bytes[start];

int container = opt & 0x0f;

if(container == 0x0f)

{

return start+8;

}

offset[0] = start + 2;

int type = stream.getShort(offset);

offset[0] = start + 4;

int len = stream.getInteger(offset);

if(type == 0x0FA8)

{

bytesToString(bytes, content, start+8, len, 1);

System.out.println("Text Bytes Atom found!");

}

if(type == 0x0FA0)

{

bytesToString(bytes, content, start+8, len, 0);

System.out.println("Text Chars Atom found!");

}

int newStart = start + 8 + len;

if(newStart > bytes.length - 8)

{

newStart = -1;

}

return newStart;

}

public static void main(String[] args) throws Exception

{

byte[] ogiBytes = FileUtil.readBinFile("D:\\tools\\oletest\\cn-t.ppt");

System.out.println("Total bytes: "+ ogiBytes.length);

if(

ogiBytes.length < 8 ||

(ogiBytes[0] & 0xFF) != 208 ||

(ogiBytes[1] & 0xFF) != 207 ||

(ogiBytes[2] & 0xFF) != 17 ||

(ogiBytes[3] & 0xFF) != 224 ||

(ogiBytes[4] & 0xFF) != 161 ||

(ogiBytes[5] & 0xFF) != 177 ||

(ogiBytes[6] & 0xFF) != 26 ||

(ogiBytes[7] & 0xFF) != 225

){

System.out.println("Not the ppt file!");

return;

}

Stream stream = new Stream(ogiBytes);

int[] offset = new int[1];

offset[0] = 48;

int dirSect1 = stream.getInteger(offset);

int pptDocument = getPPTDcoument(ogiBytes, stream, dirSect1);

if(pptDocument <= 0)

{

System.out.println("This version of ppt can not be parsed!");

return;

}

offset[0] = pptDocument + 116;

int startSect = stream.getInteger(offset);

int docStart = (startSect + 1)*512;

int docLength = stream.getInteger(offset);

byte[] bytes = new byte[docLength];

System.arraycopy(ogiBytes, docStart, bytes, 0, docLength);

stream = new Stream(bytes);

StringBuilder content = new StringBuilder();

int start = 0;

while(start != -1)

{

start = findTextRecords(stream, bytes, start, content, offset);

}

FileUtil.writeAscFile("E:\\output.txt", content.toString(), false);

System.out.println("Done!");

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值