使用Java基于数据流直接抽取Word文本

[代码] [Java]代码

001public class WordExtractor {
002  
003    public static StringBuilder logBytes = new StringBuilder();
004  
005    public static String bytesToString(byte[] ogiBytes, int start, int length, int fc)
006    {
007        StringBuilder content = new StringBuilder();
008        byte[] bytes = new byte[length];
009        System.arraycopy(ogiBytes, start, bytes, 0, length);
010        if(fc == 0)
011        {
012            for(int i=0;i<bytes.length;i++)
013            {
014                if(i == bytes.length - 1)
015                {
016                    return content.toString();
017                }
018              
019                String a = Integer.toHexString(bytes[i+1] & 0xFF);
020                String b = Integer.toHexString(bytes[i] & 0xFF);
021                if(a.length() == 1)
022                {
023                    a = "0"+ a;
024                }
025              
026                if(b.length() == 1)
027                {
028                    b = "0"+ b;
029                }
030              
031                String hexStr = a + b;
032                int ch = Integer.valueOf(hexStr, 16);
033                content.append( (char)ch );
034                i++;
035            }
036        }
037        else
038        {
039            for(int i=0;i<bytes.length;i++)
040            {
041                int ch = bytes[i] & 0xFF;
042                content.append( (char)ch );
043            }
044        }
045      
046        return content.toString();
047    }
048  
049    public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc)
050    {
051        content.append( bytesToString(ogiBytes, start, length, fc) );
052    }
053  
054    public static void printLogBytes(List<Byte> legaled) throws Exception
055    {
056        logBytes = new StringBuilder();
057      
058        logBytes.append(" ========================================================");
059        for(int a=0;a<legaled.size();a++)
060        {
061            if(a % 16 == 0)
062            {
063                logBytes.append(" ");
064            }
065            logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF) +" ");
066        }
067        logBytes.append(" ========================================================");
068      
069        FileUtil.writeAscFile("E:ytes.txt", logBytes.toString());
070    }
071  
072    public static int getOneTable(byte[] ogiBytes, Stream stream, int dirSect1)
073    {
074        for(int i=0;i<8;i++)
075        {
076            int offsetEntry = (dirSect1 + 1)*512 + i*128;
077            StringBuilder content = new StringBuilder();
078            bytesToString(ogiBytes, content, offsetEntry, 64, 0);
079            if(content.toString().indexOf("1Table") > -1)
080            {
081                return offsetEntry;
082            }
083        }
084      
085        return 0;
086    }
087  
088    public static void main(String[] args) throws Exception
089    {
090        byte[] ogiBytes = FileUtil.readBinFile("D: oolsoletest est-old.doc");
091      
092        System.out.println("Total bytes: "+ ogiBytes.length);
093        if(
094                ogiBytes.length < 8         ||
095                (ogiBytes[0] & 0xFF) != 208 ||
096                (ogiBytes[1] & 0xFF) != 207 ||
097                (ogiBytes[2] & 0xFF) != 17     ||
098                (ogiBytes[3] & 0xFF) != 224 ||
099                (ogiBytes[4] & 0xFF) != 161 ||
100                (ogiBytes[5] & 0xFF) != 177 ||
101                (ogiBytes[6] & 0xFF) != 26     ||
102                (ogiBytes[7] & 0xFF) != 225
103        ){
104            System.out.println("Not the doc file!");
105            return;
106        }
107      
108        StringBuilder content = new StringBuilder();
109      
110        Stream stream = new Stream(ogiBytes);
111        int[] offset = new int[1];
112      
113        offset[0] = 48;
114        int dirSect1 = stream.getInteger(offset);
115        int oneTable = getOneTable(ogiBytes, stream, dirSect1);
116      
117        offset[0] = oneTable + 116;
118        int startSect = stream.getInteger(offset);
119        int tableStream = (startSect + 1)*512;
120      
121        offset[0] = 930;
122        int fcClx = stream.getInteger(offset);
123        if(fcClx == -1)
124        {
125            System.out.println("This version of doc can not be parsed!");
126            return;
127        }
128      
129        int offsetClx = tableStream + fcClx;
130      
131        offset[0] = offsetClx + 1;
132        int lcb = stream.getInteger(offset);
133     
134        int countPcd = (lcb - 4)/12;
135        int countCp = (lcb - countPcd*8)/4;
136        int offsetPlcpcd = offsetClx + 5;
137     
138        for(int i=0;i<countPcd;i++)
139        {
140            int offsetPcd = offsetPlcpcd + countCp*4 + i*8;
141          
142            offset[0] = offsetPcd + 2;
143            int start = stream.getInteger(offset);
144            int fc = start >> 30;
145            start = (start << 2) >> 2;
146     
147            offset[0] = offsetPlcpcd + i*4;
148            int cpPre = stream.getInteger(offset);
149            int cpNext = stream.getInteger(offset);
150            int length = cpNext - cpPre -1;
151            if(fc == 0)
152            {
153                length *= 2;
154            }
155            else
156            {
157                start = start/2;
158            }
159          
160            start += 512;
161            bytesToString(ogiBytes, content, start, length, fc);
162      
163            System.out.println(start +", "+ length);
164        }
165      
166        FileUtil.writeAscFile("E:output.txt", content.toString(), false);
167      
168        System.out.println("Done!");
169      
170    }
171}

[代码] FileUtil

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

/**
 * Helpers for reading and writing whole files in one call.
 *
 * <p>Text is always encoded and decoded as UTF-8. Every stream is closed
 * even when an I/O error interrupts the operation.
 */
public class FileUtil {

    /** Size of the scratch buffer used when draining a stream. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Reads the complete content of a file.
     *
     * @param path the file to read
     * @return all bytes of the file
     * @throws Exception if the file cannot be opened or read
     */
    public static byte[] readBinFile(String path) throws Exception
    {
        try (FileInputStream stream = new FileInputStream(path);
             ByteArrayOutputStream collected = new ByteArrayOutputStream())
        {
            // Loop until end of stream: a single read() may return fewer bytes
            // than requested, and available() is only an estimate.
            byte[] chunk = new byte[BUFFER_SIZE];
            int count;
            while((count = stream.read(chunk)) != -1)
            {
                collected.write(chunk, 0, count);
            }

            return collected.toByteArray();
        }
    }

    /**
     * Reads the complete content of a UTF-8 text file.
     *
     * @param path the file to read
     * @return the decoded text
     * @throws Exception if the file cannot be opened or read
     */
    public static String readAscFile(String path) throws Exception
    {
        try (FileInputStream input = new FileInputStream(path);
             InputStreamReader reader = new InputStreamReader(input, "UTF-8"))
        {
            StringBuilder sb = new StringBuilder();
            char[] chunk = new char[BUFFER_SIZE];
            int count;
            while((count = reader.read(chunk)) != -1)
            {
                sb.append(chunk, 0, count);
            }

            return sb.toString();
        }
    }

    /**
     * Appends bytes to the end of a file, creating the file if necessary.
     *
     * @param path   the file to write to
     * @param buffer the bytes to append
     * @throws Exception if the file cannot be opened or written
     */
    public synchronized static void writeBinFile(String path, byte[] buffer) throws Exception
    {
        try (FileOutputStream output = new FileOutputStream(path, true))
        {
            output.write(buffer);
            output.flush();
        }
    }

    /**
     * Appends text to the end of a file as UTF-8.
     *
     * @param path    the file to write to
     * @param content the text to append
     * @throws Exception if the file cannot be opened or written
     */
    public synchronized static void writeAscFile(String path, String content) throws Exception
    {
        writeAscFile(path, content, true);
    }

    /**
     * Writes text to a file as UTF-8.
     *
     * @param path    the file to write to
     * @param content the text to write
     * @param append  {@code true} to add to the end of the file,
     *                {@code false} to replace its content
     * @throws Exception if the file cannot be opened or written
     */
    public synchronized static void writeAscFile(String path, String content, boolean append) throws Exception
    {
        try (FileOutputStream output = new FileOutputStream(path, append);
             OutputStreamWriter writer = new OutputStreamWriter(output, "UTF-8"))
        {
            writer.append(content);
            writer.flush();
        }
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值