get deflate stream 的代码

最近在忙别的事情,没时间继续研究 PDF 了。既然有人需要,我找了一下,大概是下面这段代码。

// Regular expression that recognizes a Flate-compressed PDF stream object.
// Hand-written for the author's sample files, so it is not guaranteed to cover every PDF.
// Group 1 captures the text between the "stream" line and "endstream".
// RegexOptions.Singleline lets '.' span line breaks inside the stream body.
Regex streamRegex = new Regex(@"<<[^>]*/FlateDecode[^>]*>>\s*stream\s*\n(.*?)\s*endstream", RegexOptions.Singleline);

。。。。。

               match = streamRegex.Match(pdfContent);
                if (match.Length > 0)
                {
                    pdfContent = match.Groups[1].Value;
                    documentContentStart += (match.Groups[1].Index + 2);
                    strLen = match.Groups[1].Length;
                    if (strLen < 2) {
                        strContent = "";
                    }
                    byte[] bufTemp = new byte[strLen - 2];
                    try
                    {
                        for (int i =0; i < strLen - 2; i++)    
                        {
                            bufTemp[i] = pdfBuf[documentContentStart + i];                                   
                        }

                        MemoryStream ms = new MemoryStream();
                        ms.Write(bufTemp, 0, bufTemp.Length);
                        ms.Position = 0;

//解压缩
                        DeflateStream deStream = new DeflateStream(ms, CompressionMode.Decompress, true);
                        //GZipStream deStream = new GZipStream(ms, CompressionMode.Decompress);
                        deStream.Flush();

                        int nSize = 16 * 1024;
                        byte[] decompressedBuffer = new byte[nSize]; //16*1024 + 256 Maxium
                        int totalCount = deStream.Read(decompressedBuffer, 0, nSize);
                        //int totalCount = ReadAllBytesFromStream(deStream, decompressedBuffer);
                        deStream.Close();
                        pdfContent = Encoding.Default.GetString(decompressedBuffer, 0, totalCount);
                        File.WriteAllText("c:/tmp/pdftxt.txt", pdfContent);
                        //strContent = strContent + "<P>" + pdfContent + "</P>";
                    }
                    catch (Exception ex)
                    {
                        throw new Exception("error inflate string", ex);
                    }
                }
           
            }
           

 

解压出来的文本还需要进一步解析:算法比较复杂,需要通过矩阵运算来判断各段文本的位置,暂时没有时间继续做了。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值