最近在忙别的事,没空关心 PDF 了。既然有人需要,我找了找,大概是下面这段代码:
// Regular expression that recognizes a Flate-compressed PDF stream object.
// Hand-written, so not necessarily general: it requires "/FlateDecode" inside a
// "<< ... >>" dictionary that contains no '>' characters, followed by the
// "stream" keyword, a line feed, the payload (capture group 1) and "endstream".
// Singleline lets '.' match line breaks, since the payload may contain them.
Regex streamRegex = new Regex(@"<<[^>]*/FlateDecode[^>]*>>\s*stream\s*\n(.*?)\s*endstream", RegexOptions.Singleline);
。。。。。
// Find the first Flate-compressed stream in the PDF text and inflate its payload.
match = streamRegex.Match(pdfContent);
if (match.Length > 0)
{
    // Capture group 1 is the raw payload between "stream" and "endstream".
    pdfContent = match.Groups[1].Value;

    // Skip the first two payload bytes before inflating (presumably the 2-byte
    // zlib header, which a raw DeflateStream does not expect - TODO confirm).
    // NOTE(review): a character index in pdfContent is used as a byte offset into
    // pdfBuf; this only lines up if the text was decoded one char per byte - confirm.
    documentContentStart += (match.Groups[1].Index + 2);
    strLen = match.Groups[1].Length;

    if (strLen < 2)
    {
        // Payload is too short to hold any compressed data; nothing to inflate.
        // (Previously execution fell through and tried to allocate a negative-length array.)
        strContent = "";
    }
    else
    {
        try
        {
            byte[] inflated;

            // Read the compressed bytes straight out of pdfBuf instead of copying them first.
            using (MemoryStream compressed = new MemoryStream(pdfBuf, documentContentStart, strLen - 2, false))
            using (DeflateStream deStream = new DeflateStream(compressed, CompressionMode.Decompress))
            using (MemoryStream output = new MemoryStream())
            {
                // A single Read may return fewer bytes than available and would cap the
                // result at one buffer, so keep reading until the stream reports the end.
                byte[] chunk = new byte[16 * 1024];
                int read;
                while ((read = deStream.Read(chunk, 0, chunk.Length)) > 0)
                {
                    output.Write(chunk, 0, read);
                }
                inflated = output.ToArray();
            }

            pdfContent = Encoding.Default.GetString(inflated);

            // Dump the decompressed text for inspection.
            File.WriteAllText("c:/tmp/pdftxt.txt", pdfContent);
        }
        catch (Exception ex)
        {
            // Keep the original failure as InnerException so the cause is not lost.
            throw new Exception("error inflate string", ex);
        }
    }
}
}
解压出来的文本还需要继续分析;算法比较复杂,需要用矩阵计算来判断各段文本的位置。暂时没空搞了。