基于COM的word,excel,powerpoint,pdf文本过滤提取

最新推荐文章于 2024-08-30 16:06:34 发布

SworderAndy

最新推荐文章于 2024-08-30 16:06:34 发布

阅读量2.3k

点赞数

文章标签： powerpoint buffer filter null delete attributes

本文链接：https://blog.csdn.net/SworderAndy/article/details/2172974

版权

经过一个多周的煎熬终于推出我的word,excel,powerpoint文档文本提取.现在将代码贴出供大家批评指正,在此参考了一位国外大侠的文章受益匪浅,只是那位大侠用的是C#,很多东西都是我通过第六感才解决的.呵呵~~

先在e盘下建一个名为123.doc的文档,然后运行程序就会将该文档的文字提取出来,改个名也可以提取excel和powerpoint的文本,在此不做了,比较懒,呵呵.

另外若要解决其他类型(网页,邮件,二进制文件等等)的文本提取,只要知道相应的过滤dll(e.g.Office用到Offfilt.dll),下载下来注册一下,从注册表中提取相应的类ID,就能实现任何类型文件文本提取.若你稍稍了解COM是怎么回事,这些工作应该难不倒你.我也不废话了.

//CLSID_AND_IID.h

// CLSID
// Class ID of the filter object that main() hands to CoCreateInstance to obtain
// an IFilter (the accompanying article names Offfilt.dll as the Office filter).
// NOTE(review): this is a definition with external linkage; including this
// header from more than one translation unit would duplicate it.
extern "C" const GUID CLSID_Offfilt =
{ 0xf07f3920, 0x7b8c, 0x11cf,
{ 0x9b, 0xe8, 0x00, 0xaa, 0x00, 0x4b, 0x99, 0x86} } ;

// The result codes below are `const` so that (in C++) they have internal
// linkage and cannot be modified at run time; a plain `LONG` global defined in
// a header is a multiple-definition error as soon as two source files include it.
// NOTE(review): the Windows SDK header filterr.h is believed to define the same
// names as macros; do not include both — confirm against your SDK.

// IFilter::GetChunk result compared against in main(): no further chunks.
const LONG FILTER_E_END_OF_CHUNKS = (LONG)0x80041700L;

// IFilter::GetText results compared against in main():
// success code meaning this call returned the last text of the current chunk.
const LONG FILTER_S_LAST_TEXT = (LONG)0x00041709L;
// failure code meaning the current chunk has no more text to return.
const LONG FILTER_E_NO_MORE_TEXT = (LONG)0x80041701L;

__________________________________________________________________________

//stdAndy.h

// Include guard. Renamed from __stdAndy_H__: identifiers containing a double
// underscore are reserved for the compiler and standard library.
#ifndef STDANDY_H
#define STDANDY_H

// COM declarations used by FindWord.cpp (IUnknown, IFilter, STAT_CHUNK and the
// IFILTER_INIT_* flags are expected to come from these two headers).
#include "Unknwn.h"
#include "filter.h"

// Pointer to a sequence of 16-bit code units.
// NOTE: #ifndef only sees preprocessor macros, so this check skips the typedef
// only when `String` is a macro; it does not detect an existing typedef or class
// named String.
#ifndef String
typedef unsigned short *String;
#endif

#endif // STDANDY_H
__________________________________________________________________________

// FindWord.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include "windows.h"
#include <stdio.h>
#include <comutil.h>

#include "CLSID_AND_IID.h"
#include "stdAndy.h"

int main(int argc, char* argv[])
{
IUnknown *pUnknown;
IFilter* pFilter;
IPersistFile* ipf;
HRESULT hResult;

if (CoInitialize(NULL) != S_OK) {
printf("Initialize COM library failed!/n");
return -1;
}

hResult = CoCreateInstance(CLSID_Offfilt, //the CLSID of IFilter
  NULL, CLSCTX_INPROC_SERVER, IID_IUnknown, (void **)&pUnknown);
if (hResult != S_OK) {
  printf("Create filter object failed!/n");
  return -2;
}

hResult = pUnknown->QueryInterface(IID_IFilter, (void **)&pFilter);
if (hResult != S_OK) {
  pUnknown->Release();
  printf("QueryInterface IFilter failed!/n");
  return -3;
}

hResult = pFilter->QueryInterface(IID_IPersistFile,(void**)&ipf);
if (hResult != S_OK) {
  pUnknown->Release();
  printf("QueryInterface IPersistFile failed!/n");
  return -4;
}

hResult = ipf->Load(L"e://123.doc",0UL);
if (hResult != S_OK) {
  pUnknown->Release();
  printf("QueryInterface IPersistFile failed!/n");
  return -4;
}

printf("Begin to work!/n");

//the flags for GetChunk
    ULONG iflags =
     IFILTER_INIT_CANON_HYPHENS |
     IFILTER_INIT_CANON_PARAGRAPHS |
     IFILTER_INIT_CANON_SPACES |
     IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
     IFILTER_INIT_HARD_LINE_BREAKS;
ULONG flags=0;

hResult = pFilter->Init(iflags, 0, 0, &flags);
if (hResult != S_OK) {
  pFilter->Release();
  printf("Initialize IFilter failed!/n");
  return -5;
}

LONG res;
const int count = 4096; //the max size of a document in Word(Unicode)
PWCHAR array = new WCHAR[count]; //the final buffer of all Unicode readed from document
memset(array,0,count*2);
int offset = 0; //index the array position
BOOL _done=FALSE;
int charsRead=0; //the number of readed Unicodes
UINT endOfChunksCount = 0;//if it great than 1 means having load the last chunk
PWCHAR _charsLeftFromLastRead = NULL;//the buffer in which left Unicodes from last read
BOOL _currentChunkValid = FALSE;
STAT_CHUNK _chunkStat;//description of current chunk

while(!_done && charsRead<count){
  //
  if(_charsLeftFromLastRead != NULL){
   UINT charsToCopy = ( wcslen(_charsLeftFromLastRead)<(UINT)(count-charsRead) )? wcslen(_charsLeftFromLastRead):count-charsRead;
   wcsncpy(array+(offset+charsRead), _charsLeftFromLastRead, charsToCopy);
   charsRead+=charsToCopy;
   if(charsToCopy<wcslen(_charsLeftFromLastRead)){
    PWCHAR tmp = new WCHAR[wcslen(_charsLeftFromLastRead)-charsToCopy];
    memset(tmp,0,wcslen(tmp)*2);
    wcsncpy(tmp, _charsLeftFromLastRead+charsToCopy, wcslen(tmp));
    _charsLeftFromLastRead=tmp;
   }else{
    delete[] _charsLeftFromLastRead;
    _charsLeftFromLastRead=NULL;
   }

continue;
}

  //get the chunks
  if(!_currentChunkValid){
   res = pFilter->GetChunk(&_chunkStat);
printf("0x%08x/n",res);
   _currentChunkValid=(res==S_OK) && ((_chunkStat.flags & CHUNK_TEXT)!=0);
printf("0x%08x/n",_chunkStat.flags);
   if(res==FILTER_E_END_OF_CHUNKS)
    endOfChunksCount++;

   if(endOfChunksCount>1)
    _done=TRUE; //no more chuncks available
  }

  //get text
  if(_currentChunkValid){
   ULONG bufLength=(ULONG)(count-charsRead);
   if(bufLength<count*2){
    bufLength=count*2; //read ahead
   }
   PWCHAR buffer=new WCHAR[bufLength];
   memset(buffer,0,bufLength*2);
   res=pFilter->GetText(&bufLength, buffer);

printf("0x%08x/n",res);
   if(res==S_OK || res==FILTER_S_LAST_TEXT){
    int cRead=(int)bufLength;
    if(cRead+charsRead>count){//add the no char tail to _charsLeftFromLastRead after last read.
     int charsLeft=(cRead+charsRead-count);
     _charsLeftFromLastRead=new WCHAR[charsLeft];
     memset(_charsLeftFromLastRead,0,charsLeft*2);
     wcsncpy(_charsLeftFromLastRead, buffer+(cRead-charsLeft), charsLeft);
     cRead-=charsLeft;
    }else{
     if(_charsLeftFromLastRead != NULL){
      delete[] _charsLeftFromLastRead;
      _charsLeftFromLastRead=NULL;
     }
    }

    wcsncpy(array+(offset+charsRead), buffer, cRead);
    charsRead+=cRead;
   }
   if(res==FILTER_S_LAST_TEXT || res==FILTER_E_NO_MORE_TEXT){
    _currentChunkValid=FALSE;
   }
  }
}

char* buffer2 = new char[count*2];
memset(buffer2,0,count*2);
WideCharToMultiByte( CP_ACP, WC_COMPOSITECHECK, array, charsRead, buffer2, charsRead*2, NULL, NULL);
strcat(buffer2,"/n/n");
printf(buffer2);
wprintf(array);

delete[] array;
delete[] buffer2;

pFilter->Release();
CoUninitialize();
return 0;
}

我想相应的过滤dll的作者是将各种类型的文件格式打包封装了,我曾经下载了pdf文件格式的描述文档,足足上千页,你若是看完了,什么都晚了.在此向编写过滤dll的各位大侠致敬.

SworderAndy

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
基于COM的word,execl,powerpoint,pdf文本过滤提取

经过一个多周的煎熬终于推出我的word,execl,powerpoint文档文本提取.现在将代码贴出供大家批评指正,在此参考了一位国外大侠的文章受益非浅,只是那位大侠用的是C#,很多东西都是我通过第六感才解决的.呵呵~~先在e盘下建一个名为123.doc的文档,然后运行程序就会将该文档的文字提取出来,改个名也可以提取execl和powerpoint的文本,在此不做了,比较懒,呵呵.另外若要
复制链接

扫一扫