基于COM的Word、Excel、PowerPoint、PDF文本过滤提取

经过一个多星期的煎熬,终于推出我的Word、Excel、PowerPoint文档文本提取程序。现在将代码贴出,供大家批评指正。在此参考了一位国外大侠的文章,受益匪浅,只是那位大侠用的是C#,很多东西都是我凭第六感才解决的,呵呵~~

先在E盘下建一个名为123.doc的文档,然后运行程序,就会将该文档的文字提取出来;把文件名改一下,也可以提取Excel和PowerPoint的文本,这里比较懒,就不演示了,呵呵。

另外,若要解决其他类型(网页、邮件、二进制文件等等)的文本提取,只要知道相应的过滤DLL(例如Office用到的是Offfilt.dll),下载下来注册一下,从注册表中提取相应的类ID,就能实现该类型文件的文本提取。若你稍稍了解COM是怎么回事,这些工作应该难不倒你,我也就不废话了。

//CLSID_AND_IID.h

#ifndef CLSID_AND_IID_H
#define CLSID_AND_IID_H

// Constants shared by the text-extraction sample.
// This header does not include anything itself: GUID and LONG must already
// be declared by the includer (FindWord.cpp includes windows.h first).

//CLSID
// Class ID of the IFilter object; handed to CoCreateInstance by FindWord.cpp.
// NOTE(review): this is a definition with external linkage, so the header
// should be included from a single translation unit only.
extern "C" const GUID CLSID_Offfilt =
  { 0xf07f3920, 0x7b8c, 0x11cf,
  { 0x9b, 0xe8, 0x00, 0xaa, 0x00, 0x4b, 0x99, 0x86} } ;

// The values below are compared against the results of IFilter calls.
// They are constants, so they are declared static const: nothing can
// overwrite them and each translation unit gets its own private copy.

// GetChunk result: there are no more chunks to read.
static const LONG FILTER_E_END_OF_CHUNKS = (LONG)0x80041700L;

// GetText result: text was returned and it is the last text of this chunk.
static const LONG FILTER_S_LAST_TEXT = (LONG)0x00041709L;
// GetText result: the current chunk has no more text.
static const LONG FILTER_E_NO_MORE_TEXT = (LONG)0x80041701L;

#endif // CLSID_AND_IID_H

__________________________________________________________________________

//stdAndy.h

#if !defined(__stdAndy_H__)
#define __stdAndy_H__

// Headers for the COM / IFilter declarations used by FindWord.cpp.
#include "Unknwn.h"
#include "filter.h"

// Pointer-to-16-bit-unit string alias.
// The test below only sees a *macro* named String; it cannot detect an
// existing typedef of the same name.
#if !defined(String)
typedef unsigned short *String;
#endif

#endif // !defined(__stdAndy_H__)
__________________________________________________________________________

// FindWord.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include "windows.h"
#include <stdio.h>
#include <comutil.h>

#include "CLSID_AND_IID.h"
#include "stdAndy.h"


int main(int argc, char* argv[])
{
 IUnknown *pUnknown;
 IFilter* pFilter;
 IPersistFile* ipf;
 HRESULT hResult;

 if (CoInitialize(NULL) != S_OK) {
  printf("Initialize COM library failed!/n");
  return -1;
 }

 hResult = CoCreateInstance(CLSID_Offfilt, //the CLSID of IFilter
  NULL, CLSCTX_INPROC_SERVER, IID_IUnknown, (void **)&pUnknown);
 if (hResult != S_OK) {
  printf("Create filter object failed!/n");
  return -2;
 }

 hResult = pUnknown->QueryInterface(IID_IFilter, (void **)&pFilter);
 if (hResult != S_OK) {
  pUnknown->Release();
  printf("QueryInterface IFilter failed!/n");
  return -3;
 }

 hResult = pFilter->QueryInterface(IID_IPersistFile,(void**)&ipf);
 if (hResult != S_OK) {
  pUnknown->Release();
  printf("QueryInterface IPersistFile failed!/n");
  return -4;
 }

 hResult = ipf->Load(L"e://123.doc",0UL);
 if (hResult != S_OK) {
  pUnknown->Release();
  printf("QueryInterface IPersistFile failed!/n");
  return -4;
 }

 printf("Begin to work!/n");

 //the flags for GetChunk
    ULONG iflags =
     IFILTER_INIT_CANON_HYPHENS |
     IFILTER_INIT_CANON_PARAGRAPHS |
     IFILTER_INIT_CANON_SPACES |
     IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
     IFILTER_INIT_HARD_LINE_BREAKS;
 ULONG flags=0;

 hResult = pFilter->Init(iflags, 0, 0, &flags);
 if (hResult != S_OK) {
  pFilter->Release();
  printf("Initialize IFilter failed!/n");
  return -5;
 }

 LONG res;
 const int count = 4096; //the max size of a document in Word(Unicode)
 PWCHAR array = new WCHAR[count]; //the final buffer of all Unicode readed from document
 memset(array,0,count*2);
 int offset = 0;   //index the array position
 BOOL _done=FALSE;
 int charsRead=0;  //the number of readed Unicodes
 UINT endOfChunksCount = 0;//if it great than 1 means having load the last chunk
 PWCHAR _charsLeftFromLastRead = NULL;//the buffer in which left Unicodes from last read
 BOOL _currentChunkValid = FALSE;
 STAT_CHUNK _chunkStat;//description of current chunk

 while(!_done && charsRead<count){
  //
  if(_charsLeftFromLastRead != NULL){
   UINT charsToCopy = ( wcslen(_charsLeftFromLastRead)<(UINT)(count-charsRead) )? wcslen(_charsLeftFromLastRead):count-charsRead;
   wcsncpy(array+(offset+charsRead), _charsLeftFromLastRead, charsToCopy);
   charsRead+=charsToCopy;
   if(charsToCopy<wcslen(_charsLeftFromLastRead)){
    PWCHAR tmp = new WCHAR[wcslen(_charsLeftFromLastRead)-charsToCopy];
    memset(tmp,0,wcslen(tmp)*2);
    wcsncpy(tmp, _charsLeftFromLastRead+charsToCopy, wcslen(tmp));
    _charsLeftFromLastRead=tmp;
   }else{
    delete[] _charsLeftFromLastRead;
    _charsLeftFromLastRead=NULL;
   }

   continue;
  }

  //get the chunks
  if(!_currentChunkValid){
   res = pFilter->GetChunk(&_chunkStat);
 printf("0x%08x/n",res);
   _currentChunkValid=(res==S_OK) && ((_chunkStat.flags & CHUNK_TEXT)!=0);
 printf("0x%08x/n",_chunkStat.flags);
   if(res==FILTER_E_END_OF_CHUNKS)
    endOfChunksCount++;

   if(endOfChunksCount>1)
    _done=TRUE; //no more chuncks available
  }

  //get text
  if(_currentChunkValid){
   ULONG bufLength=(ULONG)(count-charsRead);
   if(bufLength<count*2){
    bufLength=count*2; //read ahead
   }
   PWCHAR buffer=new WCHAR[bufLength];
   memset(buffer,0,bufLength*2);
   res=pFilter->GetText(&bufLength, buffer);


 printf("0x%08x/n",res);
   if(res==S_OK || res==FILTER_S_LAST_TEXT){
    int cRead=(int)bufLength;
    if(cRead+charsRead>count){//add the no char tail to _charsLeftFromLastRead after last read.
     int charsLeft=(cRead+charsRead-count);
     _charsLeftFromLastRead=new WCHAR[charsLeft];
     memset(_charsLeftFromLastRead,0,charsLeft*2);
     wcsncpy(_charsLeftFromLastRead, buffer+(cRead-charsLeft), charsLeft);
     cRead-=charsLeft;
    }else{
     if(_charsLeftFromLastRead != NULL){
      delete[] _charsLeftFromLastRead;
      _charsLeftFromLastRead=NULL;
     }
    }

    wcsncpy(array+(offset+charsRead), buffer, cRead);
    charsRead+=cRead;
   }
   if(res==FILTER_S_LAST_TEXT || res==FILTER_E_NO_MORE_TEXT){
    _currentChunkValid=FALSE;
   }
  }
 }

 char* buffer2 = new char[count*2];
 memset(buffer2,0,count*2);
 WideCharToMultiByte( CP_ACP, WC_COMPOSITECHECK, array, charsRead, buffer2, charsRead*2, NULL, NULL);
 strcat(buffer2,"/n/n");
 printf(buffer2);
 wprintf(array);

 delete[] array;
 delete[] buffer2;

 pFilter->Release();
 CoUninitialize();
 return 0;
}

我想相应的过滤dll的作者是将各种类型的文件格式打包封装了,我曾经下载了pdf文件格式的描述文档,足足上千页,你若是看完了,什么都晚了.在此向编写过滤dll的各位大侠致敬.

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值