经过一个多星期的煎熬,终于做出了我的 Word、Excel、PowerPoint 文档文本提取程序.现在将代码贴出供大家批评指正.在此参考了一位国外大侠的文章,受益匪浅,只是那位大侠用的是C#,很多东西都是我通过第六感才解决的.呵呵~~
先在E盘根目录下建一个名为123.doc的文档,然后运行程序就会将该文档的文字提取出来.把程序里的文件名改一下,也可以提取 Excel 和 PowerPoint 文档的文本,在此不做了,比较懒,呵呵.
另外若要解决其他类型(网页,邮件,二进制文件等等)的文本提取,只要知道相应的过滤dll(例如Office用到的是Offfilt.dll),下载下来注册一下,从注册表中提取相应的类ID(CLSID),就能实现该类型文件的文本提取.若你稍稍了解COM是怎么回事,这些工作应该难不倒你.我也不废话了.
//CLSID_AND_IID.h
// Class ID of the filter object plus the IFilter status codes that
// FindWord.cpp compares against.

// CLSID passed to CoCreateInstance to create the filter object
// (per the accompanying article, the Office filter shipped in Offfilt.dll).
// NOTE(review): this is a definition with external linkage, so this header
// must be included from exactly one translation unit.
extern "C" const GUID CLSID_Offfilt =
{ 0xf07f3920, 0x7b8c, 0x11cf,
{ 0x9b, 0xe8, 0x00, 0xaa, 0x00, 0x4b, 0x99, 0x86} } ;

// The status codes below are declared const so that they cannot be modified
// at run time and, in C++, get internal linkage (no duplicate-symbol errors
// if the header is ever included from more than one source file).

// IFilter::GetChunk status: there are no more chunks in the document.
const LONG FILTER_E_END_OF_CHUNKS = (LONG)0x80041700L;

// IFilter::GetText status (success): this call returned the last text of the
// current chunk.
const LONG FILTER_S_LAST_TEXT = (LONG)0x00041709L;

// IFilter::GetText status: the current chunk has no more text.
const LONG FILTER_E_NO_MORE_TEXT = (LONG)0x80041701L;
__________________________________________________________________________
//stdAndy.h
// Shared header for the text-extraction sample: pulls in the COM headers the
// sample relies on and declares a wide-string pointer alias.
// NOTE(review): identifiers containing a double underscore, such as the guard
// macro below, are reserved for the implementation; consider renaming it.
#ifndef __stdAndy_H__
#define __stdAndy_H__
#include "Unknwn.h"   // presumably provides IUnknown, used by FindWord.cpp
#include "filter.h"   // presumably provides IFilter, STAT_CHUNK and the IFILTER_INIT_* flags
// NOTE(review): #ifndef only detects a *macro* named String; it does not
// protect against an existing typedef or class with that name.
#ifndef String
// Pointer to a sequence of 16-bit units (intended as a UTF-16 string).
typedef unsigned short *String;
#endif
#endif // __stdAndy_H__
__________________________________________________________________________
// FindWord.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "windows.h"
#include <stdio.h>
#include <comutil.h>
#include "CLSID_AND_IID.h"
#include "stdAndy.h"
int main(int argc, char* argv[])
{
IUnknown *pUnknown;
IFilter* pFilter;
IPersistFile* ipf;
HRESULT hResult;
if (CoInitialize(NULL) != S_OK) {
printf("Initialize COM library failed!/n");
return -1;
}
hResult = CoCreateInstance(CLSID_Offfilt, //the CLSID of IFilter
NULL, CLSCTX_INPROC_SERVER, IID_IUnknown, (void **)&pUnknown);
if (hResult != S_OK) {
printf("Create filter object failed!/n");
return -2;
}
hResult = pUnknown->QueryInterface(IID_IFilter, (void **)&pFilter);
if (hResult != S_OK) {
pUnknown->Release();
printf("QueryInterface IFilter failed!/n");
return -3;
}
hResult = pFilter->QueryInterface(IID_IPersistFile,(void**)&ipf);
if (hResult != S_OK) {
pUnknown->Release();
printf("QueryInterface IPersistFile failed!/n");
return -4;
}
hResult = ipf->Load(L"e://123.doc",0UL);
if (hResult != S_OK) {
pUnknown->Release();
printf("QueryInterface IPersistFile failed!/n");
return -4;
}
printf("Begin to work!/n");
//the flags for GetChunk
ULONG iflags =
IFILTER_INIT_CANON_HYPHENS |
IFILTER_INIT_CANON_PARAGRAPHS |
IFILTER_INIT_CANON_SPACES |
IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
IFILTER_INIT_HARD_LINE_BREAKS;
ULONG flags=0;
hResult = pFilter->Init(iflags, 0, 0, &flags);
if (hResult != S_OK) {
pFilter->Release();
printf("Initialize IFilter failed!/n");
return -5;
}
LONG res;
const int count = 4096; //the max size of a document in Word(Unicode)
PWCHAR array = new WCHAR[count]; //the final buffer of all Unicode readed from document
memset(array,0,count*2);
int offset = 0; //index the array position
BOOL _done=FALSE;
int charsRead=0; //the number of readed Unicodes
UINT endOfChunksCount = 0;//if it great than 1 means having load the last chunk
PWCHAR _charsLeftFromLastRead = NULL;//the buffer in which left Unicodes from last read
BOOL _currentChunkValid = FALSE;
STAT_CHUNK _chunkStat;//description of current chunk
while(!_done && charsRead<count){
//
if(_charsLeftFromLastRead != NULL){
UINT charsToCopy = ( wcslen(_charsLeftFromLastRead)<(UINT)(count-charsRead) )? wcslen(_charsLeftFromLastRead):count-charsRead;
wcsncpy(array+(offset+charsRead), _charsLeftFromLastRead, charsToCopy);
charsRead+=charsToCopy;
if(charsToCopy<wcslen(_charsLeftFromLastRead)){
PWCHAR tmp = new WCHAR[wcslen(_charsLeftFromLastRead)-charsToCopy];
memset(tmp,0,wcslen(tmp)*2);
wcsncpy(tmp, _charsLeftFromLastRead+charsToCopy, wcslen(tmp));
_charsLeftFromLastRead=tmp;
}else{
delete[] _charsLeftFromLastRead;
_charsLeftFromLastRead=NULL;
}
continue;
}
//get the chunks
if(!_currentChunkValid){
res = pFilter->GetChunk(&_chunkStat);
printf("0x%08x/n",res);
_currentChunkValid=(res==S_OK) && ((_chunkStat.flags & CHUNK_TEXT)!=0);
printf("0x%08x/n",_chunkStat.flags);
if(res==FILTER_E_END_OF_CHUNKS)
endOfChunksCount++;
if(endOfChunksCount>1)
_done=TRUE; //no more chuncks available
}
//get text
if(_currentChunkValid){
ULONG bufLength=(ULONG)(count-charsRead);
if(bufLength<count*2){
bufLength=count*2; //read ahead
}
PWCHAR buffer=new WCHAR[bufLength];
memset(buffer,0,bufLength*2);
res=pFilter->GetText(&bufLength, buffer);
printf("0x%08x/n",res);
if(res==S_OK || res==FILTER_S_LAST_TEXT){
int cRead=(int)bufLength;
if(cRead+charsRead>count){//add the no char tail to _charsLeftFromLastRead after last read.
int charsLeft=(cRead+charsRead-count);
_charsLeftFromLastRead=new WCHAR[charsLeft];
memset(_charsLeftFromLastRead,0,charsLeft*2);
wcsncpy(_charsLeftFromLastRead, buffer+(cRead-charsLeft), charsLeft);
cRead-=charsLeft;
}else{
if(_charsLeftFromLastRead != NULL){
delete[] _charsLeftFromLastRead;
_charsLeftFromLastRead=NULL;
}
}
wcsncpy(array+(offset+charsRead), buffer, cRead);
charsRead+=cRead;
}
if(res==FILTER_S_LAST_TEXT || res==FILTER_E_NO_MORE_TEXT){
_currentChunkValid=FALSE;
}
}
}
char* buffer2 = new char[count*2];
memset(buffer2,0,count*2);
WideCharToMultiByte( CP_ACP, WC_COMPOSITECHECK, array, charsRead, buffer2, charsRead*2, NULL, NULL);
strcat(buffer2,"/n/n");
printf(buffer2);
wprintf(array);
delete[] array;
delete[] buffer2;
pFilter->Release();
CoUninitialize();
return 0;
}
我想相应的过滤dll的作者是将各种类型的文件格式打包封装了,我曾经下载了pdf文件格式的描述文档,足足上千页,你若是看完了,什么都晚了.在此向编写过滤dll的各位大侠致敬.