PPT提取文字C代码实现

本文介绍了一种使用C/C++(基于Windows OLE结构化存储接口)从PowerPoint 97格式的PPT文件中提取文本的方法,并给出了头文件file_read.h和实现文件file_read.cpp的代码。
摘要由CSDN通过智能技术生成

以下是用C++实现的提取PPT文本的代码

头文件 file_read.h


#include "stdafx.h"

//Appendix B
// Sample code to read the text out of a PowerPoint '97 presentation.

#include <ole2.h>
#include <stdio.h>
//#include <time.h>

// Integer aliases taken from app\sertypes.h.
// The sizes are system dependent: the byte counts noted below hold only where
// `long` is 32 bits and `short` is 16 bits.
// NOTE(review): <ole2.h> suggests a Windows target, where that is the case;
// confirm before building for any other platform.

typedef signed long sint4; // signed 4-byte integral value
typedef signed short sint2; // signed 2-byte integral value
typedef unsigned long uint4; // unsigned 4-byte integral value
typedef unsigned short uint2; // unsigned 2-byte integral value
typedef char bool1; // 1-byte boolean
typedef unsigned char ubyte1; // unsigned byte value
typedef uint2 psrType; // type of RecordHeader::recType
typedef uint4 psrSize; // type of RecordHeader::recLen

// Original note: each record is preceded by pssTypeType and pssSizeType
// (presumably the psrType / psrSize fields of the record header).
typedef uint2 psrInstance; // type of RecordHeader::recInstance
typedef uint2 psrVersion; // type of RecordHeader::recVer
typedef uint4 psrReference; // Saved object reference

// If the version field of a record header takes on this value, the record
// header marks the start of a container.
// NOTE(review): RecordHeader::recVer is declared as a 4-bit bit-field, which
// can hold at most 0xF, so a direct `recVer == PSFLAG_CONTAINER` comparison
// could never be true. Confirm how this constant is actually used.
#define PSFLAG_CONTAINER 0xFF
// (The PowerPoint 97 record header itself is struct RecordHeader, declared
// further down in this header.)

// Unsigned long alias; in this header it is used for a slide count
// (SlideListChunk::numInChunk) and a stream offset (PPSReadUserEditAtom).
typedef unsigned long DWord;


// Output-format selector. FileReader keeps one of these in m_drawMode, whose
// comment there describes it as "extract as text or as HTML".
enum DrawMode
{
	PT_DRAW_TEXT = 0, // plain-text output
	PT_DRAW_HTML = 1  // HTML output
};

// Callback function for outputting a string.
//   buffer - the wide-character text handed to the callback
//   len    - its length
// NOTE(review): whether len counts WCHARs or bytes, and whether buffer is
// NUL-terminated, is not visible here; confirm against the caller.
typedef void (*PostText)(WCHAR* buffer, int len);


// PowerPoint 97 record header; each record is preceded by one of these.
// NOTE(review): bit-field allocation order and struct packing are
// implementation-defined, so mapping this struct directly onto file bytes
// assumes the target compiler lays it out as 4 + 12 bits, then a 2-byte type,
// then a 4-byte length. Confirm for the compiler in use.
struct RecordHeader
{
	psrVersion recVer : 4; // record version, 4 bits. Original note: "may be PSFLAG_CONTAINER".
	                       // NOTE(review): PSFLAG_CONTAINER is 0xFF, which does not fit in 4 bits.
	psrInstance recInstance : 12; // record instance, 12 bits
	psrType recType; // record type; presumably matched against the PST_* constants
	psrSize recLen; // record length; presumably the byte count of the data after the header
};

// Fixed-size fields of the "current user" atom.
// NOTE(review): presumably read from the CURRENT_USER_STREAM stream by
// FileReader::ReadCurrentUser; that code is not visible here.
// NOTE(review): the members add up to 18 bytes; sizeof() may include trailing
// padding, so confirm how many bytes the reader actually consumes.
struct PSR_CurrentUserAtom
{
	uint4 size; // presumably the size of this atom; confirm
	uint4 magic; // Magic number to ensure this is a PowerPoint file.
	uint4 offsetToCurrentEdit; // Offset in main stream to current edit field.
	uint2 lenUserName; // length of the user name (units not shown here)
	uint2 docFileVersion; // document file version
	ubyte1 majorVersion; // major version number
	ubyte1 minorVersion; // minor version number
};


// Fields of a user-edit atom (record type PST_UserEditAtom). Each one records
// the file offset of the previous edit in offsetLastEdit.
// NOTE(review): the members add up to 26 bytes; sizeof() may include trailing
// padding, so confirm how many bytes the reader actually consumes.
struct PSR_UserEditAtom
{
	sint4 lastSlideID; // slideID
	uint4 version; // This is major/minor/build which did the edit
	uint4 offsetLastEdit; // File offset of last edit
	uint4 offsetPersistDirectory; // Offset to PersistPtrs for 
	// this file version.

	uint4 documentRef; // presumably the persist reference of the document object; confirm
	uint4 maxPersistWritten; // Addr of last persist ref written to the file (max seen so far).
	sint2 lastViewType; // enum view type
};


// Fields of a slide-persist atom (record type PST_SlidePersistAtom).
struct PSR_SlidePersistAtom
{
	uint4 psrReference; // saved object reference for the slide. Note: inside this struct the
	                    // member name hides the file-level typedef of the same name.
	uint4 flags; // flag bits; their meaning is not shown in this header
	sint4 numberTexts; // presumably the number of text items on the slide; confirm
	sint4 slideId; // slide identifier
	uint4 reserved; // reserved field
};


// Wide-character names of the two streams this reader works with.
// NOTE(review): presumably passed to IStorage::OpenStream; the calls are not
// visible in this header.
#define CURRENT_USER_STREAM L"Current User"
#define DOCUMENT_STREAM L"PowerPoint Document"

// Magic number used to recognise a PowerPoint file (presumably compared with
// PSR_CurrentUserAtom::magic). It equals 0xE391C05F when converted to a
// 32-bit unsigned value.
// The expansion is parenthesized so that the leading minus sign always stays
// attached to the literal, whatever expression the macro is expanded into.
#define HEADER_MAGIC_NUM (-476987297)


// Record type codes, written in hexadecimal with the decimal value alongside.
const int PST_SlidePersistAtom           = 0x03F3; // 1011
const int PST_TextCharsAtom              = 0x0FA0; // 4000 - Unicode in text
const int PST_TextBytesAtom              = 0x0FA8; // 4008 - non-unicode text
const int PST_UserEditAtom               = 0x0FF5; // 4085
const int PST_PersistPtrIncrementalBlock = 0x1772; // 6002 - Incremental diffs on persists


// Forward declaration only; no definition of this class is visible in this
// part of the file.
class PPSPersistDirectory;


// One link in a singly linked chain of parse contexts. Each context carries a
// record header and a counter, and points at the context it was created on
// top of.
struct ParseContext
{
	RecordHeader m_rh;     // record header for this context; the constructor leaves it unset
	uint4 m_nCur;          // counter, starts at 0 (presumably the position inside the record)
	ParseContext *m_pNext; // context passed to the constructor

	// Chains the new context in front of pNext and zeroes the counter.
	// The initializers appear in member declaration order, which is the order
	// in which C++ executes them.
	ParseContext(ParseContext *pNext)
		: m_nCur(0)
		, m_pNext(pNext)
	{
	}
};


// Capacity of one SlideListChunk: the number of psrReference slots in its
// refs[] array.
const int SLIDELISTCHUNKSIZE = 32;


// Node of a singly linked list of slide references. Each node stores up to
// SLIDELISTCHUNKSIZE references in refs[]; numInChunk is the number of slots
// in use.
struct SlideListChunk
{
	// Builds a node that already holds one reference: newOne goes into slot 0,
	// the in-use count starts at 1, and the node is linked in front of next.
	SlideListChunk( SlideListChunk* next, psrReference newOne )
		: pNext( next )
		, numInChunk( 1 )
	{
		refs[0] = newOne;
	}

	SlideListChunk *pNext;                 // following node in the list
	DWord numInChunk;                      // count of used entries in refs[]
	psrReference refs[SLIDELISTCHUNKSIZE]; // only refs[0] is set by the constructor
};


class FileReader
{
public:
	FileReader(IStorage *pStg, PostText postTextEvent, DrawMode drawTextMode); //
	~FileReader();
	
	//按文本或者HTML格式提取
	DrawMode m_drawMode;

	BOOL ReadText( WCHAR *pBuff, ULONG size, ULONG *pSizeRet );
	// Reads next size chars from file. Returns TRUE if there is more
	// text to read.
	BOOL IsPowerPoint() { return m_isPP; } // Returns true if this is a PowerPoint '97 file.

	void ReadPersistDirectory();
	void PPSReadUserEditAtom( DWord offset, PSR_UserEditAtom& userEdit );
	void ReadSlideList();

protected:
	BOOL ReadCurrentUser(IStream *pStm);
	void *ReadRecord( RecordHeader& rh );

	BOOL Parse();
	IStream *GetDoc
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值