使用福昕 PDF 解析库(PDFium)解析 PDF 的代码记录

效果图
在这里插入图片描述
pdfium.dll的使用

//#include"stdafx.h"
//#include "Transcoding.h"
//#if defined(_MSC_VER) && (_MSC_VER > 1000)
//#include <afxwin.h>
//#include <afxdisp.h>
//#else
//#include <dlfcn.h>
//#endif

#include<tchar.h>
#include "GetPdfText.h"
#include <sstream>
#include <vector>
#include<iostream>
int DecodeCharUTF16(const unsigned short*& pwszUTF16, const unsigned short* pszUTF16End/*=NULL*/)
{
	// Decode one Unicode code point from a UTF-16 sequence.
	//
	// pwszUTF16   in/out cursor; advanced past the 1 or 2 code units consumed.
	// pszUTF16End one-past-the-end of the input, or NULL if the input is
	//             zero terminated.
	// Returns the code point, or -1 for malformed UTF-16. On -1 only the
	// offending lead unit has been consumed, so the caller can resynchronise
	// on the unit that follows it.
	const int nLead = *pwszUTF16;
	++pwszUTF16;

	// Anything outside 0xD800..0xDFFF is a complete code point by itself.
	if ((nLead & 0xf800) != 0xd800)
		return nLead;

	// A pair must start with a high surrogate (0xD800..0xDBFF); a low
	// surrogate in lead position is malformed.
	if ((nLead & 0xfc00) != 0xd800)
		return -1;

	if (pwszUTF16 == pszUTF16End)
		return -1; // input ended in the middle of a pair

	// The second unit must be a low surrogate (0xDC00..0xDFFF). This also
	// rejects the zero terminator without stepping over it.
	const int nTrail = *pwszUTF16;
	if ((nTrail & 0xfc00) != 0xdc00)
		return -1;
	++pwszUTF16;

	return (((nLead & 0x3ff) << 10) | (nTrail & 0x3ff)) + 0x10000;
}

void EncodeCharUTF8(int nUChar, char* pszUTF8, int& nUTF8Len)
{
	// Append the UTF-8 encoding of code point nUChar at pszUTF8[nUTF8Len] and
	// advance nUTF8Len by the number of bytes produced (1 to 4).
	// When pszUTF8 is NULL nothing is written; nUTF8Len is only advanced, which
	// lets callers measure the space an encoding needs.
	// The caller must guarantee room for up to 4 bytes.
	const unsigned int cp = static_cast<unsigned int>(nUChar);

	// Values with any bit above bit 15 set (including negative inputs) fall
	// through to the 4-byte form.
	int nBytes;
	if (cp < 0x80u)
		nBytes = 1;
	else if (cp < 0x800u)
		nBytes = 2;
	else if (cp < 0x10000u)
		nBytes = 3;
	else
		nBytes = 4;

	if (!pszUTF8)
	{
		// Measuring pass only.
		nUTF8Len += nBytes;
		return;
	}

	char* const out = pszUTF8 + nUTF8Len;
	switch (nBytes)
	{
	case 1:
		out[0] = (char)cp;
		break;
	case 2:
		out[0] = (char)(((cp >> 6) & 0x1f) | 0xc0);
		out[1] = (char)((cp & 0x3f) | 0x80);
		break;
	case 3:
		out[0] = (char)(((cp >> 12) & 0x0f) | 0xe0);
		out[1] = (char)(((cp >> 6) & 0x3f) | 0x80);
		out[2] = (char)((cp & 0x3f) | 0x80);
		break;
	default:
		out[0] = (char)(((cp >> 18) & 0x07) | 0xf0);
		out[1] = (char)(((cp >> 12) & 0x3f) | 0x80);
		out[2] = (char)(((cp >> 6) & 0x3f) | 0x80);
		out[3] = (char)((cp & 0x3f) | 0x80);
		break;
	}
	nUTF8Len += nBytes;
}

int UTF16To8(char* pszUTF8, const unsigned short* pwszUTF16, int nUTF8Count)
{
	// Convert a zero-terminated UTF-16 string to UTF-8 (wcstombs-style arguments).
	//
	// pszUTF8     destination buffer, or NULL to only measure.
	// pwszUTF16   zero-terminated UTF-16 source.
	// nUTF8Count  byte capacity of pszUTF8; ignored when pszUTF8 is NULL.
	//
	// Returns the number of UTF-8 bytes produced (or required, when measuring),
	// not counting the terminator. The result is zero terminated only if there
	// is room left for the terminator. Conversion stops before the first
	// character that would not fit. Malformed UTF-16 is emitted as '?'.
	int nWritten = 0;
	const unsigned short* pCursor = pwszUTF16;
	while (*pCursor != 0)
	{
		int nCodePoint = DecodeCharUTF16(pCursor, NULL);
		if (nCodePoint == -1)
			nCodePoint = '?';

		// Within 4 bytes of the end the next character might not fit, so
		// measure it first and stop without writing if it would overflow.
		if (pszUTF8 != NULL && nWritten + 4 > nUTF8Count)
		{
			int nProbe = nWritten;
			EncodeCharUTF8(nCodePoint, NULL, nProbe);
			if (nProbe > nUTF8Count)
				return nWritten;
		}

		EncodeCharUTF8(nCodePoint, pszUTF8, nWritten);
	}

	if (pszUTF8 != NULL && nWritten < nUTF8Count)
		pszUTF8[nWritten] = 0;
	return nWritten;
}
int WCharToUTF8Char(char* pszDest, const wchar_t* pwszSrc, int nCharLen)
{
	// Convert a zero-terminated wchar_t string to UTF-8 in pszDest.
	// Each wchar_t is narrowed to a 16-bit unit and the result is handed to
	// UTF16To8; nCharLen is the byte capacity of pszDest.
	// Returns whatever UTF16To8 returns (the number of UTF-8 bytes).
	const int nUnits = wcslen(pwszSrc);

	// Scratch copy of the source as unsigned short, plus the terminator.
	std::vector<unsigned short> utf16(nUnits + 1);
	unsigned short* pOut = utf16.data();
	for (const wchar_t* pIn = pwszSrc; pIn != pwszSrc + nUnits; ++pIn, ++pOut)
		*pOut = *pIn;
	*pOut = 0;

	return UTF16To8(pszDest, utf16.data(), nCharLen);
}
const std::string ws2s(const std::wstring& ws)
{
	// Convert a wide string to a UTF-8 encoded std::string via WCharToUTF8Char.
	// The scratch buffer reserves 4 bytes per wide character plus a terminator.
	// The result ends at the first zero character in the converted text.
	const size_t nCapacity = ws.size() * 4 + 1;
	std::vector<char> utf8(nCapacity);
	WCharToUTF8Char(utf8.data(), ws.c_str(), nCapacity);
	return std::string(utf8.data());
}
int DecodeCharUTF8(const char*& pszUTF8, const char* pszUTF8End/*=NULL*/)
{
	// Decode one Unicode code point from a UTF-8 sequence.
	//
	// pszUTF8    in/out cursor; advanced past the bytes consumed (1 to 4).
	// pszUTF8End one-past-the-end of the input, or NULL if the input is
	//            null terminated.
	// Returns the code point, or -1 for a malformed sequence. On -1 the cursor
	// is left on the byte that broke the sequence (the lead byte itself is
	// always consumed), so the caller can resynchronise there.
	const int nLead = (unsigned char)*pszUTF8;
	++pszUTF8;
	if (!(nLead & 0x80))
		return nLead; // single-byte (ASCII) character

	// Classify the lead byte by its full bit pattern. A bare continuation
	// byte (10xxxxxx) or 11111xxx is not a valid lead and must not be
	// mistaken for the start of a sequence.
	int nExtraChars;
	int nUChar;
	if ((nLead & 0xe0) == 0xc0) // 110xxxxx
	{
		nExtraChars = 1;
		nUChar = nLead & 0x1f;
	}
	else if ((nLead & 0xf0) == 0xe0) // 1110xxxx
	{
		nExtraChars = 2;
		nUChar = nLead & 0x0f;
	}
	else if ((nLead & 0xf8) == 0xf0) // 11110xxx
	{
		nExtraChars = 3;
		nUChar = nLead & 0x07;
	}
	else
		return -1;

	while (nExtraChars--)
	{
		// Every trailing byte must be exactly 10xxxxxx. This also rejects
		// the null terminator without stepping over it.
		if (pszUTF8 == pszUTF8End || ((unsigned char)*pszUTF8 & 0xc0) != 0x80)
			return -1;
		nUChar = (nUChar << 6) | ((unsigned char)*pszUTF8 & 0x3f);
		++pszUTF8;
	}
	return nUChar;
}
void EncodeCharUTF16(int nUChar, unsigned short* pwszUTF16, int& nUTF16Len)
{
	// Append the UTF-16 encoding of code point nUChar at pwszUTF16[nUTF16Len]
	// and advance nUTF16Len by the number of 16-bit units produced (1 or 2).
	// When pwszUTF16 is NULL nothing is written; only nUTF16Len is advanced.
	// The caller must guarantee room for up to 2 units.
	const bool bNeedsPair = (nUChar & ~0xffff) != 0;

	if (!pwszUTF16)
	{
		// Measuring pass only.
		nUTF16Len += bNeedsPair ? 2 : 1;
		return;
	}

	if (!bNeedsPair)
	{
		pwszUTF16[nUTF16Len++] = (unsigned short)nUChar;
		return;
	}

	// Split the value above the BMP into a high/low surrogate pair.
	const int nOffset = nUChar - 0x10000;
	const unsigned short wHigh = (unsigned short)(((nOffset >> 10) & 0x3ff) | 0xd800);
	const unsigned short wLow = (unsigned short)((nOffset & 0x3ff) | 0xdc00);
	pwszUTF16[nUTF16Len++] = wHigh;
	pwszUTF16[nUTF16Len++] = wLow;
}
int UTF8To16(unsigned short* pwszUTF16, const char* pszUTF8, int nUTF8Count)
{
	// Convert UTF-8 to UTF-16 (mbstowcs-style arguments).
	//
	// pwszUTF16   destination buffer, or NULL to only measure. It must be
	//             large enough; no capacity is passed in.
	// pszUTF8     UTF-8 source, read up to a null terminator or nUTF8Count
	//             bytes, whichever comes first.
	// nUTF8Count  maximum number of source bytes to read; include the
	//             terminator if the result should be terminated.
	//
	// Returns the number of UTF-16 units required when pwszUTF16 is NULL;
	// otherwise the number of UTF-8 bytes consumed, including the terminator
	// if one was reached. The output is zero terminated only when a terminator
	// is found in the source. Malformed sequences are emitted as '?'.
	const char* const pszLimit = pszUTF8 + nUTF8Count;
	const char* pszCursor = pszUTF8;
	int nUnitsOut = 0;

	while (pszCursor != pszLimit)
	{
		int nCodePoint = DecodeCharUTF8(pszCursor, pszLimit);
		if (nCodePoint == 0)
		{
			// Source terminator: terminate the output too and stop.
			if (pwszUTF16)
				pwszUTF16[nUnitsOut] = 0;
			break;
		}
		if (nCodePoint == -1)
			nCodePoint = '?';

		EncodeCharUTF16(nCodePoint, pwszUTF16, nUnitsOut);
	}

	if (pwszUTF16 == NULL)
		return nUnitsOut;
	return (int)(pszCursor - pszUTF8);
}
int UTF8CharToWChar(wchar_t* pwszDest, const char* pszSrc, int nWcharLen)
{
	// Convert a null-terminated UTF-8 string to a wchar_t string in pwszDest.
	//
	// pwszDest   destination buffer of nWcharLen wchar_t elements.
	// pszSrc     null-terminated UTF-8 source.
	// nWcharLen  capacity of pwszDest, in wchar_t elements.
	//
	// Returns 0 on success, with pwszDest zero filled and then holding the
	// converted, terminated text. If pwszDest is too small nothing is written
	// and the required capacity (in wchar_t, including the terminator) is
	// returned.
	//
	// With a non-NULL destination UTF8To16 returns the number of UTF-8 bytes
	// it consumed, not the number of UTF-16 units it produced, so that value
	// must not be used as the output length. The output is measured with a
	// separate NULL-destination pass instead.
	const int nSrcBytes = (int)strlen(pszSrc) + 1; // include the terminator
	const int nUnits = UTF8To16(NULL, pszSrc, nSrcBytes);
	const int nRequired = nUnits + 1; // room for the terminating zero
	if (nWcharLen < nRequired)
		return nRequired;

	std::vector<unsigned short> utf16(nRequired, 0);
	UTF8To16(utf16.data(), pszSrc, nSrcBytes);

	wmemset(pwszDest, 0, nWcharLen);
	for (int i = 0; i < nUnits; i++)
		pwszDest[i] = utf16[i];
	return 0;
}
std::string UtfToString(std::string strValue)
{
	// Re-encode strValue from the system ANSI code page (CP_ACP) to UTF-8,
	// going through a temporary wide-character buffer.

	// Step 1: ANSI -> wide. The measuring call passes -1, so the count it
	// returns includes the terminator.
	const int nWideCount = ::MultiByteToWideChar(CP_ACP, 0, strValue.c_str(), -1, NULL, 0);
	std::vector<wchar_t> wide(nWideCount + 1, L'\0'); // one spare element for '\0'
	::MultiByteToWideChar(CP_ACP, 0, strValue.c_str(), strValue.length(), wide.data(), nWideCount);

	// Step 2: wide -> UTF-8.
	const int nUtf8Count = ::WideCharToMultiByte(CP_UTF8, 0, wide.data(), -1, NULL, 0, NULL, NULL);
	std::vector<char> utf8(nUtf8Count + 1, '\0');
	::WideCharToMultiByte(CP_UTF8, 0, wide.data(), nWideCount, utf8.data(), nUtf8Count, NULL, NULL);

	return std::string(utf8.data());
}
std::string UtfToGbk(std::string strValue)
{
	// Re-encode strValue from UTF-8 to the system ANSI code page (CP_ACP),
	// going through a temporary wide-character buffer.
	// NOTE(review): the name says GBK, but the target is whatever CP_ACP is on
	// the running machine; it is GBK only on systems configured that way.
	// Returns an empty string if either conversion step fails.
	//
	// Both buffers are std::vector so they are released on every path (the
	// narrow buffer used to be leaked) and are fully zero initialised.

	// Step 1: UTF-8 -> wide. Passing -1 makes the count include the terminator.
	const int nWideLen = MultiByteToWideChar(CP_UTF8, 0, strValue.c_str(), -1, NULL, 0);
	if (nWideLen <= 0)
		return std::string();
	std::vector<wchar_t> wide(nWideLen + 1, L'\0');
	if (MultiByteToWideChar(CP_UTF8, 0, strValue.c_str(), -1, wide.data(), nWideLen) <= 0)
		return std::string();

	// Step 2: wide -> ANSI.
	const int nAnsiLen = WideCharToMultiByte(CP_ACP, 0, wide.data(), -1, NULL, 0, NULL, NULL);
	if (nAnsiLen <= 0)
		return std::string();
	std::vector<char> ansi(nAnsiLen + 1, '\0');
	WideCharToMultiByte(CP_ACP, 0, wide.data(), -1, ansi.data(), nAnsiLen, NULL, NULL);

	return std::string(ansi.data());
}

int main()
{
	// Demo entry point: loads pdfium.dll at run time, collects the text
	// rectangles of every page of "123.pdf", groups them into visual lines by
	// their bottom coordinate and prints each line to stdout.
	//
	// Returns 0 on success, the Win32 error code if the DLL cannot be loaded,
	// the value of FPDF_GetLastError() if a page cannot be loaded, and 1 if an
	// export is missing or the document cannot be opened.

	// LoadLibraryW always takes a wide string, so use an L"" literal rather
	// than _T(), which is only wide when UNICODE is defined.
	const wchar_t *dll_path = L"pdfium.dll";
	HMODULE g_hin = LoadLibraryW(dll_path);
	if (g_hin == NULL)
	{
		int err = GetLastError();
		return err;
	}

	// Resolve every export used below; any missing one is fatal.
	pFPDF_InitLibrary m_FPDF_InitLibrary = (pFPDF_InitLibrary)GetProcAddress(g_hin, "FPDF_InitLibrary");
	pFPDF_DestroyLibrary m_FPDF_DestroyLibrary = (pFPDF_DestroyLibrary)GetProcAddress(g_hin, "FPDF_DestroyLibrary");
	pFPDF_GetDocPermissions m_FPDF_GetDocPermissions = (pFPDF_GetDocPermissions)GetProcAddress(g_hin, "FPDF_GetDocPermissions");
	pFPDF_GetPageCount m_FPDF_GetPageCount = (pFPDF_GetPageCount)GetProcAddress(g_hin, "FPDF_GetPageCount");
	pFPDF_LoadPage m_FPDF_LoadPage = (pFPDF_LoadPage)GetProcAddress(g_hin, "FPDF_LoadPage");
	pFPDF_ClosePage m_FPDF_ClosePage = (pFPDF_ClosePage)GetProcAddress(g_hin, "FPDF_ClosePage");
	pFPDF_CloseDocument m_FPDF_CloseDocument = (pFPDF_CloseDocument)GetProcAddress(g_hin, "FPDF_CloseDocument");
	pFPDF_LoadDocument m_FPDF_LoadDocument = (pFPDF_LoadDocument)GetProcAddress(g_hin, "FPDF_LoadDocument");
	pFPDFText_LoadPage m_FPDFText_LoadPage = (pFPDFText_LoadPage)GetProcAddress(g_hin, "FPDFText_LoadPage");
	pFPDFText_ClosePage m_FPDFText_ClosePage = (pFPDFText_ClosePage)GetProcAddress(g_hin, "FPDFText_ClosePage");
	pFPDFText_CountChars m_FPDFText_CountChars = (pFPDFText_CountChars)GetProcAddress(g_hin, "FPDFText_CountChars");
	pFPDFText_CountRects m_FPDFText_CountRects = (pFPDFText_CountRects)GetProcAddress(g_hin, "FPDFText_CountRects");
	pFPDFText_GetRect m_FPDFText_GetRect = (pFPDFText_GetRect)GetProcAddress(g_hin, "FPDFText_GetRect");
	pFPDFText_GetBoundedText m_FPDFText_GetBoundedText = (pFPDFText_GetBoundedText)GetProcAddress(g_hin, "FPDFText_GetBoundedText");
	pFPDF_GetLastError m_FPDF_GetLastError = (pFPDF_GetLastError)GetProcAddress(g_hin, "FPDF_GetLastError");
	if (m_FPDF_InitLibrary == NULL || m_FPDF_DestroyLibrary == NULL || m_FPDF_GetDocPermissions == NULL || m_FPDF_GetPageCount == NULL
		|| m_FPDF_LoadPage == NULL || m_FPDF_ClosePage == NULL || m_FPDF_CloseDocument == NULL || m_FPDF_LoadDocument == NULL || m_FPDFText_LoadPage == NULL
		|| m_FPDFText_ClosePage == NULL || m_FPDFText_CountChars == NULL || m_FPDFText_CountRects == NULL || m_FPDFText_GetRect == NULL
		|| m_FPDFText_GetBoundedText == NULL || m_FPDF_GetLastError == NULL)
	{
		return 1;
	}

	// Capacity, in 16-bit units, of the scratch buffer handed to
	// FPDFText_GetBoundedText.
	const int kTextBufLen = 2048;
	// Number of characters stored per rectangle, terminator included.
	// NOTE(review): assumes TextRectInfo::TextInfo is a wchar_t array of at
	// least 1024 elements (the size this code has always used) - confirm
	// against GetPdfText.h.
	const int kMaxTextInfoChars = 1024;

	m_FPDF_InitLibrary(NULL);
	FPDF_DOCUMENT docs = m_FPDF_LoadDocument("123.pdf", NULL);
	if (docs == NULL)
	{
		m_FPDF_DestroyLibrary();
		return 1;
	}
	(void)m_FPDF_GetDocPermissions(docs);
	const int page_count = m_FPDF_GetPageCount(docs);

	// Owning list of every rectangle collected; freed before returning.
	std::vector<TextRectInfo*> results;
	bool failed = false;
	int exitCode = 0;
	for (int i = 0; i < page_count; ++i)
	{
		FPDF_PAGE page = m_FPDF_LoadPage(docs, i);
		if (page == NULL)
		{
			// Remember the error and fall through to the common cleanup
			// instead of returning with the document still open.
			exitCode = m_FPDF_GetLastError();
			failed = true;
			break;
		}
		FPDF_TEXTPAGE text_page = m_FPDFText_LoadPage(page);
		if (text_page == NULL)
		{
			// No text layer could be built for this page: skip it.
			m_FPDF_ClosePage(page);
			continue;
		}
		const int CharCounts = m_FPDFText_CountChars(text_page);
		const int NumRects = m_FPDFText_CountRects(text_page, 0, CharCounts);
		// A page without text rectangles (NumRects <= 0) is not an error;
		// the loop below simply does nothing for it.
		for (int j = 0; j < NumRects; j++)
		{
			double left = 0;
			double top = 0;
			double right = 0;
			double bottom = 0;
			m_FPDFText_GetRect(text_page, j, &left, &top, &right, &bottom);

			// Zero the whole buffer and keep the last slot out of the
			// library's reach, so the text is terminated even if every
			// slot offered is filled.
			unsigned short resultRect[kTextBufLen] = { 0 };
			m_FPDFText_GetBoundedText(text_page, left, top, right, bottom, resultRect, kTextBufLen - 1);

			// Value-initialise so every field starts zeroed.
			TextRectInfo *textrectinfo = new TextRectInfo();
			textrectinfo->rc.left = left;
			textrectinfo->rc.top = top;
			textrectinfo->rc.right = right;
			textrectinfo->rc.bottom = bottom;

			// Copy the 16-bit text straight into the wide buffer, truncating
			// to its capacity. This replaces a UTF-16 -> UTF-8 -> UTF-16 round
			// trip through a fixed 2048-byte buffer that could overflow
			// without a terminator.
			int textLen = 0;
			while (textLen < kMaxTextInfoChars - 1 && resultRect[textLen] != 0)
			{
				textrectinfo->TextInfo[textLen] = resultRect[textLen];
				++textLen;
			}
			textrectinfo->TextInfo[textLen] = 0;
			textrectinfo->TextInfoLen = textLen;

			results.push_back(textrectinfo);
		}

		m_FPDFText_ClosePage(text_page);
		m_FPDF_ClosePage(page);
	}

	if (!failed && !results.empty())
	{
		// Merge consecutive rectangles into one output line while their
		// bottom edges differ by less than a line height; otherwise print
		// the line collected so far and start a new one.
		std::string tmps = "";
		int loc = results[0]->rc.bottom;
		int half = abs(results[0]->rc.bottom - results[0]->rc.top);
		for (size_t i = 0; i < results.size(); i++)
		{
			TextRectInfo *tmp_textinfo = results[i];
			if (!(abs(loc - int(tmp_textinfo->rc.bottom)) < max(half, abs(tmp_textinfo->rc.bottom - tmp_textinfo->rc.top))))
			{
				std::cout << UtfToGbk(tmps) << std::endl;
				tmps.clear();
			}
			loc = tmp_textinfo->rc.bottom;
			tmps += ws2s(tmp_textinfo->TextInfo);
		}
		// Print the final line too; it used to be dropped, together with
		// the last rectangle.
		if (!tmps.empty())
			std::cout << UtfToGbk(tmps) << std::endl;
	}

	for (size_t i = 0; i < results.size(); i++)
		delete results[i];
	results.clear();

	// Call through the resolved function pointers. The pFPDF_* names are
	// types: `pFPDF_CloseDocument(doc);` only declared a variable and
	// `pFPDF_DestroyLibrary();` only built a null temporary, so neither the
	// document nor the library was ever released.
	m_FPDF_CloseDocument(docs);
	m_FPDF_DestroyLibrary();

	if (failed)
		return exitCode;

	std::cout << "ok" << std::endl;
	system("pause");
	return 0;
}

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值