// 效果图 (result screenshot)
// pdfium.dll的使用 (using pdfium.dll)
//#include"stdafx.h"
//#include "Transcoding.h"
//#if defined(_MSC_VER) && (_MSC_VER > 1000)
//#include <afxwin.h>
//#include <afxdisp.h>
//#else
//#include <dlfcn.h>
//#endif
#include<tchar.h>
#include "GetPdfText.h"
#include <sstream>
#include <vector>
#include<iostream>
// Decodes one Unicode code point from a UTF-16 sequence.
//
// pwszUTF16   - in/out cursor. On return it has been advanced past the code
//               units that were consumed: one unit, or two for a valid
//               surrogate pair.
// pszUTF16End - optional end of the buffer; may be NULL when the input is
//               zero-terminated.
//
// Returns the code point, or -1 when the unit at the cursor is an unpaired
// surrogate. In the error case only the offending unit is consumed, so the
// caller resumes decoding at the very next unit instead of losing it.
int DecodeCharUTF16(const unsigned short*& pwszUTF16, const unsigned short* pszUTF16End/*=NULL*/)
{
    int nUChar = *pwszUTF16;
    ++pwszUTF16;

    // Anything outside 0xD800..0xDFFF is a complete code point on its own.
    if ((nUChar & ~0x000007ff) != 0xd800)
        return nUChar;

    // A low (trail) surrogate 0xDC00..0xDFFF must never start a pair.
    if ((nUChar & 0xfc00) != 0xd800)
        return -1; // incorrect UTF-16

    // W1 has to be followed by W2 (0xDC00..0xDFFF). A terminating zero or any
    // other unit fails this test and is left unconsumed.
    if (pwszUTF16 == pszUTF16End || (*pwszUTF16 & 0xfc00) != 0xdc00)
        return -1; // incorrect UTF-16

    nUChar = (((nUChar & 0x3ff) << 10) | (*pwszUTF16 & 0x3ff)) + 0x10000;
    ++pwszUTF16;
    return nUChar;
}
// Appends the UTF-8 encoding of code point nUChar at pszUTF8[nUTF8Len] and
// advances nUTF8Len by the number of bytes in the sequence (1 to 4).
//
// When pszUTF8 is NULL nothing is written; nUTF8Len is still advanced, which
// lets callers measure the space a code point needs.
// The caller must make sure pszUTF8 has room for up to 4 more bytes.
void EncodeCharUTF8(int nUChar, char* pszUTF8, int& nUTF8Len)
{
    // Sequence length is decided by the highest bit set in the code point.
    int nBytes = 4;                              // everything else: < 0x110000
    if ((nUChar & ~0x0000007f) == 0)
        nBytes = 1;                              // < 0x80
    else if ((nUChar & ~0x000007ff) == 0)
        nBytes = 2;                              // < 0x800
    else if ((nUChar & ~0x0000ffff) == 0)
        nBytes = 3;                              // < 0x10000

    // Measuring pass: only account for the bytes.
    if (!pszUTF8)
    {
        nUTF8Len += nBytes;
        return;
    }

    char* pOut = pszUTF8 + nUTF8Len;
    switch (nBytes)
    {
    case 1:
        pOut[0] = (char)nUChar;
        break;
    case 2:
        pOut[0] = (char)(((nUChar & 0x7c0) >> 6) | 0xc0);
        pOut[1] = (char)((nUChar & 0x3f) | 0x80);
        break;
    case 3:
        pOut[0] = (char)(((nUChar & 0xf000) >> 12) | 0xe0);
        pOut[1] = (char)(((nUChar & 0xfc0) >> 6) | 0x80);
        pOut[2] = (char)((nUChar & 0x3f) | 0x80);
        break;
    default:
        pOut[0] = (char)(((nUChar & 0x1c0000) >> 18) | 0xf0);
        pOut[1] = (char)(((nUChar & 0x3f000) >> 12) | 0x80);
        pOut[2] = (char)(((nUChar & 0xfc0) >> 6) | 0x80);
        pOut[3] = (char)((nUChar & 0x3f) | 0x80);
        break;
    }
    nUTF8Len += nBytes;
}
// Converts a zero-terminated UTF-16 string to UTF-8 (same argument layout as
// wcstombs).
//
// pszUTF8     - destination buffer, or NULL to only measure.
// pwszUTF16   - zero-terminated UTF-16 source.
// nUTF8Count  - byte size of pszUTF8; ignored when pszUTF8 is NULL. It must
//               leave room for the terminator if one is wanted.
//
// Returns the number of UTF-8 bytes produced (or required, when measuring),
// excluding the terminator. Invalid UTF-16 is replaced by '?'. When the
// destination is too small the output stops before the first code point that
// does not fit. The result is zero-terminated whenever there is room for the
// terminator, including after such a truncation.
int UTF16To8(char* pszUTF8, const unsigned short* pwszUTF16, int nUTF8Count)
{
    int nUTF8Len = 0;
    while (*pwszUTF16)
    {
        // Decode UTF-16
        int nUChar = DecodeCharUTF16(pwszUTF16, NULL);
        if (nUChar == -1)
            nUChar = '?';

        // Only when fewer than 4 bytes remain do we need to measure the exact
        // size of this code point before writing it.
        if (pszUTF8 && nUTF8Len + 4 > nUTF8Count)
        {
            int nLenWithChar = nUTF8Len;
            EncodeCharUTF8(nUChar, NULL, nLenWithChar);
            if (nLenWithChar > nUTF8Count)
                break; // does not fit; fall through so the terminator is still written
        }

        // Encode UTF-8
        EncodeCharUTF8(nUChar, pszUTF8, nUTF8Len);
    }
    if (pszUTF8 && nUTF8Len < nUTF8Count)
        pszUTF8[nUTF8Len] = 0;
    return nUTF8Len;
}
// Converts a zero-terminated wchar_t string to UTF-8.
//
// Each wchar_t is first narrowed to a 16-bit code unit, then the resulting
// UTF-16 string is handed to UTF16To8.
//
// pszDest  - destination buffer passed through to UTF16To8.
// pwszSrc  - zero-terminated wide source string.
// nCharLen - byte size of pszDest, passed through to UTF16To8.
//
// Returns whatever UTF16To8 returns for the converted string.
int WCharToUTF8Char(char* pszDest, const wchar_t* pwszSrc, int nCharLen)
{
    const int nUnits = wcslen(pwszSrc);

    // Copy the characters together with the terminating zero.
    std::vector<unsigned short> utf16(pwszSrc, pwszSrc + nUnits + 1);

    return UTF16To8(pszDest, &utf16[0], nCharLen);
}
// Returns the UTF-8 encoding of a wide string.
//
// The scratch buffer reserves 4 bytes per wide character plus one byte for
// the terminator. The wide string is read through c_str(), so conversion
// stops at the first embedded zero.
const std::string ws2s(const std::wstring& ws)
{
    size_t buffer_size = ws.size() * 4 + 1;
    std::vector<char> utf8(buffer_size);
    WCharToUTF8Char(&utf8[0], ws.c_str(), buffer_size);
    return std::string(&utf8[0]);
}
// Decodes one Unicode code point from a UTF-8 sequence.
//
// pszUTF8    - in/out cursor; advanced past the bytes that were consumed
//              (1 to 4 for a valid sequence).
// pszUTF8End - optional end of the buffer; may be NULL when the input is
//              zero-terminated.
//
// Returns the code point, or -1 when the sequence is malformed:
//   * the first byte is a continuation byte (10xxxxxx) or 11111xxx,
//   * a following byte is missing or is not a continuation byte,
//   * the decoded value lies above 0x10FFFF.
// A byte that fails the continuation test is left unconsumed so the caller
// can decode it as the start of the next character.
int DecodeCharUTF8(const char*& pszUTF8, const char* pszUTF8End/*=NULL*/)
{
    int nUChar = (unsigned char)*pszUTF8;
    ++pszUTF8;

    // Plain ASCII: single byte.
    if (!(nUChar & 0x80))
        return nUChar;

    // Classify the lead byte by its full prefix, not by a single bit, so that
    // a stray continuation byte is not mistaken for a 2-byte lead.
    int nExtraChars;
    if ((nUChar & 0xe0) == 0xc0)        // 110xxxxx
    {
        nExtraChars = 1;
        nUChar &= 0x1f;
    }
    else if ((nUChar & 0xf0) == 0xe0)   // 1110xxxx
    {
        nExtraChars = 2;
        nUChar &= 0x0f;
    }
    else if ((nUChar & 0xf8) == 0xf0)   // 11110xxx
    {
        nExtraChars = 3;
        nUChar &= 0x07;
    }
    else
        return -1;

    while (nExtraChars--)
    {
        // Every trailing byte must be exactly 10xxxxxx.
        if (pszUTF8 == pszUTF8End || ((unsigned char)*pszUTF8 & 0xc0) != 0x80)
            return -1;
        nUChar = (nUChar << 6) | (*pszUTF8 & 0x3f);
        ++pszUTF8;
    }

    // Four-byte sequences can express values past the end of Unicode.
    if (nUChar > 0x10ffff)
        return -1;
    return nUChar;
}
// Appends the UTF-16 encoding of code point nUChar at pwszUTF16[nUTF16Len]
// and advances nUTF16Len by the number of code units used (1, or 2 for a
// surrogate pair).
//
// When pwszUTF16 is NULL nothing is written; nUTF16Len is still advanced so
// the caller can measure the required length.
// The caller must make sure pwszUTF16 has room for up to 2 more units.
void EncodeCharUTF16(int nUChar, unsigned short* pwszUTF16, int& nUTF16Len)
{
    // Any bit above the low 16 means the value needs a surrogate pair.
    const bool bNeedsPair = (nUChar & ~0xffff) != 0;

    // Measuring pass.
    if (!pwszUTF16)
    {
        nUTF16Len += bNeedsPair ? 2 : 1;
        return;
    }

    if (!bNeedsPair)
    {
        pwszUTF16[nUTF16Len++] = (unsigned short)nUChar;
        return;
    }

    // Surrogate pair: 20 bits split into two 10-bit halves.
    const int nOffset = nUChar - 0x10000;
    pwszUTF16[nUTF16Len++] = (unsigned short)(((nOffset >> 10) & 0x3ff) | 0xd800); // W1
    pwszUTF16[nUTF16Len++] = (unsigned short)((nOffset & 0x3ff) | 0xdc00);         // W2
}
// Converts UTF-8 to UTF-16 (same argument layout as mbstowcs).
//
// pwszUTF16  - destination buffer, or NULL to only measure. When given it
//              must already be large enough; no bound is checked here.
// pszUTF8    - UTF-8 source; processed up to its zero terminator or up to
//              nUTF8Count bytes, whichever comes first.
// nUTF8Count - maximum number of source bytes to read; it should include the
//              terminator if a terminated result is wanted.
//
// Return value depends on the mode:
//   * pwszUTF16 == NULL: number of UTF-16 code units required (terminator
//     not counted).
//   * otherwise: number of UTF-8 source bytes consumed (the terminator is
//     counted when it was reached).
// The output is zero-terminated only if the source terminator was reached
// within nUTF8Count bytes. Malformed input is replaced by '?'.
int UTF8To16(unsigned short* pwszUTF16, const char* pszUTF8, int nUTF8Count)
{
    const char* const pszLimit = pszUTF8 + nUTF8Count;
    const char* pszCursor = pszUTF8;
    int nUnits = 0;

    while (pszCursor != pszLimit)
    {
        int nCodePoint = DecodeCharUTF8(pszCursor, pszLimit);

        // End of the source string: terminate the output and stop.
        if (nCodePoint == 0)
        {
            if (pwszUTF16)
                pwszUTF16[nUnits] = 0;
            break;
        }

        if (nCodePoint == -1)
            nCodePoint = '?';

        // Encode UTF-16
        EncodeCharUTF16(nCodePoint, pwszUTF16, nUnits);
    }

    if (pwszUTF16)
        return (int)(pszCursor - pszUTF8);
    return nUnits;
}
// Converts a zero-terminated UTF-8 string into a caller-supplied wchar_t
// buffer.
//
// pwszDest  - destination buffer of nWcharLen wchar_t elements.
// pszSrc    - zero-terminated UTF-8 source.
// nWcharLen - capacity of pwszDest in wchar_t elements.
//
// Returns 0 on success; pwszDest is then zero-filled and holds the converted,
// zero-terminated text. If the buffer is too small nothing is written and
// the required capacity (in wchar_t elements, terminator included) is
// returned instead.
//
// The capacity test is made against the number of UTF-16 code units the text
// really needs. Comparing against the UTF-8 byte count would reject
// multi-byte text that fits comfortably.
int UTF8CharToWChar(wchar_t* pwszDest, const char* pszSrc, int nWcharLen)
{
    // Include the terminator so UTF8To16 stops on it and terminates its output.
    const int nSrcBytes = (int)strlen(pszSrc) + 1;

    // Pass 1: measure. With a NULL destination UTF8To16 returns the number
    // of UTF-16 code units, not counting the terminator.
    const int nUnits = UTF8To16(NULL, pszSrc, nSrcBytes);
    const int nRequired = nUnits + 1;
    if (nWcharLen < nRequired)
        return nRequired;

    // Pass 2: convert into a scratch buffer of exactly the required size.
    std::vector<unsigned short> utf16(nRequired, 0);
    UTF8To16(&utf16[0], pszSrc, nSrcBytes);

    // Clear the whole destination first so the result is always terminated.
    wmemset(pwszDest, 0, nWcharLen);
    for (int i = 0; i < nUnits; i++)
        pwszDest[i] = utf16[i];
    return 0;
}
// Re-encodes strValue from the system ANSI code page (CP_ACP) to UTF-8 by
// going through a wide-character intermediate.
//
// NOTE(review): despite its name the function reads the input as CP_ACP and
// produces UTF-8 — confirm this is what callers expect.
std::string UtfToString(std::string strValue)
{
    // Step 1: CP_ACP -> wide characters. The measured length is taken with
    // -1 as the source length; one extra element is reserved for a trailing
    // '\0', and the buffer starts out zero-filled.
    const int nWideLen = ::MultiByteToWideChar(CP_ACP, 0, strValue.c_str(), -1, NULL, 0);
    std::vector<wchar_t> wideBuf(nWideLen + 1, L'\0');
    ::MultiByteToWideChar(CP_ACP, 0, strValue.c_str(), strValue.length(), &wideBuf[0], nWideLen);

    // Step 2: wide characters -> UTF-8, again into a zero-filled buffer with
    // one spare byte.
    const int nUtf8Len = ::WideCharToMultiByte(CP_UTF8, 0, &wideBuf[0], -1, NULL, NULL, NULL, NULL);
    std::vector<char> utf8Buf(nUtf8Len + 1, '\0');
    ::WideCharToMultiByte(CP_UTF8, 0, &wideBuf[0], nWideLen, &utf8Buf[0], nUtf8Len, NULL, NULL);

    // Build the result from the zero-terminated buffer contents.
    return std::string(&utf8Buf[0]);
}
// Re-encodes a UTF-8 string to the system ANSI code page (CP_ACP) by going
// through a wide-character intermediate.
//
// Returns the converted string, or an empty string when either conversion
// step reports failure (a length of 0).
//
// Both scratch buffers are owned by std::vector, so nothing is leaked and
// they are fully zero-initialised.
std::string UtfToGbk(std::string strValue)
{
    // Step 1: UTF-8 -> wide characters. With -1 as the source length the
    // reported size includes the terminator.
    const int nWideLen = MultiByteToWideChar(CP_UTF8, 0, strValue.c_str(), -1, NULL, 0);
    if (nWideLen <= 0)
        return std::string();
    std::vector<wchar_t> wideBuf(nWideLen + 1, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, strValue.c_str(), -1, &wideBuf[0], nWideLen);

    // Step 2: wide characters -> CP_ACP.
    const int nAnsiLen = WideCharToMultiByte(CP_ACP, 0, &wideBuf[0], -1, NULL, 0, NULL, NULL);
    if (nAnsiLen <= 0)
        return std::string();
    std::vector<char> ansiBuf(nAnsiLen + 1, '\0');
    WideCharToMultiByte(CP_ACP, 0, &wideBuf[0], -1, &ansiBuf[0], nAnsiLen, NULL, NULL);

    return std::string(&ansiBuf[0]);
}
// Demo entry point: loads pdfium.dll at run time, extracts the text
// rectangles of every page of "123.pdf", groups them into lines by their
// bottom coordinate and prints each line converted with UtfToGbk.
//
// Exit codes:
//   0                    - success
//   GetLastError() value - pdfium.dll could not be loaded
//   1                    - a required export is missing or the document
//                          could not be opened
//   FPDF_GetLastError()  - a page failed to load
//
// Every exit path after a successful step releases what that step acquired
// (pages, document, library, DLL handle, collected rectangles).
int main()
{
    const wchar_t* dll_path = L"pdfium.dll";
    HMODULE g_hin = LoadLibraryW(dll_path);
    if (g_hin == NULL)
    {
        int err = GetLastError();
        return err;
    }

    // Resolve every export used below.
    pFPDF_InitLibrary m_FPDF_InitLibrary = (pFPDF_InitLibrary)GetProcAddress(g_hin, "FPDF_InitLibrary");
    pFPDF_DestroyLibrary m_FPDF_DestroyLibrary = (pFPDF_DestroyLibrary)GetProcAddress(g_hin, "FPDF_DestroyLibrary");
    pFPDF_GetDocPermissions m_FPDF_GetDocPermissions = (pFPDF_GetDocPermissions)GetProcAddress(g_hin, "FPDF_GetDocPermissions");
    pFPDF_GetPageCount m_FPDF_GetPageCount = (pFPDF_GetPageCount)GetProcAddress(g_hin, "FPDF_GetPageCount");
    pFPDF_LoadPage m_FPDF_LoadPage = (pFPDF_LoadPage)GetProcAddress(g_hin, "FPDF_LoadPage");
    pFPDF_ClosePage m_FPDF_ClosePage = (pFPDF_ClosePage)GetProcAddress(g_hin, "FPDF_ClosePage");
    pFPDF_CloseDocument m_FPDF_CloseDocument = (pFPDF_CloseDocument)GetProcAddress(g_hin, "FPDF_CloseDocument");
    pFPDF_LoadDocument m_FPDF_LoadDocument = (pFPDF_LoadDocument)GetProcAddress(g_hin, "FPDF_LoadDocument");
    pFPDFText_LoadPage m_FPDFText_LoadPage = (pFPDFText_LoadPage)GetProcAddress(g_hin, "FPDFText_LoadPage");
    pFPDFText_ClosePage m_FPDFText_ClosePage = (pFPDFText_ClosePage)GetProcAddress(g_hin, "FPDFText_ClosePage");
    pFPDFText_CountChars m_FPDFText_CountChars = (pFPDFText_CountChars)GetProcAddress(g_hin, "FPDFText_CountChars");
    pFPDFText_CountRects m_FPDFText_CountRects = (pFPDFText_CountRects)GetProcAddress(g_hin, "FPDFText_CountRects");
    pFPDFText_GetRect m_FPDFText_GetRect = (pFPDFText_GetRect)GetProcAddress(g_hin, "FPDFText_GetRect");
    pFPDFText_GetBoundedText m_FPDFText_GetBoundedText = (pFPDFText_GetBoundedText)GetProcAddress(g_hin, "FPDFText_GetBoundedText");
    pFPDF_GetLastError m_FPDF_GetLastError = (pFPDF_GetLastError)GetProcAddress(g_hin, "FPDF_GetLastError");
    if (m_FPDF_InitLibrary == NULL || m_FPDF_DestroyLibrary == NULL || m_FPDF_GetDocPermissions == NULL || m_FPDF_GetPageCount == NULL
        || m_FPDF_LoadPage == NULL || m_FPDF_ClosePage == NULL || m_FPDF_CloseDocument == NULL || m_FPDF_LoadDocument == NULL || m_FPDFText_LoadPage == NULL
        || m_FPDFText_ClosePage == NULL || m_FPDFText_CountChars == NULL || m_FPDFText_CountRects == NULL || m_FPDFText_GetRect == NULL
        || m_FPDFText_GetBoundedText == NULL || m_FPDF_GetLastError == NULL)
    {
        FreeLibrary(g_hin);
        return 1;
    }

    m_FPDF_InitLibrary(NULL);
    FPDF_DOCUMENT docs = m_FPDF_LoadDocument("123.pdf", NULL);
    if (docs == NULL)
    {
        m_FPDF_DestroyLibrary();
        FreeLibrary(g_hin);
        return 1;
    }
    (void)m_FPDF_GetDocPermissions(docs);
    const int page_count = m_FPDF_GetPageCount(docs);

    // One entry per text rectangle, in page order. Owned here and released
    // before returning.
    std::vector<TextRectInfo*> results;
    bool failed = false;
    int exit_code = 0;

    for (int i = 0; i < page_count; ++i)
    {
        FPDF_PAGE page = m_FPDF_LoadPage(docs, i);
        if (page == NULL)
        {
            // Stop processing, but fall through to the common cleanup below.
            exit_code = m_FPDF_GetLastError();
            failed = true;
            break;
        }

        FPDF_TEXTPAGE text_page = m_FPDFText_LoadPage(page);
        if (text_page != NULL)
        {
            const int CharCounts = m_FPDFText_CountChars(text_page);
            // A page without text yields no rectangles and is simply skipped.
            const int NumRects = m_FPDFText_CountRects(text_page, 0, CharCounts);
            for (int j = 0; j < NumRects; j++)
            {
                double left = 0;
                double top = 0;
                double right = 0;
                double bottom = 0;
                m_FPDFText_GetRect(text_page, j, &left, &top, &right, &bottom);

                // Zero the whole buffer and offer one slot less than its size,
                // so the text stays zero-terminated even when it fills the
                // space offered.
                unsigned short resultRect[2048];
                memset(resultRect, 0, sizeof(resultRect));
                const int buflen = (int)(sizeof(resultRect) / sizeof(resultRect[0])) - 1;
                int num = m_FPDFText_GetBoundedText(text_page, left, top, right, bottom, resultRect, buflen);

                TextRectInfo *textrectinfo = new TextRectInfo;
                // TextInfo is handed to UTF8CharToWChar below as a buffer of
                // 1024 wchar_t, so clear that many elements (not bytes).
                wmemset(textrectinfo->TextInfo, 0, 1024);
                textrectinfo->TextInfoLen = num - 1;
                textrectinfo->rc.left = left;
                textrectinfo->rc.top = top;
                textrectinfo->rc.right = right;
                textrectinfo->rc.bottom = bottom;

                // Round-trip through UTF-8. The last byte is kept back so
                // szstr is always terminated for the strlen in UTF8CharToWChar.
                char szstr[2048];
                memset(szstr, 0, sizeof(szstr));
                UTF16To8(szstr, resultRect, (int)sizeof(szstr) - 1);
                UTF8CharToWChar(textrectinfo->TextInfo, szstr, 1024);

                results.push_back(textrectinfo);
            }
            m_FPDFText_ClosePage(text_page);
        }
        m_FPDF_ClosePage(page);
    }

    if (!failed && !results.empty())
    {
        // Consecutive rectangles whose bottom edges are closer than one text
        // height are treated as parts of the same printed line.
        std::string tmps = "";
        int loc = results[0]->rc.bottom;
        int half = abs(results[0]->rc.bottom - results[0]->rc.top);
        for (size_t i = 0; i < results.size(); i++)
        {
            TextRectInfo *tmp_textinfo = results[i];
            if (abs(loc - int(tmp_textinfo->rc.bottom)) < max(half, abs(tmp_textinfo->rc.bottom - tmp_textinfo->rc.top)))
            {
                // Same line: keep accumulating.
                tmps += ws2s(tmp_textinfo->TextInfo);
            }
            else
            {
                // New line starts: emit what has been collected so far.
                std::cout << UtfToGbk(tmps) << std::endl;
                tmps.clear();
                tmps += ws2s(tmp_textinfo->TextInfo);
            }
            loc = tmp_textinfo->rc.bottom;
        }
        // Emit the line that was still being accumulated when input ran out.
        if (!tmps.empty())
            std::cout << UtfToGbk(tmps) << std::endl;
    }

    // Common cleanup, in reverse order of acquisition.
    for (size_t i = 0; i < results.size(); i++)
        delete results[i];
    results.clear();
    m_FPDF_CloseDocument(docs);
    m_FPDF_DestroyLibrary();
    FreeLibrary(g_hin);

    if (failed)
        return exit_code;

    std::cout << "ok" << std::endl;
    system("pause");
    return 0;
}