需要读取pdf的内容,主要是文本部分,本来以为比较简单,结果还是折腾了一下,特作记录。
关于vs2017集成mupdf,这个直接百度关键词就ok了,网上介绍的是1.17.0版本的mupdf,所以就用了这个。
然后搜索读取文本内容,很多都是python的,相关函数找不到,c++很少资料,还得是外网的示例代码里翻到,特此记录供大家取用:
std::string readPDF(std::string filePath,int pageNum)
{
std::string ret;
//使用默认值
int pageCount = 0;
int pdfPageIdx = 0;
float zoom = 100;
float rotate = 0;
fz_context *ctx;
fz_document *doc;
fz_pixmap *pix;
fz_text * ptxt;
fz_page *page;
fz_device *dev;
int x, y;
/* Create a context to hold the exception stack and various caches. */
ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
if (!ctx)
{
fprintf(stderr, "cannot create mupdf context\n");
return false;
}
/* Register the default file types to handle. */
fz_try(ctx)
{
fz_register_document_handlers(ctx);
}
fz_catch(ctx)
{
fprintf(stderr, "cannot register document handlers: %s\n", fz_caught_message(ctx));
fz_drop_context(ctx);
return false;
}
/* Open the document. */
fz_try(ctx)
{
doc = fz_open_document(ctx, filePath.c_str());
}
fz_catch(ctx)
{
fprintf(stderr, "cannot open document: %s\n", fz_caught_message(ctx));
fz_drop_context(ctx);
return false;
}
/* Count the number of pages. */
fz_try(ctx)
{
pageCount = fz_count_pages(ctx, doc);
}
fz_catch(ctx)
{
fprintf(stderr, "cannot count number of pages: %s\n", fz_caught_message(ctx));
fz_drop_document(ctx, doc);
fz_drop_context(ctx);
return ret;
}
int target_page = pageNum;
fz_stext_page *text_page = fz_new_stext_page_from_page_number(ctx, doc, target_page, 0);
std::string r;
for (fz_stext_block* text_block = text_page->first_block;
text_block != nullptr; text_block = text_block->next) {
if (text_block->type != FZ_STEXT_BLOCK_TEXT) {
continue;
}
for (fz_stext_line* text_line = text_block->u.t.first_line;
text_line != nullptr; text_line = text_line->next) {
for (fz_stext_char* text_char = text_line->first_char;
text_char != nullptr; text_char = text_char->next) {
{
const int c = text_char->c;
// A single UTF-8 character cannot take more than 4 bytes, but let's
// go for 8.
char buffer[8];
const int num_bytes = fz_runetochar(buffer, c);
if (num_bytes > 1)
{
continue;
}
assert(num_bytes <= static_cast<int>(sizeof(buffer)));
buffer[num_bytes] = '\0';
r += buffer;
}
}
if (!isspace(r.back())) {
//r += int(0);
}
}
}
ret = r;
/* Clean up. */
fz_drop_stext_page(ctx, text_page);
fz_drop_document(ctx, doc);
fz_drop_context(ctx);
return ret;
}