使用OCR识别出来的字符串是UTF-8编码的,如果直接用C/C++中的cout或printf打印,中文会显示为乱码。因此需要先将UTF-8编码转换为宽字符(Unicode)编码,再转换为系统的ANSI编码,然后再用cout或printf打印即可。
#include<leptonica\allheaders.h>
#include<tesseract\baseapi.h>
#include<iostream>
#include<time.h>
using namespace std;
/**
 * Converts a NUL-terminated UTF-8 string into a wide-character string via
 * the Win32 MultiByteToWideChar API.
 *
 * @param u8s  NUL-terminated UTF-8 input. A null pointer is treated as an
 *             empty string.
 * @return     A NUL-terminated buffer allocated with new[]; the caller owns
 *             it and must release it with delete[]. An empty string is
 *             returned when the input is null or empty, or when the
 *             conversion fails.
 */
wchar_t *utf_8ToUnicode(char *u8s)
{
	// Guard against a null input (strlen on a null pointer is undefined)
	// and measure the source only once.
	const int srcLen = (u8s != nullptr) ? static_cast<int>(strlen(u8s)) : 0;

	// First pass: ask how many wide characters the conversion needs.
	// dwFlags and cchWideChar are integers, so pass 0 rather than NULL.
	int wcsLen = 0;
	if (srcLen > 0)
		wcsLen = MultiByteToWideChar(CP_UTF8, 0, u8s, srcLen, nullptr, 0);

	// +1 because the explicit-length form does not write a terminator.
	wchar_t *wcString = new wchar_t[wcsLen + 1];

	// Second pass: perform the conversion. Terminate at the number of
	// characters actually written so a failed call yields an empty string.
	int written = 0;
	if (wcsLen > 0)
		written = MultiByteToWideChar(CP_UTF8, 0, u8s, srcLen, wcString, wcsLen);
	wcString[written] = L'\0';
	return wcString;
}
/**
 * Converts a NUL-terminated wide-character string into a multi-byte string
 * in the CP_ACP code page via the Win32 WideCharToMultiByte API.
 *
 * @param wcString  NUL-terminated wide-character input. A null pointer is
 *                  treated as an empty string.
 * @return          A NUL-terminated buffer allocated with new[]; the caller
 *                  owns it and must release it with delete[]. An empty
 *                  string is returned when the input is null or the
 *                  conversion fails.
 */
char *unicodeToAnsi(wchar_t *wcString)
{
	// First pass: query the required size in bytes. Passing -1 as the source
	// length makes the API process the terminator too, so the result
	// includes room for the trailing '\0'.
	int len = 0;
	if (wcString != nullptr)
		len = WideCharToMultiByte(CP_ACP, 0, wcString, -1, nullptr, 0, nullptr, nullptr);

	// A result of 0 signals failure; never hand back an unterminated buffer.
	if (len <= 0)
	{
		char *empty = new char[1];
		empty[0] = '\0';
		return empty;
	}

	char *str = new char[len];
	// Second pass: perform the conversion; fall back to an empty string if
	// it fails so callers can always treat the result as a C string.
	if (WideCharToMultiByte(CP_ACP, 0, wcString, -1, str, len, nullptr, nullptr) == 0)
		str[0] = '\0';
	return str;
}
int main()
{
clock_t start = clock();
tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI;
/*参数1为存放traineddata文件的文件夹tessdata的路径,如果已经在环境变量那里设置好了,
那可以把它设置为NULL;参数2为字库文件名,chi_sim代表简体中文,chi_tra代表繁体
中文,eng代表英文*/
if (api->Init(NULL, "chi_sim"))
exit(1);
Pix *image = pixRead("2.png");
api->SetImage(image);
char *outText = api->GetUTF8Text();
wchar_t *aaa = utf_8ToUnicode(outText);
outText = unicodeToAnsi(aaa);
clock_t end = clock();
cout << "OCR output:\n" << outText;
cout << end - start << " ms" << endl;
delete outText;
delete api;
return 0;
}