ICU Bidi和Script

最新推荐文章于 2021-05-26 03:59:25 发布

weixin_33836874

最新推荐文章于 2021-05-26 03:59:25 发布

阅读量333

点赞数

原文链接：https://my.oschina.net/wolfcs/blog/117497

版权

有些语言下，人家平时写字的时候都是自右向左书写的，不同于我们中文，或者英文这种自左向右书写的文字。单一方向的文字在显示的时候处理起来一般都会比较简单，而如果自左向右的字和自右向左的字混合在一起，处理起来就会比较麻烦一点。这种混杂有自左向右的字和自右向左的字的字串，称之为Bidirectional text。

通常文本渲染引擎处理这种Bidirectional text的方法为：首先将整个字串拆分成几个小的子串，在这些小的子串中，则每一个字都具有相同的方向属性，然后会针对这每一个小的子串，再来做处理。比较流行的用来做Bidi处理的API就是ICU的这组API了。这组API的用法大体就像下面的例子所演示的那样：

// Demo: split a mixed-script string into BiDi visual runs with the ICU ubidi
// API, then split every visual run into script runs with the ScriptRun sample
// class and print what was found.
UChar testChars[] = {
    // Devanagari (with leading and trailing space)
    0x0020, 0x0946, 0x0939, 0x093F, 0x0928, 0x094D, 0x0926, 0x0940, 0x0020,
    // Arabic
    0x0627, 0x0644, 0x0639, 0x0631, 0x0628, 0x064A, 0x0629, 0x0020,
    // Cyrillic
    0x0420, 0x0443, 0x0441, 0x0441, 0x043A, 0x0438, 0x0439, 0x0020,
    // Latin
    'E', 'n', 'g', 'l', 'i', 's', 'h',  0x0020,
    // Han followed by Hiragana
    0x6F22, 0x5B75, 0x3068, 0x3072, 0x3089, 0x304C, 0x306A, 0x3068,
    // Katakana
    0x30AB, 0x30BF, 0x30AB, 0x30CA,
    // Deseret, encoded as UTF-16 surrogate pairs
    0xD801, 0xDC00, 0xD801, 0xDC01, 0xD801, 0xDC02, 0xD801, 0xDC03
};

UBiDi* bidi = ubidi_open();
UBiDiLevel bidiReq = UBIDI_DEFAULT_LTR;
int32_t stringLen = static_cast<int32_t>(sizeof (testChars) / sizeof (testChars[0]));
if (bidi) {
    UErrorCode status = U_ZERO_ERROR;
    // ubidi_setPara() runs the whole BiDi analysis; the run queries below only
    // read back its results.
    ubidi_setPara(bidi, testChars, stringLen, bidiReq, NULL, &status);
    if (U_SUCCESS(status)) {
        // Paragraph level is queried only to illustrate the API; it is not
        // used by the rest of this demo.
        UBiDiLevel paraLevel = ubidi_getParaLevel(bidi);
        (void) paraLevel;

        int32_t runCount = ubidi_countRuns(bidi, &status);

        // Only iterate when counting succeeded: a failed call must not be
        // turned into a loop bound.
        if (U_SUCCESS(status)) {
            // int32_t index: it is what ubidi_getVisualRun() takes and what
            // the "%d" conversion below expects.
            for (int32_t i = 0; i < runCount; ++i) {
                int32_t startRun = -1;
                int32_t lengthRun = -1;
                UBiDiDirection runDir = ubidi_getVisualRun(bidi, i, &startRun, &lengthRun);
                bool isRTL = (runDir == UBIDI_RTL);
                printf ("Processing Bidi Run = %d -- run-start = %d, run-len = %d, isRTL = %d\n",
                        i, startRun, lengthRun, isRTL);

                // Script runs are reported in storage order within the run.
                ScriptRun scriptRun(testChars, startRun, lengthRun);
                while (scriptRun.next()) {
                    int32_t     start = scriptRun.getScriptStart();
                    int32_t     end   = scriptRun.getScriptEnd();
                    UScriptCode code  = scriptRun.getScriptCode();

                    printf("Script '%s' from %d to %d.\n", uscript_getName(code), start, end);
                }
            }
        }
    }
    // Release the UBiDi object on every path where it was opened.
    ubidi_close(bidi);
}

可以看到，这组API的主要用法为：

调用ubidi_open()创建一个UBiDi结构->调用ubidi_setPara()来做Bidi，这个函数的名称，似乎小有一点误导性，实际上这个函数返回的时候，ICU bidi引擎是已经完成了整个Bidi处理的过程了的->可以调用ubidi_getParaLevel(bidi)来获取整个子串的方向及ubidi_countRuns(bidi, &status)来获取这个子串中方向相同的小子串的数目->调用ubidi_getVisualRun(bidi, i, &startRun, &lengthRun)以可视的顺序逐个的获取每一个小子串的开始位置和长度。

对于文本渲染而言，将子串切分成不同方向的小子串，才是完成了第一步。这些小子串，实际上还不能拿来，就直接去获取每个字的Glyph ID。字库文件里面通常会存放一个Unicode范围的字，这个范围，通常就是一个语系，或者称为是script。因而对于每一个Bidi 子串，还需要再把它们切分成一个个的script 子串。

在ICU的source code中有为我们提供一些有用的示例，在icu4c/source/extra/scriptrun/下。可以看到ScriptRun这个class。这个class可以帮助我们将一个子串切分成几个不同的script run。

由这个class的实现，可以看到它的一个缺陷，就是没有办法正确的处理自右向左方向的Bidi run。假如从第9到第17个字为阿拉伯语字符，第17到第25个字为希伯来语字符，那么从第9个字到第25个字将会被划分在一个Bidi的Visual run里面。在画的时候，应该是会要先画希伯来语的字符，后画阿拉伯语的字符才对。ScriptRun这个class实际上无法处理这种case。还是可以先看一下，上面那段code执行的结果：

Processing Bidi Run = 0 -- run-start = 0, run-len = 9, isRTL = 0
Script 'Devanagari' from 0 to 9.
Processing Bidi Run = 1 -- run-start = 9, run-len = 7, isRTL = 1
Script 'Arabic' from 9 to 16.
Processing Bidi Run = 2 -- run-start = 16, run-len = 37, isRTL = 0
Script 'Cyrillic' from 16 to 25.
Script 'Latin' from 25 to 33.
Script 'Han' from 33 to 35.
Script 'Hiragana' from 35 to 41.
Script 'Katakana' from 41 to 45.
Script 'Deseret' from 45 to 53.

问题总是能够被解决的嘛。没有办法正确的处理自右向左方向的Bidi run的script的问题，可以通过改变对于script run的information的访问方式来处理。大体的思路即是，首先将一个Bidi run的所有script的信息先按顺序收集起来，然后再依据这个Bidi run的方向，来决定是从最后一个script run到第一个script run这种方式来访问，还是从第一个到最后一个这种顺序来访问。收集bidi run中的所有script run的方式大体可以像下面这样：

// Gathers every script run found inside one BiDi run and appends them, in the
// order ScriptRun reports them, to scriptRunRecords.records. The direction of
// the BiDi run is stored in scriptRunRecords.isRtl so the consumer can decide
// from which end to read the records.
//
// chars  text buffer handed to ScriptRun
// start  first position of the BiDi run inside chars
// end    forwarded unchanged as ScriptRun's third constructor argument
//        NOTE(review): callers in this article pass the run *length* here,
//        not an end offset - confirm against the ScriptRun constructor.
// isRTL  true when the BiDi run is right-to-left
void collectBidiScriptRuns (BidiScriptRunRecords &scriptRunRecords,
        const UChar *chars, int32_t start, int32_t end, bool isRTL) {
    scriptRunRecords.isRtl = isRTL;

    for (ScriptRun run(chars, start, end); run.next(); ) {
        ScriptRecord rec;
        rec.startChar = run.getScriptStart();
        rec.endChar = run.getScriptEnd();
        rec.scriptCode = run.getScriptCode();
        scriptRunRecords.records.push_back(rec);
    }
}

顺便再来看一下，上面那段code中用到的几个struct：

#include <deque>
using std::deque;

// One script run inside a BiDi run.
// startChar/endChar are positions in the text buffer (the values returned by
// ScriptRun::getScriptStart()/getScriptEnd()), not code points, so they are
// declared as int32_t rather than UChar32. In the article's sample output the
// end of one run equals the start of the next one.
struct ScriptRecord
{
    int32_t startChar;      // position where the script run begins
    int32_t endChar;        // position where the script run ends
    UScriptCode scriptCode; // script reported for this run
};

typedef struct BidiScriptRunRecords {
    bool isRtl;
    deque<ScriptRecord> records;
} BidiScriptRunRecords;

访问script run information的方式的变更，可以像下面这样：

// Demo: same BiDi + script itemization as before, but the script runs of each
// BiDi run are first collected and then consumed back-to-front for
// right-to-left runs, so they come out in visual order.
UChar testChars[] = {
    // Devanagari (with leading and trailing space)
    0x0020, 0x0946, 0x0939, 0x093F, 0x0928, 0x094D, 0x0926, 0x0940, 0x0020,
    // Arabic
    0x0627, 0x0644, 0x0639, 0x0631, 0x0628, 0x064A, 0x0629, 0x0020,
    // Hebrew
    0x059A, 0x05B3, 0x05D2, 0x05E6, 0x05F2, 0x05DC, 0x05E9, 0x05F3,
    // Cyrillic
    0x0420, 0x0443, 0x0441, 0x0441, 0x043A, 0x0438, 0x0439, 0x0020,
    // Latin
    'E', 'n', 'g', 'l', 'i', 's', 'h',  0x0020,
    // Han followed by Hiragana
    0x6F22, 0x5B75, 0x3068, 0x3072, 0x3089, 0x304C, 0x306A, 0x3068,
    // Katakana
    0x30AB, 0x30BF, 0x30AB, 0x30CA,
    // Deseret, encoded as UTF-16 surrogate pairs
    0xD801, 0xDC00, 0xD801, 0xDC01, 0xD801, 0xDC02, 0xD801, 0xDC03
};

UBiDi* bidi = ubidi_open();
UBiDiLevel bidiReq = UBIDI_DEFAULT_LTR;
int32_t stringLen = static_cast<int32_t>(sizeof (testChars) / sizeof (testChars[0]));
if (bidi) {
    UErrorCode status = U_ZERO_ERROR;
    // ubidi_setPara() runs the whole BiDi analysis; the run queries below only
    // read back its results.
    ubidi_setPara(bidi, testChars, stringLen, bidiReq, NULL, &status);
    if (U_SUCCESS(status)) {
        // Paragraph level is queried only to illustrate the API; it is not
        // used by the rest of this demo.
        UBiDiLevel paraLevel = ubidi_getParaLevel(bidi);
        (void) paraLevel;

        int32_t runCount = ubidi_countRuns(bidi, &status);

        // Only iterate when counting succeeded: a failed call must not be
        // turned into a loop bound.
        if (U_SUCCESS(status)) {
            // int32_t index: it is what ubidi_getVisualRun() takes and what
            // the "%d" conversion below expects.
            for (int32_t i = 0; i < runCount; ++i) {
                int32_t startRun = -1;
                int32_t lengthRun = -1;
                UBiDiDirection runDir = ubidi_getVisualRun(bidi, i, &startRun, &lengthRun);
                bool isRTL = (runDir == UBIDI_RTL);
                printf ("Processing Bidi Run = %d -- run-start = %d, run-len = %d, isRTL = %d\n",
                        i, startRun, lengthRun, isRTL);

                BidiScriptRunRecords scriptRunRecords;
                collectBidiScriptRuns(scriptRunRecords, testChars, startRun, lengthRun, isRTL);

                // Drain the collected records: from the back for an RTL run,
                // from the front otherwise.
                while (!scriptRunRecords.records.empty()) {
                    ScriptRecord scriptRecord;
                    if (scriptRunRecords.isRtl) {
                        scriptRecord = scriptRunRecords.records.back();
                        scriptRunRecords.records.pop_back();
                    } else {
                        scriptRecord = scriptRunRecords.records.front();
                        scriptRunRecords.records.pop_front();
                    }

                    int32_t     start = scriptRecord.startChar;
                    int32_t     end = scriptRecord.endChar;
                    UScriptCode code = scriptRecord.scriptCode;
                    printf("Script '%s' from %d to %d.\n", uscript_getName(code), start, end);
                }
            }
        }
    }
    // Release the UBiDi object on every path where it was opened.
    ubidi_close(bidi);
}

对于上面的那段测试字串，输出如下：

Processing Bidi Run = 0 -- run-start = 0, run-len = 9, isRTL = 0
Script 'Devanagari' from 0 to 9.
Processing Bidi Run = 1 -- run-start = 9, run-len = 16, isRTL = 1
Script 'Hebrew' from 17 to 25.
Script 'Arabic' from 9 to 17.
Processing Bidi Run = 2 -- run-start = 25, run-len = 36, isRTL = 0
Script 'Cyrillic' from 25 to 33.
Script 'Latin' from 33 to 41.
Script 'Han' from 41 to 43.
Script 'Hiragana' from 43 to 49.
Script 'Katakana' from 49 to 53.
Script 'Deseret' from 53 to 61.

可以看到，有正确处理阿拉伯语和希伯来语放在一起的case了。

End

转载于:https://my.oschina.net/wolfcs/blog/117497