pyparsing库解析中文

最新推荐文章于 2024-02-21 16:16:23 发布

go_forever_happy

最新推荐文章于 2024-02-21 16:16:23 发布

阅读量386

点赞数

分类专栏： Python 文章标签： python

本文链接：https://blog.csdn.net/go_forever_happy/article/details/128615726

版权

Python 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

pyparsing 通过 pyparsing_unicode 中按语言划分的 unicode_set 字符集，支持解析非 ASCII 字符（如中文）。

class unicode_set(object):
    """
    Base class describing a collection of Unicode characters, from which the
    language-specific ``alphas``, ``nums``, ``alphanums``, and ``printables``
    strings are derived.

    A subclass lists the code points it covers in the class attribute
    ``_ranges``; the first and last element of each entry are taken as the
    inclusive bounds of one range, for example::

        _ranges = [(0x0020, 0x007e), (0x00a0, 0x00ff),]

    Sets can also be combined by inheriting from several other unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass
    """
    _ranges = []

    @classmethod
    def _get_chars_for_ranges(cls):
        """
        Return the characters covered by ``_ranges`` of every class that
        precedes ``unicode_set`` in this class's MRO, sorted by code point and
        without duplicates.
        """
        code_points = set()
        for klass in cls.__mro__:
            if klass is unicode_set:
                # Stop once the shared base class is reached.
                break
            for bounds in klass._ranges:
                first, last = bounds[0], bounds[-1]
                code_points.update(range(first, last + 1))
        return [unichr(point) for point in sorted(code_points)]

    @_lazyclassproperty
    def printables(cls):
        "every character of this set that is not whitespace"
        chars = cls._get_chars_for_ranges()
        return u''.join(ch for ch in chars if not unicode.isspace(ch))

    @_lazyclassproperty
    def alphas(cls):
        "every alphabetic character of this set"
        chars = cls._get_chars_for_ranges()
        return u''.join(ch for ch in chars if unicode.isalpha(ch))

    @_lazyclassproperty
    def nums(cls):
        "every numeric digit character of this set"
        chars = cls._get_chars_for_ranges()
        return u''.join(ch for ch in chars if unicode.isdigit(ch))

    @_lazyclassproperty
    def alphanums(cls):
        "every alphabetic character of this set followed by every digit"
        letters = cls.alphas
        digits = cls.nums
        return letters + digits
class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.
    """
    # One range running from code point 32 up to sys.maxunicode.
    _ranges = [(32, sys.maxunicode)]

    .........
    # Language-specific set nested in this namespace (the excerpt above elides
    # the definitions that precede it).
    # NOTE(review): the original note said to instantiate this class to
    # configure pyparsing's unicode characters; nothing shown here defines
    # __init__, so instantiation presumably has no effect and the class
    # attributes (e.g. Chinese.alphanums) are what callers should read --
    # confirm against the full pyparsing source.
    class Chinese(unicode_set): 
        "Unicode set for Chinese Unicode Character Range"
        _ranges = [(0x4e00, 0x9fff), (0x3000, 0x303f),]

使用

import pyparsing as pp

# The character strings of a unicode_set are class-level attributes, so they
# are read straight off the class; creating an instance (as the original
# example did) configures nothing.  The Chinese set is combined with the ASCII
# alphanumerics because the sample input mixes both scripts.
chinese_word_chars = pp.pyparsing_unicode.Chinese.alphanums + pp.alphanums
tt = pp.Word(chinese_word_chars)
print(tt.parseString("你好ss11"))  # >> ['你好ss11']