Python实现Flesch阅读易读性公式计算

计算英文文本的可读性:

# 1.计算单词数
# 2.计算句子数
# 3.计算音节数
# 计算RE值
# RE = 206.835 - (1.015 x ASL) - (84.6 x ASW)
# RE = 阅读容易度(Reading Ease,分值越高文本越易读)
# ASL =平均句子长度(即单词数除以句子数)
# ASW =每个单词的平均音节数(即,音节数除以单词数)
import re
import pronouncing


def word_list(filename):
    """Return a list of lowercase words read from *filename*.

    Words are the runs of letters/apostrophes; everything else is a
    separator. Returns an empty list (after printing a message) when the
    file does not exist, instead of raising NameError like the original
    (which returned an unbound `words` on the error path).
    """
    try:
        with open(filename, 'r', encoding='UTF-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(filename + '文件不存在')
        return []
    word_re = re.compile(r'[^A-Za-z’\']+')
    # Filter out the empty strings re.split leaves at the edges and
    # around consecutive separators — they would inflate the word count.
    return [w for w in word_re.split(content.lower()) if w]


def sentence_count(filename):
    """Return the number of sentences in *filename*.

    A sentence is a non-blank segment between '.' characters. Fixes
    three defects of the original: the error message was built but
    never printed; the missing-file path returned None (crashing the
    caller's division); and splitting on '.' counted the empty trailing
    segment as an extra sentence.
    """
    try:
        with open(filename, 'r', encoding='UTF-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(filename + '文件不存在')
        return 0
    # Ignore blank segments (e.g. after the final period) so "A. B. C."
    # counts as 3 sentences, not 4.
    return len([s for s in content.split('.') if s.strip()])

def get_pronouncing_num(word):
    """Return the syllable count of *word* via the CMU dictionary.

    Unknown words (pronouncing returns an empty phones list) fall back
    to 1 syllable. Catches only IndexError — the failure the original
    blanket `except Exception` was actually masking — so genuine bugs
    are no longer silently converted into a count of 1.
    """
    # https://pronouncing.readthedocs.io/en/latest/tutorial.html#counting-syllables
    try:
        phones = pronouncing.phones_for_word(word)[0]
    except IndexError:
        # Word not in the CMU pronouncing dictionary.
        print('计算音节数异常:异常单词:"' + word + '"')
        return 1
    return pronouncing.syllable_count(phones)


def get_pronouncing_nums(words):
    """Return the total syllable count over all *words*."""
    return sum(get_pronouncing_num(w) for w in words)


# 计算RE值
# RE = 206.835 - (1.015 x ASL) - (84.6 x ASW)
# RE = 阅读容易度(Reading Ease,分值越高文本越易读)
# ASL =平均句子长度(即单词数除以句子数)
# ASW =每个单词的平均音节数(即,音节数除以单词数)

if __name__ == '__main__':
    filename = 'detail.txt'
    # Read the word list once and reuse it — the original called
    # word_list() twice, reading the file twice for the same result.
    words = word_list(filename)
    word_num = len(words)
    sentence_num = sentence_count(filename)
    print(str(word_num) + ',' + str(sentence_num))

    if word_num == 0 or sentence_num == 0:
        # Missing/empty file: avoid the ZeroDivisionError the original
        # would raise here.
        print('无法计算:单词数或句子数为0')
    else:
        # ASL = average sentence length (words / sentences)
        ASL = word_num / sentence_num
        print(len(words))

        # ASW = average syllables per word (syllables / words)
        pronouncing_nums = get_pronouncing_nums(words)
        ASW = pronouncing_nums / word_num

        # Flesch Reading Ease: RE = 206.835 - (1.015 x ASL) - (84.6 x ASW)
        RE = 206.835 - (1.015 * ASL) - (84.6 * ASW)

        print('ASW:' + str(ASW))
        print('ASL:' + str(ASL))
        print('RE:' + str(RE))


计算中文文本的可读性:

# 计算文本可读性
import re
import jieba
import cntext as ct
import numpy as np
# Built-in cntext dictionaries: stop-word lists plus the adverb (ADV) and
# conjunction (CONJ) word lists used by the Chinese readability measure.
STOPWORDS_zh = ct.load_pkl_dict(file='STOPWORDS.pkl')['STOPWORDS']['chinese']
STOPWORDS_en = ct.load_pkl_dict(file='STOPWORDS.pkl')['STOPWORDS']['english']
ADV_words = ct.load_pkl_dict(file='ADV_CONJ.pkl')['ADV']
CONJ_words = ct.load_pkl_dict(file='ADV_CONJ.pkl')['CONJ']

# 中文分词
def cn_seg_sent(text):
    """Split Chinese *text* into a list of sentences.

    Sentence boundaries are Chinese/ASCII terminators (。!;?), with a
    second pass for a terminator followed by a closing quote (”’), so
    the quote stays attached to its sentence. Whitespace is stripped.

    Bug fix: the original replacement string was the bare marker
    "[[end]]", which DELETED the matched terminator and the first
    character of the next sentence. The backreferences \1 and \2
    keep both and insert the marker between them.
    """
    text = re.sub(r'([。!;?;\?])([^”’])', r'\1[[end]]\2', text)  # single-char terminators
    text = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1[[end]]\2', text)  # terminator + closing quote
    text = re.sub(r'\s', '', text)
    return text.split("[[end]]")

def readability(text, zh_advconj=None, lang='chinese'):
    """
    Text readability: the larger the indicator, the higher the complexity
    of the article and the worse the readability.
    :param text: text string
    :param zh_advconj: Chinese adverbs and conjunctions, a list. By default
        the built-in cntext dictionary (ADV_words + CONJ_words) is used.
    :param lang: "chinese" or "english"; default is "chinese"
    ------------
    [English] english_readability = 4.71 x (characters/words)
        + 0.5 x (words/sentences) - 21.43  (ARI-style formula)
    [Chinese] Refers to Xu Wei, Yao Zhenye, Chen Donghua, "Readability of
        Chinese annual reports" [J]. Accounting Research, 2021(03):28-44.
        readability1 -- mean number of characters per clause
        readability2 -- mean proportion of adverbs/conjunctions per clause
        readability3 -- Fog-Index-style: (readability1 + readability2) * 0.5
    The larger each indicator, the more complex and less readable the text.
    """
    if lang=='english':
        text = text.lower()
        # Replace floats (e.g. "3.14", ".5") with the token 'num' so the
        # decimal point is not counted as a sentence boundary.
        text = re.sub('\d+\.\d+|\.\d+', 'num', text)
        num_of_characters = len(text)
        # English tokenization: split on non-letter runs; apostrophes are
        # kept only when letter-adjacent. NOTE(review): re.split can yield
        # empty edge strings, which slightly inflates the word count.
        rgx = re.compile("(?:(?:[^a-zA-Z]+')|(?:'[^a-zA-Z]+))|(?:[^a-zA-Z']+)")
        num_of_words = len(re.split(rgx, text))
        # Sentence split on '.'/'?' followed by whitespace, excluding
        # abbreviation-like contexts (e.g. "e.g.", "U.S.") via lookbehinds.
        num_of_sentences = len(re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text))
        ari = (
                4.71 * (num_of_characters / num_of_words)
                + 0.5 * (num_of_words / num_of_sentences)
                - 21.43
        )
        return {"readability": ari}
    if lang=='chinese':
        if zh_advconj:
            adv_conj_words = zh_advconj
        else:
            adv_conj_words = set(ADV_words + CONJ_words)
        zi_num_per_sent = []           # characters per clause
        adv_conj_ratio_per_sent = []   # adverb/conjunction ratio per clause
        # Same float -> 'num' normalization as the English branch.
        text = re.sub('\d+\.\d+|\.\d+', 'num', text)
        # Split into clauses/sentences.
        sentences = cn_seg_sent(text)
        for sent in sentences:
            adv_conj_num = 0
            zi_num_per_sent.append(len(sent))
            words = list(jieba.cut(sent))
            for w in words:
                if w in adv_conj_words:
                    adv_conj_num+=1
            # +1 in the denominator avoids division by zero on empty clauses
            # (add-one smoothing).
            adv_conj_ratio_per_sent.append(adv_conj_num/(len(words)+1))
        readability1 = np.mean(zi_num_per_sent)
        readability2 = np.mean(adv_conj_ratio_per_sent)
        readability3 = (readability1+readability2)*0.5
        return {'readability1': readability1,
                'readability2': readability2,
                'readability3': readability3}
# Demo: a plain modern sentence vs. classical poetry (李白《侠客行》) —
# the poetry should score as less readable.
text1 = "我是个小孩子,我想快快乐乐地成长,慢慢长大。"
text2 = '赵客缦胡缨,吴钩霜雪明。银鞍照白马,飒沓如流星。十步杀一人,千里不留行。事了拂衣去,深藏身与名。闲过信陵饮,脱剑膝前横。将炙啖朱亥,持觞劝侯嬴。三杯吐然诺,五岳倒为轻。眼花耳热后,意气素霓生。救赵挥金槌,邯郸先震惊。千秋二壮士,烜赫大梁城。纵死侠骨香,不惭世上英。谁能书阁下,白首太玄经。'
print(readability(text1,lang='chinese'))
print(readability(text2,lang='chinese'))

来源:

  • https://blog.csdn.net/granery/article/details/88912059
  • https://mp.weixin.qq.com/s/kgqRavPtoUq3ZPLrpooSrA
  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值