Python实现Flesch阅读易读性公式计算

Shy960418

已于 2024-01-03 09:09:57 修改

阅读量722

点赞数 1

分类专栏：深度学习文章标签： python 开发语言深度学习

于 2023-05-09 16:03:31 首次发布

本文链接：https://blog.csdn.net/m0_37134868/article/details/130582264

版权

深度学习专栏收录该内容

20 篇文章 0 订阅

订阅专栏

计算英文文本的可读性：

# 1.计算单词数
# 2.计算句子数
# 3.计算音节数
# 计算RE值
# RE = 206.835 - （1.015 x ASL） - （84.6 x ASW）
# RE =可读性缓解
# ASL =平均句子长度（即单词数除以句子数）
# ASW =每个单词的平均音节数（即，音节数除以单词数）
import re
import pronouncing


def word_list(filename):
    '''返回单词列表'''
    try:
        with open(filename, 'r', encoding='UTF-8') as f:
            content = f.read()
    except FileNotFoundError:
        errmsg = filename + '文件不存在'
        print(errmsg)
    else:
        word_re = re.compile(r'[^A-Za-z’\']+')
        words = word_re.split(content.lower())
    return words


def sentence_count(filename):
    '''计算句子长度'''
    try:
        with open(filename, 'r', encoding='UTF-8') as f:
            content = f.read()
    except FileNotFoundError:
        errmsg = filename + '文件不存在'
    else:
        point_re = re.compile(r'\.')
        point = point_re.split(content)
        # print('句子长度：' + str(point))
        return (len(point))

def get_pronouncing_num(word):
    '''计算单词音节数'''
    # https://pronouncing.readthedocs.io/en/latest/tutorial.html#counting-syllables
    try:
        pronunciation_list = pronouncing.phones_for_word(word)
        num = pronouncing.syllable_count(pronunciation_list[0])
    except Exception as e:
        print('计算音节数异常:异常单词:"' + word + '"')
        return 1
    else:
        return num


def get_pronouncing_nums(words):
    '''计算文本音节总数'''
    counts = 0
    for word in words:
        counts += get_pronouncing_num(word)
    return counts


# 计算RE值
# RE = 206.835 - （1.015 x ASL） - （84.6 x ASW）
# RE =可读性缓解
# ASL =平均句子长度（即单词数除以句子数）
# ASW =每个单词的平均音节数（即，音节数除以单词数）

if __name__ == '__main__':
    filename = 'detail.txt'
    # 求ASL  单词数/句子数
    word_num = len(word_list(filename))
    sentence_num = sentence_count(filename)
    print(str(word_num) + ',' + str(sentence_num))
    ASL = word_num / sentence_num

    # 求ASW  音节数/单词数     pronouncing_num/word_num
    words = word_list(filename)
    print(len(words))
    pronouncing_nums = get_pronouncing_nums(words)
    ASW = pronouncing_nums / word_num

    # 求RE = 206.835 - （1.015 x ASL） - （84.6 x ASW）
    RE = 206.835 - (1.015 * ASL) - (84.6 * ASW)

    print('ASW:' + str(ASW))
    print('ASL：' + str(ASL))
    print('RE:' + str(RE))

计算中文文本的可读性：

# 计算文本可读性
import re
import jieba
import cntext as ct
import numpy as np
STOPWORDS_zh = ct.load_pkl_dict(file='STOPWORDS.pkl')['STOPWORDS']['chinese']
STOPWORDS_en = ct.load_pkl_dict(file='STOPWORDS.pkl')['STOPWORDS']['english']
ADV_words = ct.load_pkl_dict(file='ADV_CONJ.pkl')['ADV']
CONJ_words = ct.load_pkl_dict(file='ADV_CONJ.pkl')['CONJ']

# 中文分词
def cn_seg_sent(text):
    #split the chinese text into sentences
    text = re.sub('([。！；？;\?])([^”’])', "[[end]]", text)  # 单字符断句符
    text = re.sub('([。！？\?][”’])([^，。！？\?])', "[[end]]", text)
    text = re.sub('\s', '', text)
    # 如果双引号前有终止符，那么双引号才是句子的终点，把分句符\n放到双引号后，注意前面的几句都小心保留了双引号
    return text.split("[[end]]")

def readability(text, zh_advconj=None, lang='chinese'):
    """
    text readability, the larger the indicator, the higher the complexity of the article and the worse the readability.
    :param text: text string
    :param zh_advconj Chinese conjunctions and adverbs, receive list data type. By default, the built-in dictionary of cntext is used
    :param language: "chinese" or "english"; default is "chinese"
    ------------
    【English readability】english_readability = 4.71 x (characters/words) + 0.5 x (words/sentences) - 21.43；
    【Chinese readability】  Refer 【徐巍,姚振晔,陈冬华.中文年报可读性：衡量与检验[J].会计研究,2021(03):28-44.】
                 readability1  ---每个分句中的平均字数
                 readability2  ---每个句子中副词和连词所占的比例
                 readability3  ---参考Fog Index， readability3=(readability1+readability2)×0.5
                 以上三个指标越大，都说明文本的复杂程度越高，可读性越差。
    """
    if lang=='english':
        text = text.lower()
        #将浮点数、整数替换为num
        text = re.sub('\d+\.\d+|\.\d+', 'num', text)
        num_of_characters = len(text)
        #英文分词
        rgx = re.compile("(?:(?:[^a-zA-Z]+')|(?:'[^a-zA-Z]+))|(?:[^a-zA-Z']+)")
        num_of_words = len(re.split(rgx, text))
        #分句
        num_of_sentences = len(re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text))
        ari = (
                4.71 * (num_of_characters / num_of_words)
                + 0.5 * (num_of_words / num_of_sentences)
                - 21.43
        )
        return {"readability": ari}
    if lang=='chinese':
        if zh_advconj:
            adv_conj_words = zh_advconj
        else:
            adv_conj_words = set(ADV_words + CONJ_words)
        zi_num_per_sent = []
        adv_conj_ratio_per_sent = []
        text = re.sub('\d+\.\d+|\.\d+', 'num', text)
        #【分句】
        sentences = cn_seg_sent(text)
        for sent in sentences:
            adv_conj_num = 0
            zi_num_per_sent.append(len(sent))
            words = list(jieba.cut(sent))
            for w in words:
                if w in adv_conj_words:
                    adv_conj_num+=1
            adv_conj_ratio_per_sent.append(adv_conj_num/(len(words)+1))
        readability1 = np.mean(zi_num_per_sent)
        readability2 = np.mean(adv_conj_ratio_per_sent)
        readability3 = (readability1+readability2)*0.5
        return {'readability1': readability1,
                'readability2': readability2,
                'readability3': readability3}
text1 = "我是个小孩子，我想快快乐乐地成长，慢慢长大。"
text2 = '赵客缦胡缨，吴钩霜雪明。银鞍照白马，飒沓如流星。十步杀一人，千里不留行。事了拂衣去，深藏身与名。闲过信陵饮，脱剑膝前横。将炙啖朱亥，持觞劝侯嬴。三杯吐然诺，五岳倒为轻。眼花耳热后，意气素霓生。救赵挥金槌，邯郸先震惊。千秋二壮士，烜赫大梁城。纵死侠骨香，不惭世上英。谁能书阁下，白首太玄经。'
print(readability(text1,lang='chinese'))
print(readability(text2,lang='chinese'))