文本清洗及分词

最新推荐文章于 2024-04-04 11:10:07 发布

照崴

最新推荐文章于 2024-04-04 11:10:07 发布

阅读量412

点赞数

文章标签： python 开发语言

本文链接：https://blog.csdn.net/weixin_45152740/article/details/127973235

版权

文本清洗

'''
代码来源https://blog.csdn.net/qq_43814415/article/details/119517978?spm=1001.2101.3001.6650.15&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-15-119517978-blog-122389948.pc_relevant_recovery_v2&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-15-119517978-blog-122389948.pc_relevant_recovery_v2&utm_relevant_index=16
'''
# Literal fragments removed first, before any regex pass runs.
_CLEAN_PREFILTER = ('O网页链接', '-----', '①', '②', '③', '④', '>>')

# Regex passes, applied in exactly this order; every match is deleted.
# NOTE(review): the original also ran re.sub('(.*?)', '', line), commented as
# "match some kaomoji". With unescaped parentheses that pattern only ever
# matches the empty string, so it never changed the text and is omitted here.
# It presumably was meant to be r'\(.*?\)'; enabling that would change the
# output (ASCII-parenthesised text would vanish and several kaomoji literals
# below would stop matching), so it is left for the owner to decide.
_CLEAN_PATTERNS = (
    r'#.*?#',                        # "#topic#" hashtags
    r'【.*?】',                       # text enclosed in 【 】
    r'肺炎@([\u4e00-\u9fa5\w\-]+)',   # "肺炎" glued to an @mention, e.g. 肺炎@环球时报
    r'@([\u4e00-\u9fa5\w\-]+)',      # remaining @mentions
    # Astral-plane code points (most emoji) and any lone surrogates.
    '[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFF]',
    r'L.*?的微博视频',                 # "L<name>的微博视频" video captions
    r'（.*?）',                       # text in full-width parentheses
    r'\[\S+\]',                      # "[emoticon]" style tags
)

# Literal fragments removed last, one after another. Order matters: longer
# fragments precede their own substrings ('………' before '……' before '…',
# '！！！！' before '！！！', '❤️' before '❤').
# NOTE(review): the list contains the ASCII letter 'x', so every 'x' in the
# input is stripped as well — confirm that this is intended.
_CLEAN_LITERALS = (
    '【】','【','】','👍','🤝',
    '🐮','🙏','🇨🇳','👏','❤️','………','🐰','...、、','，，','..','💪','🤓',
    '⚕️','👩','🙃','😇','🍺','🐂','🙌🏻','😂','📖','😭','✧٩(ˊωˋ*)و✧','🦐','？？？？','//','😊','💰','😜','😯',
    '(ღ˘⌣˘ღ)','✧＼٩(눈౪눈)و/／✧','🌎','🍀','🐴',
    '🌻','🌱','🌱','🌻','🙈','(ง•̀_•́)ง！','🉑️','💩',
    '🐎','⊙∀⊙！','🙊','【？','+1','😄','🙁','👇🏻','📚','🙇',
    '🙋','！！！！','🎉','＼(^▽^)／','👌','🆒','🏻',
    '🙉','🎵','🎈','🎊','0371-12345','☕️','🌞','😳','👻','🐶','👄','\U0001f92e\U0001f92e','😔','＋1','🛀','🐸','🐷','➕1',
    '🌚','：：','💉','√','x','！！！','🙅','♂️','💊','👋','o(^o^)o','mei\u2006sha\u2006shi','💉','😪','😱',
    '🤗','关注','……','(((╹д╹;)))','⚠️','Ծ‸Ծ','⛽️','😓','🐵',
    '🙄️','🌕','…','😋','[]','[',']','→_→','💞','😨','&quot;','😁','ฅ۶•ﻌ•♡','😰','🎙️',
    '🤧','😫','(ง•̀_•́)ง','😁','✊','🚬','😤','👻','😣','：','😷','(*^▽^)/★*☆','🐁','🐔','😘','🍋','(✪▽✪)','(❁´ω`❁)','1⃣3⃣','(^_^)／','☀️',
    '🎁','😅','🌹','🏠','→_→','🙂','✨','❄️','•','🌤','💓','🔨','👏','😏','⊙∀⊙！','👍','✌(̿▀̿\u2009̿Ĺ̯̿̿▀̿̿)✌',
    '😊','👆','💤','😘','😊','😴','😉','🌟','♡♪..𝙜𝙤𝙤𝙙𝙣𝙞𝙜𝙝𝙩•͈ᴗ•͈✩‧₊˚','👪','💰','😎','🍀','🛍','🖕🏼','😂','(✪▽✪)','🍋','🍅','👀','♂️','🙋🏻','✌️','🥳','￣￣)σ',
    '😒','😉','🦀','💖','✊','💪','🙄','🎣','🌾','✔️','😡','😌','🔥','❤','🏼','🤭','🌿','丨','✅','🏥','ﾉ','☀','5⃣⏺1⃣0⃣','🚣','🎣','🤯','🌺',
    '🌸',
)


def clean(line):
    """Strip Weibo-style noise from one piece of text and return the result.

    Three passes run in a fixed order:

    1. the literal fragments in ``_CLEAN_PREFILTER`` are removed;
    2. every match of each regex in ``_CLEAN_PATTERNS`` is removed
       (hashtags, 【…】 blocks, @mentions, emoji, video captions,
       full-width parenthesised text, ``[...]`` tags);
    3. the literal fragments in ``_CLEAN_LITERALS`` are removed.

    Args:
        line: the text to clean (a ``str``).

    Returns:
        The cleaned text. Surrounding whitespace is not trimmed.
    """
    for fragment in _CLEAN_PREFILTER:
        line = line.replace(fragment, '')
    # Patterns are passed as strings; the re module caches the compiled form,
    # so nothing is recompiled on repeated calls.
    for pattern in _CLEAN_PATTERNS:
        line = re.sub(pattern, '', line)
    for fragment in _CLEAN_LITERALS:
        line = line.replace(fragment, '')
    return line

分词

def seg_sentence(sentence):
    """Segment *sentence* with jieba and return the kept words as one string.

    Runs of ASCII digits and dots are deleted first. The text is then cut with
    jieba (``cut_all=False``), and each token is kept only if it is longer
    than one character and is not in the stop-word collection. A kept token
    that is a key of the synonym mapping is replaced by the mapped value.

    Args:
        sentence: the text to segment (a ``str``).

    Returns:
        The kept words, each followed by a single space (so a non-empty
        result ends with a trailing space); ``''`` when nothing is kept.
    """
    sentence = re.sub(r'[0-9.]+', '', sentence)

    # NOTE(review): the user dictionary, stop words and synonyms are re-read
    # on every call; callers processing many sentences may want to cache them.
    jieba.load_userdict('自建词表.txt')  # custom vocabulary
    # The original passed use_paddle=10; it is a truthy flag, so True is the
    # equivalent boolean.
    # NOTE(review): paddle mode presumably only takes effect when paddle
    # support is installed and enabled — confirm against the jieba version.
    tokens = jieba.cut(sentence.strip(), cut_all=False, use_paddle=True)
    stopwords = stopwordslist('停用词表.txt')  # stop-word file
    synwords = synwordslist('近义词表.txt')  # synonym file; assumed dict-like

    # Requiring len > 1 already excludes a lone '\t', so the original's
    # separate tab check is not needed.
    kept = [
        synwords.get(token, token)
        for token in tokens
        if len(token) > 1 and token not in stopwords
    ]
    return ''.join(word + ' ' for word in kept)