# 文本清洗 — text cleaning (section header; was a bare expression that would raise NameError)
'''
代码来源https://blog.csdn.net/qq_43814415/article/details/119517978?spm=1001.2101.3001.6650.15&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-15-119517978-blog-122389948.pc_relevant_recovery_v2&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-15-119517978-blog-122389948.pc_relevant_recovery_v2&utm_relevant_index=16
'''
def clean(line):
    """Clean one line of Weibo text.

    Removes, in order: the literal 'O网页链接' link marker, rule lines and
    circled digits, #...# topic tags, 【...】 bracketed headlines, @mentions,
    astral-plane emoji, parenthesized kaomoji, 'L...的微博视频' video links,
    [xxx] emoticon codes, and finally a long list of literal junk substrings.

    Args:
        line: raw text of a single post/comment.

    Returns:
        The cleaned string.
    """
    import re  # local import: no module-level import block is visible in this file

    # Literal substrings stripped verbatim at the end (emoji, kaomoji, boilerplate).
    rep=['【】','【','】','👍','🤝',
    '🐮','🙏','🇨🇳','👏','❤️','………','🐰','...、、',',,','..','💪','🤓',
    '⚕️','👩','🙃','😇','🍺','🐂','🙌🏻','😂','📖','😭','✧٩(ˊωˋ*)و✧','🦐','????','//','😊','💰','😜','😯',
    '(ღ˘⌣˘ღ)','✧\٩(눈౪눈)و//✧','🌎','🍀','🐴',
    '🌻','🌱','🌱','🌻','🙈','(ง•̀_•́)ง!','🉑️','💩',
    '🐎','⊙∀⊙!','🙊','【?','+1','😄','🙁','👇🏻','📚','🙇',
    '🙋','!!!!','🎉','\(^▽^)/','👌','🆒','🏻',
    '🙉','🎵','🎈','🎊','0371-12345','☕️','🌞','😳','👻','🐶','👄','\U0001f92e\U0001f92e','😔','+1','🛀','🐸','🐷','➕1',
    '🌚','::','💉','√','x','!!!','🙅','♂️','💊','👋','o(^o^)o','mei\u2006sha\u2006shi','💉','😪','😱',
    '🤗','关注','……','(((╹д╹;)))','⚠️','Ծ‸Ծ','⛽️','😓','🐵',
    '🙄️','🌕','…','😋','[]','[',']','→_→','💞','😨','"','😁','ฅ۶•ﻌ•♡','😰','🎙️',
    '🤧','😫','(ง•̀_•́)ง','😁','✊','🚬','😤','👻','😣',':','😷','(*^▽^)/★*☆','🐁','🐔','😘','🍋','(✪▽✪)','(❁´ω`❁)','1⃣3⃣','(^_^)/','☀️',
    '🎁','😅','🌹','🏠','→_→','🙂','✨','❄️','•','🌤','💓','🔨','👏','😏','⊙∀⊙!','👍','✌(̿▀̿\u2009̿Ĺ̯̿̿▀̿̿)✌',
    '😊','👆','💤','😘','😊','😴','😉','🌟','♡♪..𝙜𝙤𝙤𝙙𝙣𝙞𝙜𝙝𝙩•͈ᴗ•͈✩‧₊˚','👪','💰','😎','🍀','🛍','🖕🏼','😂','(✪▽✪)','🍋','🍅','👀','♂️','🙋🏻','✌️','🥳',' ̄ ̄)σ',
    '😒','😉','🦀','💖','✊','💪','🙄','🎣','🌾','✔️','😡','😌','🔥','❤','🏼','🤭','🌿','丨','✅','🏥','ノ','☀','5⃣⏺1⃣0⃣','🚣','🎣','🤯','🌺',
    '🌸',
    ]
    pattern_0 = re.compile(r'#.*?#')                         # weibo topic tags
    pattern_1 = re.compile(r'【.*?】')                        # bracketed headlines
    pattern_2 = re.compile(r'肺炎@([\u4e00-\u9fa5\w\-]+)')    # special case: 肺炎@环球时报
    pattern_3 = re.compile(r'@([\u4e00-\u9fa5\w\-]+)')        # plain @mentions
    # Astral-plane emoji plus surrogate ranges (kept as in the original).
    pattern_4 = re.compile(u'[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFF]')
    # FIX: the original compiled '(.*?)' — unescaped parens form a capture
    # group around an empty-minimal match, so the substitution was a no-op.
    # The intent ("match some kaomoji") was literal parentheses.
    pattern_5 = re.compile(r'\(.*?\)')
    pattern_7 = re.compile(r'L.*?的微博视频')                  # weibo video link text
    pattern_8 = re.compile(r'\(.*?\)')                        # second pass for parens (was also '(.*?)')

    line = line.replace('O网页链接', '')
    line = line.replace('-----', '')
    for mark in ('①', '②', '③', '④'):
        line = line.replace(mark, '')
    line = line.replace('>>', '')

    line = pattern_0.sub('', line)   # remove topics
    line = pattern_1.sub('', line)   # remove 【】blocks
    line = pattern_2.sub('', line)   # remove 肺炎@...
    line = pattern_3.sub('', line)   # remove @mentions
    line = pattern_4.sub('', line)   # remove emoji
    line = pattern_5.sub('', line)   # remove (kaomoji)
    line = pattern_7.sub('', line)   # remove video links
    line = pattern_8.sub('', line)   # second paren pass
    line = re.sub(r'\[\S+\]', '', line)  # remove [xxx] emoticon codes

    for junk in rep:
        line = line.replace(junk, '')
    return line
# 分词 — word segmentation (section header; was a bare expression that would raise NameError)
def seg_sentence(sentence):
    """Segment a (cleaned) sentence with jieba and normalize the tokens.

    Steps: strip digits/periods, load the user dictionary, cut in precise
    mode, drop stopwords and single-character tokens, map synonyms to a
    canonical form, and return the tokens joined by single spaces (with a
    trailing space, matching the original output format).

    Args:
        sentence: input text.

    Returns:
        Space-separated token string; '' when no token survives filtering.
    """
    import re
    import jieba  # third-party; imported locally because no module import block is visible

    sentence = re.sub(r'[0-9\.]+', '', sentence)  # drop digits and periods
    # NOTE(review): the dictionary and word lists below are reloaded on every
    # call — hoist to module level if this becomes a hot path.
    jieba.load_userdict('自建词表.txt')  # load the custom user dictionary
    # NOTE(review): use_paddle expects a bool; 10 is merely truthy and jieba
    # falls back to precise mode when paddle is unavailable — confirm intent.
    words = jieba.cut(sentence.strip(), cut_all=False, use_paddle=10)
    stopwords = stopwordslist('停用词表.txt')  # stopword list (project helper)
    synwords = synwordslist('近义词表.txt')    # synonym map (project helper)

    kept = []
    for word in words:
        # Keep only multi-character, non-stopword tokens. (The original also
        # tested word != '\t', which is redundant: '\t' has length 1 and is
        # already excluded by the length check.)
        if word in stopwords or len(word) <= 1:
            continue
        kept.append(synwords.get(word, word))  # canonicalize synonyms

    # Join with a trailing space to match the original 'outstr += word + " "'
    # concatenation (and avoid its quadratic string building).
    return ''.join(w + ' ' for w in kept)