NLP数据清洗
Removing URLs
def remove_URL(text: str) -> str:
    """Return ``text`` with every URL deleted.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.  Only the URL itself is removed;
    whitespace and punctuation that precede it are left in place.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with all matching URLs removed.
    """
    # \S+ consumes up to the next whitespace, so a URL ends at the first space.
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub('', text)
# Demo: strip the link from a sample sentence.
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"
remove_URL(example)
# Output: 'New competition launched :'
Removing HTML tags
def remove_html(text):
html=re.compile(r'<.*?>')
return html.sub(r'',text)
# Demo: a small HTML fragment; only the text between the tags should remain.
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""
print(remove_html(example))
# Output (the newlines between the tags are kept):
# Real or Fake
# Kaggle
# getting started
Removing Emojis
def remove_emoji(text: str) -> str:
    """Return ``text`` with emoji characters deleted.

    The character class below is a curated list of emoji-bearing code-point
    ranges.  It deliberately avoids one wide range spanning U+24C2..U+1F251,
    because that span also contains CJK ideographs, kana, Hangul and many
    other scripts, which would be stripped from ordinary text.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with every run of listed characters removed.
        Surrounding whitespace is left untouched.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U000024C2"             # circled M
        "\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
        "\U0000FE0F"             # variation selector-16 that follows some emoji
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)
# Demo on a single string, then the same cleaner applied to every row of df['text'].
remove_emoji("Omg another Earthquake 😔😔")
# Output: 'Omg another Earthquake ' (the space before the emoji is kept)
df['text'] = df['text'].apply(remove_emoji)
Removing punctuation
def remove_punct(text: str) -> str:
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without those punctuation characters; all other
        characters, including whitespace, are kept.
    """
    # Three-argument maketrans: the third string lists characters to delete.
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)
# Demo: the '#' is punctuation and is dropped; letters and spaces are kept.
example="I am a #king"
print(remove_punct(example))
# Output: I am a king
Spelling Correction
# Third-party spell checker (presumably the pyspellchecker package — confirm).
# One shared instance is created here and reused by correct_spellings().
from spellchecker import SpellChecker
spell = SpellChecker()
def correct_spellings(text: str) -> str:
    """Return ``text`` with words unknown to the spell checker replaced.

    The text is split on whitespace.  Each word that the module-level
    ``spell`` checker reports as unknown is replaced by its suggested
    correction; all other words are kept unchanged.  The result is re-joined
    with single spaces, so the original spacing is not preserved.

    Args:
        text: The string to correct.

    Returns:
        The corrected words joined by single spaces.
    """
    words = text.split()  # split once and reuse for both the lookup and the loop
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() may return None when it has no candidate; fall back
            # to the original word so that join() never receives a non-string.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    # NOTE(review): membership is tested with the word exactly as written; if
    # spell.unknown() normalises case, capitalised misspellings are left
    # unchanged — confirm whether that is intended.
    return " ".join(corrected_text)
# Demo on a single string, then the same correction applied to every row of df['text'].
text = "corect me plese"
correct_spellings(text)
# Expected output: "correct me please" — both misspelled words are replaced
# (the exact suggestions depend on the spell checker's dictionary).
df['text'] = df['text'].apply(correct_spellings)
最后附上小师姐给的语料预处理过程: