def clean_text(text):
    """Normalize a tweet-like string for downstream text processing.

    Steps, in order:
      1. Lowercase the text.
      2. Collapse runs of ``!``, ``?`` and ``.`` to a single character.
      3. Drop apostrophes and replace ``&`` with ``and``.
      4. Remove ``t.co`` short links.
      5. Strip punctuation other than ``. ! # ?`` (brackets and
         parentheses are not in the stripped set either).
      6. Collapse whitespace runs to one space and trim both ends.
      7. Replace each run of emoji characters with the token ``EMOJI``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.lower()

    # Collapse repeated sentence punctuation ("!!!" -> "!").
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\.+', '.', text)

    text = text.replace("'", "")

    # Match a literal "&" only. The previous pattern "&?" also matched the
    # empty string, which inserted "and" between every character.
    text = text.replace('&', 'and')

    # Remove t.co short links; must run before ":" and "/" are stripped.
    text = re.sub(r"https?://t\.co/[A-Za-z0-9]+", "", text)

    # Remove remaining punctuation except . ! # ?
    # The hyphen is escaped so that ",-/" is not read as a character range
    # (that range silently included "." as well).
    text = re.sub(r'[:"$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)

    # Collapse whitespace last so the gaps left by the removals above do not
    # survive as double spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    # NOTE(review): the final range spans U+24C2..U+1F251, which also covers
    # CJK, Hangul and other non-emoji characters, so those are replaced with
    # "EMOJI" as well. Left unchanged here; confirm the intended emoji set
    # before narrowing it.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    # The token is inserted after lowercasing, so it stays upper-case.
    text = emoji_pattern.sub('EMOJI', text)
    return text
清理文本
最新推荐文章于 2024-02-28 20:51:32 发布