代码实例:
1、导入库并读取数据文件
import numpy as np
import pandas as pd
import re
from gensim import corpora,models,similarities
from nltk.corpus import stopwords
# Load the email dump and keep only the id and body-text columns,
# discarding every row in which either of the two values is missing.
df = pd.read_csv('H:/HillaryEmails.csv')[['Id', 'ExtractedBodyText']].dropna()
2、文本处理
'''
文本预处理
'''
def clean_email_text(text):
text = text.replace(&#