1、代码
def clean_text(text, remove_stopwords=False):
    """Clean raw review text and split it into lowercase word tokens.

    The text is passed through BeautifulSoup's ``get_text()`` (``html.parser``
    backend) to keep only the textual content, every character that is not an
    ASCII letter is replaced by a space, and the result is lowercased and
    split on whitespace.

    Args:
        text: Raw text, possibly containing HTML markup.
        remove_stopwords: When true, drop every token that is found in the
            module-level ``eng_stopwords`` collection.

    Returns:
        A list of lowercase tokens; empty when no ASCII letters remain.
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Digits, punctuation and non-ASCII characters all become separators.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words


def to_review_vector(review):
    """Build a 300-dimensional feature vector for a single review.

    The review is tokenised with ``clean_text`` (stop words removed) and the
    vectors of all tokens present in the module-level ``model`` are added
    together element-wise. Tokens that ``model`` does not contain are skipped.

    NOTE(review): the result is the unnormalised *sum* of the word vectors,
    not their average, so longer reviews yield larger magnitudes. Confirm
    whether averaging was intended before changing this, because downstream
    consumers see these exact values.

    Args:
        review: Raw review text, possibly containing HTML markup.

    Returns:
        A ``pandas.Series`` of 300 floats; all zeros when no token of the
        review is found in ``model``.
    """
    # Presumably ``model`` is a word-embedding lookup whose vectors have 300
    # components; this size must match it -- TODO confirm against the model.
    vector_size = 300

    tokens = clean_text(review, remove_stopwords=True)

    # Local accumulator: nothing is published to module globals, so the
    # function keeps no state between calls.
    total = np.zeros(vector_size)
    for token in tokens:
        if token in model:
            total += np.asarray(model[token])
    return pd.Series(total)