1. re.sub(pattern, repl, string, count=0)
letters_only = re.sub("[^a-zA-Z]", " ", review_text) replaces every non-letter character in the text with a space.
pattern is the regular-expression pattern string; repl (replacement) is what each match is replaced with, and it may be either a string or a function.
string is the text to be processed, i.e. the string in which the replacements are made.
count limits how many replacements are performed; the default 0 means every match is replaced. (A short sketch follows this list.)
2. stopwords.words("english") loads NLTK's English stop-word list (also illustrated after the list).
3. CountVectorizer converts a collection of text documents into a matrix of token counts; see my other note for details (a toy example follows the list).
4. A Random Forest is used here in a rather rough way to train the model and make predictions; unlabeledTrainData is not used at all, so the results are not great.
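A minimal sketch of re.sub with made-up strings (not from the tutorial data):

import re

# count=0 (the default) replaces every match of the pattern
print(re.sub("[^a-zA-Z]", " ", "A+B=C"))              # -> "A B C"
# count=1 stops after the first replacement
print(re.sub("[^a-zA-Z]", " ", "A+B=C", count=1))     # -> "A B=C"
# repl can also be a function that receives the match object
print(re.sub("[^a-zA-Z]", lambda m: "_", "A+B=C"))    # -> "A_B_C"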
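A quick look at the stop-word list (assuming the corpus has already been fetched with nltk.download("stopwords")):

from nltk.corpus import stopwords

stops = set(stopwords.words("english"))    # common words such as "the", "is", "and"
print("the" in stops, "movie" in stops)    # True False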
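And a toy CountVectorizer example on two invented sentences, showing the vocabulary and the count matrix:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["good movie good plot", "bad movie"]
vectorizer = CountVectorizer(analyzer="word", max_features=5000)
counts = vectorizer.fit_transform(docs)    # sparse matrix of token counts
# Vocabulary is sorted alphabetically (use get_feature_names_out() on scikit-learn >= 1.0)
print(vectorizer.get_feature_names())      # ['bad', 'good', 'movie', 'plot']
print(counts.toarray())                    # [[0 2 1 1]
                                           #  [1 0 1 0]]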
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
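# The NLTK stop-word corpus must be available; if it is not, run nltk.download("stopwords") once.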
# Clean the raw review data
def review_to_words(raw_review):
    # Function to convert a raw review to a string of words.
    # The input is a single string (a raw movie review), and
    # the output is a single string (a preprocessed movie review).
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result
    return " ".join(meaningful_words)
# Load the labeled training set
def load_train_data():
    train = pd.read_csv("dataSet/labeledTrainData.tsv", header=0,
                        delimiter="\t", quoting=3)
    # Get the number of reviews based on the dataframe column size
    num_reviews = train["review"].size
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list
    for i in range(num_reviews):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        clean_train_reviews.append(review_to_words(train["review"][i]))
    return train, np.array(clean_train_reviews), np.array(train["sentiment"])
# Load the unlabeled training set
def load_unlabeled_train_data():
    unlabeled_train = pd.read_csv("dataSet/unlabeledTrainData.tsv", header=0,
                                  delimiter="\t", quoting=3)
    # Get the number of reviews based on the dataframe column size
    num_reviews = unlabeled_train["review"].size
    # Initialize an empty list to hold the clean reviews
    clean_unlabeled_train_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list
    for i in range(num_reviews):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        clean_unlabeled_train_reviews.append(review_to_words(unlabeled_train["review"][i]))
    return unlabeled_train, np.array(clean_unlabeled_train_reviews)
# Load the test set
def load_test_data():
    test = pd.read_csv("dataSet/testData.tsv", header=0,
                       delimiter="\t", quoting=3)
    # Get the number of reviews based on the dataframe column size
    num_reviews = test["review"].size
    # Initialize an empty list to hold the clean reviews
    clean_test_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list
    for i in range(num_reviews):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        clean_test_reviews.append(review_to_words(test["review"][i]))
    return test, np.array(clean_test_reviews)
# Turn cleaned reviews into bag-of-words feature vectors
def text2vec(trainArr):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool. Keep the 5000 most frequent words as features.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    # fit_transform() does two things: first, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(trainArr)
    # Numpy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()
    # Take a look at the words in the vocabulary
    # (use get_feature_names_out() on scikit-learn >= 1.0)
    vocab = vectorizer.get_feature_names()
    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)
    # Return the fitted vectorizer as well, so the test set can reuse the same vocabulary
    return vectorizer, train_data_features, vocab, dist
train, x_train, y_train = load_train_data()
unlabeled_train, x_unlabeled_train = load_unlabeled_train_data()
test, x_test = load_test_data()
vectorizer, train_data_features, vocab, dist = text2vec(x_train)
# Transform the test reviews with the vocabulary learned on the training set;
# fitting a separate vectorizer on the test set would give mismatched feature columns
test_data_features = vectorizer.transform(x_test).toarray()
# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators=100)
forest.fit(train_data_features, y_train)
result = forest.predict(test_data_features)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Bag_of_Words_model1.csv", index=False, quoting=3)