# NOTE: This exercise uses only the labeled training data; the unlabeled data
# is not used yet — support for it will be added later.
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer #计数
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix #评估准则
import nltk
from nltk.corpus import stopwords
# ---------------------------------------------------------------------------
# Load the labeled training data (tab-separated; backslash is the escape
# character inside quoted review text).
# ---------------------------------------------------------------------------
datafile = os.path.join('H:/word2vect_3data/labeledTrainData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
# print('Number of reviews:{}'.format(len(df)))
# Review preprocessing pipeline applied below:
#   1. strip HTML tags
#   2. remove punctuation / non-letters
#   3. tokenize
#   4. drop stopwords
#   5. re-join into a cleaned sentence
def display(text, title):
    """Debug helper: print *title*, a divider line, then *text*."""
    print(f"{title}\n\n----------分割线----------\n\n{text}")
# Keep the first raw review around for ad-hoc inspection with display().
raw_example = df.review[0]
# display(raw_example, 'raw data')

# Build the stopword set once at module level: stopwords.words('english')
# returns a fresh list on every call, and set membership is O(1), so this
# keeps clean_text() from re-reading the corpus for every review.
# (The commented-out step-by-step exploration that used to live here
# duplicated clean_text() exactly and has been removed.)
eng_stopwords = set(stopwords.words('english'))
def clean_text(text):
    """Normalize one review into a single cleaned string.

    Steps: strip HTML tags, replace every non-letter with a space,
    lowercase and tokenize, drop English stopwords, then re-join the
    surviving tokens with single spaces (join with ' ' — otherwise the
    words would run together).
    """
    stripped = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', stripped)
    tokens = letters_only.lower().split()
    kept = [tok for tok in tokens if tok not in eng_stopwords]
    return ' '.join(kept)
# ---------------------------------------------------------------------------
# Feature construction: cleaned reviews -> bag-of-words count matrix.
# ---------------------------------------------------------------------------
df['clean_review'] = df.review.apply(clean_text)

# Vocabulary is capped at the 5000 most frequent terms.
vectorizer = CountVectorizer(max_features=5000)
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
# print(train_data_features.shape)  # (25000, 5000)
# ---------------------------------------------------------------------------
# Train the classifier: a 100-tree random forest on the count features.
# ---------------------------------------------------------------------------
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_features, df.sentiment)

# Sanity check only: confusion matrix of in-sample predictions
# (evaluated on the training set, so this estimate is optimistic).
predict_values = confusion_matrix(df.sentiment, forest.predict(train_data_features))
# print(predict_values)
# ---------------------------------------------------------------------------
# Read the test data, apply the identical preprocessing, and write the
# submission file.
# ---------------------------------------------------------------------------
datafile_test = os.path.join('H:/word2vect_3data/testData.tsv')
df_test = pd.read_csv(datafile_test, sep='\t', escapechar='\\')

# Same cleaning pipeline as the training set.
df_test['clean_review'] = df_test.review.apply(clean_text)

# BUG FIX: use transform(), NOT fit_transform(). Refitting on the test set
# would rebuild the vocabulary from the test documents, so column j would no
# longer correspond to the word the forest was trained on — predictions
# would be computed against a misaligned feature matrix.
test_data_feature = vectorizer.transform(df_test.clean_review).toarray()

test_pre = forest.predict(test_data_feature)
output = pd.DataFrame({'id': df_test.id, 'sentiment': test_pre})
# index=False: the submission needs only the id and sentiment columns;
# without it pandas prepends an unnamed row-index column.
output.to_csv('H:/word2vect_3data/submission.csv', index=False)