# IMDB movie-review score prediction competition code
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 2 11:11:39 2017
@author: yichengfan
"""
import pandas as pd
# Load the labelled training data and the test data; both files are
# tab-separated.
train = pd.read_csv(r'F:\TS\03_other_parts\kaggle\02_IMDB\02_data\labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv(r'F:\TS\03_other_parts\kaggle\02_IMDB\02_data\testData.tsv', delimiter='\t')

# Look at the first few rows of each data set.  A bare expression like this
# only displays its value in an interactive session (REPL / notebook).
# The recorded output is kept as comments rather than string literals so the
# backslashes in the sample text are not parsed as escape sequences.
train.head()
# Recorded output:
#   id sentiment review
#   0 5814_8 1 With all this stuff going down at the moment w...
#   1 2381_9 1 \The Classic War of the Worlds\" by Timothy Hi...
#   2 7759_3 0 The film starts with a manager (Nicholas Bell)...
#   3 3630_4 0 It must be assumed that those who praised this...
#   4 9495_8 1 Superbly trashy and wondrously unpretentious 8...
test.head()
# Recorded output:
#   id review
#   0 12311_10 Naturally in a film who's main themes are of m...
#   1 8348_2 This movie is a disaster within a disaster fil...
#   2 5828_4 All in all, this is a movie for kids. We saw i...
#   3 7186_2 Afraid of the Dark left me with the impression...
#   4 12128_7 A very accurate depiction of small time mob li...
#从bs4导入beautifulSoup用于整洁原始文本
from bs4 import BeautifulSoup
#从nltk.corpus 里导入停用词列表(nltk自然语言处理包)
from nltk.corpus import stopwords
import re
# Pre-processing for one raw review, in three steps: strip HTML markup,
# drop non-letter characters, and optionally remove English stop words.

# Matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')

# Filled on first use so the stop-word set is built once instead of being
# rebuilt from the NLTK corpus for every single review.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def review_to_text(review, remove_stopwords):
    """Convert one raw review into a list of lower-cased words.

    Args:
        review: Raw review text, which may contain HTML markup.
        remove_stopwords: When truthy, English stop words (from NLTK's
            ``stopwords`` corpus) are removed from the result.

    Returns:
        A list of lower-cased words made only of the letters a-z, in their
        original order.
    """
    # Strip the HTML markup.  'html' lets BeautifulSoup choose among the
    # installed HTML parsers, exactly as the original call did.
    raw_text = BeautifulSoup(review, 'html').get_text()
    # Replace every non-letter character with a space so it acts as a
    # word separator.
    letters = _NON_LETTER_RE.sub(' ', raw_text)
    words = letters.lower().split()
    # Drop stop words only when requested.
    if remove_stopwords:
        stop_words = _english_stop_words()
        words = [w for w in words if w not in stop_words]
    # Return the word list produced by the three steps above.
    return words
#调用函数处理数据