import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import re
import string
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings('ignore')
# Load the worksheet named 'Sheet1' from the review export.
review_df = pd.read_excel('amazon_review.xlsx', sheet_name='Sheet1', index_col=None)
review_df.head(3)

# Keep only the rating and the review text needed for the analysis.
# The explicit .copy() makes `data` independent of `review_df`, so the
# cleaning below never operates on a possible view of the original frame
# (which pandas reports as SettingWithCopyWarning -- silenced in this file by
# warnings.filterwarnings('ignore')).
data = review_df[['reviews.rating', 'reviews.text']].copy()
data.info()
data.isnull().sum()

# Drop every row in which the rating or the text is missing.
data = data.dropna()
import random

# Print a few randomly chosen rows (position, review text and rating) as a
# quick sanity check of the data.
samples = 4
for _ in range(samples):
    # `n` is a row *position* in [0, number of rows). It must be resolved
    # with .iloc: once rows have been dropped the index labels contain gaps,
    # so a label lookup such as data[col][n] can raise KeyError.
    n = random.randrange(data.shape[0])
    print(n)
    print('text:{}\n rating:{}\n'.format(
        data['reviews.text'].iloc[n],
        data['reviews.rating'].iloc[n],
    ))
data.describe()

# Keep only the rows whose rating is exactly one of 1, 2, 3, 4 or 5.
data2 = data[data['reviews.rating'].isin([1, 2, 3, 4, 5])]

# Distinct rating values and the number of rows for each of them.
data2['reviews.rating'].value_counts()

# The same counts ordered by rating value instead of by frequency.
# NOTE(review): the original line ended in a bare `.sort`, which is not a
# Series attribute in current pandas and was never called; sort_index() is the
# presumed intent (value_counts() is already ordered by count) -- confirm.
data2['reviews.rating'].value_counts().sort_index()
# Amazon_Review_数据处理与分析
# 最新推荐文章于 2023-02-08 15:30:37 发布
# python
# 摘要由CSDN通过智能技术生成