3.商品可视化展示与文本处理
3.1.依赖包安装
准备工作,先安装scikit-learn,scipy,numpy等等
(base) C:\Users\toto>pip install scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: scikit-learn in d:\installed\anaconda\lib\site-packages (0.23.2)
Requirement already satisfied: joblib>=0.11 in d:\installed\anaconda\lib\site-packages (from scikit-learn) (0.17.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\installed\anaconda\lib\site-packages (from scikit-learn) (2.1.0)
Requirement already satisfied: scipy>=0.19.1 in d:\installed\anaconda\lib\site-packages (from scikit-learn) (1.5.2)
Requirement already satisfied: numpy>=1.13.3 in d:\installed\anaconda\lib\site-packages (from scikit-learn) (1.19.2)
(base) C:\Users\toto>pip install scipy -i https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: scipy in d:\installed\anaconda\lib\site-packages (1.5.2)
Requirement already satisfied: numpy>=1.14.5 in d:\installed\anaconda\lib\site-packages (from scipy) (1.19.2)
(base) C:\Users\toto>pip install numpy -i https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: numpy in d:\installed\anaconda\lib\site-packages (1.19.2)
(base) C:\Users\toto> 等等.....
3.2.数据准备
train.tsv
test.tsv
3.3.代码实现
# Launch with: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
# (raises the notebook's IOPub data-rate limit; presumably needed for the
# large interactive plot payloads produced later -- confirm.)
# Imports: text processing, data handling, plotting and topic modelling.
import nltk
import string
import re
import numpy as np
import pandas as pd
import pickle
#import lda
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's plain white theme for every matplotlib figure below.
sns.set(style="white")
# NOTE(review): wildcard import -- presumably pulled in for PorterStemmer;
# confirm against later usage before narrowing it.
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
# NOTE(review): the public module sklearn.feature_extraction.stop_words was
# deprecated in scikit-learn 0.22 and is no longer importable from 0.24 on.
# It works with the 0.23.2 install shown in the pip output above; newer
# versions expose the list as sklearn.feature_extraction.text.ENGLISH_STOP_WORDS.
from sklearn.feature_extraction import stop_words
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Install plotly if missing:
# pip install plotly --default-timeout=1000 -i https://pypi.tuna.tsinghua.edu.cn/simple
import plotly.offline as py
# Enable inline plotly rendering in the notebook; connected=True loads
# plotly.js from the CDN instead of embedding it in the notebook.
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
#from bokeh.transform import factor_cmap
import warnings
# Silence every Python warning for the rest of the session (this also hides
# deprecation warnings such as the scikit-learn one noted above).
warnings.filterwarnings('ignore')
import logging
# Only let WARNING and above through from the "lda" logger.
logging.getLogger("lda").setLevel(logging.WARNING)
# Read the tab-separated training and test sets from the working directory.
train = pd.read_csv(filepath_or_buffer='train.tsv', sep='\t')
test = pd.read_csv(filepath_or_buffer='test.tsv', sep='\t')
# Report the (rows, columns) shape of each frame, training set first.
print(train.shape, test.shape, sep='\n')
'''
输出结果:
(1482535, 8)
(693359, 7)
'''
# Column dtypes of the training frame: a mix of object (string) columns and
# int64/float64 numeric columns, as captured in the output below.
print('--------------------------------', train.dtypes, sep='\n')
'''
train_id int64
name object
item_condition_id int64
category_name object
brand_name object
price float64
shipping int64
item_description object
dtype: object
'''
# Preview the first rows of the training frame, fenced by separator lines.
print(
    '--------------------------------',
    train.head(),
    '--------------------------------',
    sep='\n',
)
# Summary statistics of the price column (the value to be suggested).
# The distribution is strongly right-skewed (median 17, max 2009 in the
# captured output), which is why a log transform is applied to it below.
print(train['price'].describe())
'''
输出结果:
count 1.482535e+06
mean 2.673752e+01
std 3.858607e+01
min 0.000000e+00
25% 1.000000e+01
50% 1.700000e+01
75% 2.900000e+01
max 2.009000e+03
Name: price, dtype: float64
'''
# Compare the price distribution before and after the log transform.
# Create one dedicated figure at the target size up front, instead of
# passing figsize to each pandas plot call (which resized the shared figure
# twice and silently reused whatever figure happened to be current).
plt.figure(figsize=(20, 10))

# Left panel: raw prices. The histogram range is limited to [0, 250] so the
# bulk of the distribution is visible despite the long tail of high prices.
plt.subplot(1, 2, 1)
train['price'].plot.hist(bins=50, edgecolor='white', range=[0, 250])
plt.xlabel('price', fontsize=17)  # was 'price+': stray '+' in the label
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Price Distribution - Training Set', fontsize=17)

# Right panel: log(price + 1). The +1 keeps zero-priced rows finite.
plt.subplot(1, 2, 2)
np.log(train['price'] + 1).plot.hist(bins=50, edgecolor='white')
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Log(Price) Distribution - Training Set', fontsize=17)

plt.show()
# 运费承担:shipping=0 的商品约占 55%,shipping=1 的约占 45%(按 Mercari 数据集的字段说明,shipping=1 表示运费由卖家承担,0 表示由买家承担,即约 45% 的商品由卖家承担运费)。
# Fraction of training rows carrying each value of the `shipping` flag:
# per-value counts divided by the total number of rows.
print(train['shipping'].value_counts().div(len(train)))
'''
输出结果:
0 0.552726
1 0.447274
Name: shipping, dtype: float64
'''
# 看一下运费不同情况的价格变化
prc_shipBySeller = train.loc[train