文章目录
商品信息可视化与文本处理结果可视化展示
启动 notebook
jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import nltk
import string
import re
import numpy as np
import pandas as pd
import pickle
#import lda
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white")
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import plotly.offline as py # plotly 画图和交互
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
#from bokeh.transform import factor_cmap
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger("lda").setLevel(logging.WARNING)
# Load the training and test tables; both files are tab-separated.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

# Report the (rows, columns) shape of each table.
# The original run printed (1482535, 8) for train and (693359, 7) for test.
for frame in (train, test):
    print(frame.shape)

# Column dtypes: a mix of object (string) and numeric columns.
train.dtypes
'''
train_id int64
name object
item_condition_id int64
category_name object
brand_name object
price float64
shipping int64
item_description object
dtype: object
'''
# Preview the first rows of the training table.
train.head()
train_id | name | item_condition_id | category_name | brand_name | price | shipping | item_description | |
---|---|---|---|---|---|---|---|---|
0 | 0 | MLB Cincinnati Reds T Shirt Size XL | 3 | Men/Tops/T-shirts | NaN | 10.0 | 1 | No description yet |
1 | 1 | Razer BlackWidow Chroma Keyboard | 3 | Electronics/Computers & Tablets/Components & P... | Razer | 52.0 | 0 | This keyboard is in great condition and works ... |
2 | 2 | AVA-VIV Blouse | 1 | Women/Tops & Blouses/Blouse | Target | 10.0 | 1 | Adorable top with a hint of lace and a key hol... |
3 | 3 | Leather Horse Statues | 1 | Home/Home Décor/Home Décor Accents | NaN | 35.0 | 1 | New with tags. Leather horses. Retail for [rm]... |
4 | 4 | 24K GOLD plated rose | 1 | Women/Jewelry/Necklaces | NaN | 44.0 | 0 | Complete with certificate of authenticity |
对我们将要提供的建议价格进行处理,使用log变换
# Summary statistics (count, mean, std, min, quartiles, max) of the price column.
train.price.describe()
count 1.482535e+06
mean 2.673752e+01
std 3.858607e+01
min 0.000000e+00
25% 1.000000e+01
50% 1.700000e+01
75% 2.900000e+01
max 2.009000e+03
Name: price, dtype: float64
价格属性转换前和转换后的分布情况对比
# Compare the raw price distribution with its log(price + 1) transform,
# side by side on a single figure whose size is set once.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

# Left panel: raw prices, with the histogram range restricted to [0, 250].
train['price'].plot.hist(bins=50, edgecolor='white', range=[0, 250], ax=ax_raw)
ax_raw.set_xlabel('price', fontsize=17)
ax_raw.set_ylabel('frequency', fontsize=17)
ax_raw.tick_params(labelsize=15)
ax_raw.set_title('Price Distribution - Training Set', fontsize=17)

# Right panel: log(price + 1); the +1 keeps zero prices finite.
np.log(train['price'] + 1).plot.hist(bins=50, edgecolor='white', ax=ax_log)
ax_log.set_xlabel('log(price+1)', fontsize=17)
ax_log.set_ylabel('frequency', fontsize=17)
ax_log.tick_params(labelsize=15)
ax_log.set_title('Log(Price) Distribution - Training Set', fontsize=17)

plt.show()
运费承担:约 55% 的商品 shipping=0(运费由买家承担),约 45% 的商品 shipping=1(运费由卖家承担)
# Fraction of training rows for each value of the shipping flag.
train.shipping.value_counts()/len(train)
0 0.552726
1 0.447274
Name: shipping, dtype: float64
运费不同情况的价格变化
# Split prices by who pays for shipping (shipping == 1: seller, 0: buyer).
prc_shipBySeller = train.loc[train.shipping==1, 'price']
prc_shipByBuyer = train.loc[train.shipping==0, 'price']

# Overlay the two log(price + 1) histograms on one axis; the second one is
# semi-transparent so the first stays visible underneath.
fig, ax = plt.subplots(figsize=(20,10))
ax.hist(np.log(prc_shipBySeller+1), color='#8CB4E1', alpha=1.0, bins=50,
        label='Price when Seller pays Shipping')
ax.hist(np.log(prc_shipByBuyer+1), color='#007D00', alpha=0.7, bins=50,
        label='Price when Buyer pays Shipping')

# Title and axis labels are set exactly once, on the axis itself.
ax.set_title('Price Distribution by Shipping Type', fontsize=17)
ax.set_xlabel('log(price+1)', fontsize=17)
ax.set_ylabel('frequency', fontsize=17)
ax.tick_params(labelsize=15)
ax.legend()
plt.show()
用户自己付费的平均价格要低于商家包邮的
商品类别
# Number of distinct raw category strings in the training set.
print("There are %d unique values in the category column." % train['category_name'].nunique())
There are 1287 unique values in the category column.
# The five most frequent raw category strings and their counts.
train['category_name'].value_counts()[:5]
Women/Athletic Apparel/Pants, Tights, Leggings 60177
Women/Tops & Blouses/T-Shirts 46380
Beauty/Makeup/Face 34335
Beauty/Makeup/Lips 29910
Electronics/Video Games & Consoles/Games 26557
Name: category_name, dtype: int64
# Number of items whose category_name is missing.
print("There are %d items that do not have a label." % train['category_name'].isnull().sum())
There are 6327 items that do not have a label.
类别细分
def split_cat(text):
    """Split a "/"-separated category path into its levels.

    Args:
        text: Category string such as ``"Men/Tops/T-shirts"``, or a
            non-string value (``None`` / float ``NaN``) for unlabelled rows.

    Returns:
        A list of the path levels, padded with ``"No Label"`` so that it
        always has at least three entries. For a non-string input, the tuple
        ``("No Label", "No Label", "No Label")``.
    """
    missing = "No Label"
    try:
        levels = text.split("/")
    except (AttributeError, TypeError):
        # Missing categories arrive as non-string values; only that case is
        # mapped to the placeholder, other errors are allowed to surface.
        return (missing, missing, missing)
    # Callers unpack three columns, so pad paths that are shallower than that.
    levels += [missing] * (3 - len(levels))
    return levels
# Expand category_name into three columns: main category and two sub-levels.
category_levels = train['category_name'].apply(split_cat)
general, sub1, sub2 = zip(*category_levels)
train['general_cat'] = general
train['subcat_1'] = sub1
train['subcat_2'] = sub2
train.head()
train_id | name | item_condition_id | category_name | brand_name | price | shipping | item_description | general_cat | subcat_1 | subcat_2 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | MLB Cincinnati Reds T Shirt Size XL | 3 | Men/Tops/T-shirts | NaN | 10.0 | 1 | No description yet | Men | Tops | T-shirts |
1 | 1 | Razer BlackWidow Chroma Keyboard | 3 | Electronics/Computers & Tablets/Components & P... | Razer | 52.0 | 0 | This keyboard is in great condition and works ... | Electronics | Computers & Tablets | Components & Parts |
2 | 2 | AVA-VIV Blouse | 1 | Women/Tops & Blouses/Blouse | Target | 10.0 | 1 | Adorable top with a hint of lace and a key hol... | Women | Tops & Blouses | Blouse |
3 | 3 | Leather Horse Statues | 1 | Home/Home Décor/Home Décor Accents | NaN | 35.0 | 1 | New with tags. Leather horses. Retail for [rm]... | Home | Home Décor | Home Décor Accents |
4 | 4 | 24K GOLD plated rose | 1 | Women/Jewelry/Necklaces | NaN | 44.0 | 0 | Complete with certificate of authenticity | Women | Jewelry | Necklaces |
# Apply the same category split to the test set.
test_levels = test['category_name'].apply(split_cat)
test['general_cat'], test['subcat_1'], test['subcat_2'] = zip(*test_levels)

# Number of distinct first-level sub-categories in the training set.
n_subcat_1 = train['subcat_1'].nunique()
print("There are %d unique first sub-categories." % n_subcat_1)
There are 114 unique first sub-categories.
# Number of distinct second-level sub-categories in the training set.
print("There are %d unique second sub-categories." % train['subcat_2'].nunique())
There are 871 unique second sub-categories.
总的来说,我们有7个主要类别(第一个子类别中的114个和第二个子类别中的871个):女性和美容项目是最受欢迎的两类(超过50%的观察),其次是儿童和电子产品。
各大主类别分布情况
# Item counts per main category (value_counts orders them by frequency).
main_counts = train['general_cat'].value_counts()
x = main_counts.index.values.astype('str')
y = main_counts.values
# Each category's share of the training set, formatted as a percentage label.
pct = [("%.2f" % (share * 100)) + "%" for share in (y / len(train))]

trace1 = go.Bar(x=x, y=y, text=pct)
layout = {
    'title': 'Number of Items by Main Category',
    'yaxis': {'title': 'Count'},
    'xaxis': {'title': 'Category'},
}
fig = {'data': [trace1], 'layout': layout}
py.iplot(fig)
subcat_1类别分布情况
# Counts for the 15 most frequent first-level sub-categories.
subcat_counts = train['subcat_1'].value_counts()
x = subcat_counts.index.values.astype('str')[:15]
y = subcat_counts.values[:15]
# Share of the whole training set for each of those sub-categories.
pct = [("%.2f" % (share * 100)) + "%" for share in (y / len(train))]

# Bars are colored by their own height on the 'Portland' scale.
bar_style = dict(color=y, colorscale='Portland', showscale=True,
                 reversescale=False)
trace1 = go.Bar(x=x, y=y, text=pct, marker=bar_style)
layout = {
    'title': 'Number of Items by Sub Category (Top 15)',
    'yaxis': {'title': 'Count'},
    'xaxis': {'title': 'SubCategory'},
}
fig = {'data': [trace1], 'layout': layout}
py.iplot(fig)
# Athletic apparel / makeup / women's blouses (translated note; presumably
# the leading sub-categories in the chart above - confirm).

# One box of log(price + 1) per main category.
general_cats = train['general_cat'].unique()
x = [train.loc[train['general_cat']==cat, 'price'] for cat in general_cats]
# The values are passed as `x`, so the price axis is the x axis and the
# category names are laid out along the y axis.
data = [go.Box(x=np.log(prices + 1), name=cat)
        for cat, prices in zip(general_cats, x)]
layout = dict(title="Price Distribution by General Category",
              yaxis=dict(title='Category'),
              xaxis=dict(title='log(price+1)'))
fig = dict(data=data, layout=layout)
py.iplot(fig)
品牌名字
# Number of distinct brand names in the training set.
print("There are %d unique brand names in the training dataset." % train['brand_name'].nunique())
There are 4809 unique brand names in the training dataset.
# Item counts for the 10 most frequent brands.
brand_counts = train['brand_name'].value_counts()
x = brand_counts.index.values.astype('str')[:10]
y = brand_counts.values[:10]

# Bars are colored by their own height on the 'Portland' scale.
trace1 = go.Bar(x=x, y=y,
                marker=dict(
                    color=y, colorscale='Portland', showscale=True,
                    reversescale=False
                ))
# Brand names are on the x axis and counts on the y axis, so the axis titles
# are assigned accordingly.
layout = dict(title='Top 10 Brand by Number of Items',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Brand Name'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
商品描述
由于商品描述是非结构化文本,解析这一字段更具挑战性。更详细、更长的描述是否意味着更高的价格?为了统计描述的有效词数,我们将去掉所有标点和数字,去掉英语停用词(即冗余词,如“a”、“the”等),并去掉长度不超过 3 的其他词:
def wordCount(text):
    """Count the informative words in a free-text description.

    The text is lower-cased, punctuation, digits and ``\\r``/``\\t``/``\\n``
    are replaced by spaces, and the result is split on single spaces. A word
    is counted when it is not in ``stop_words.ENGLISH_STOP_WORDS`` and is
    longer than three characters.

    Args:
        text: Description string; non-string values (e.g. a missing
            description) are accepted.

    Returns:
        The number of counted words, or 0 when ``text`` is not a string.
    """
    try:
        lowered = text.lower()
    except AttributeError:
        # Only the "not a string" case counts as an empty description; any
        # other failure should surface instead of silently becoming 0.
        return 0
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    cleaned = regex.sub(" ", lowered)
    return sum(
        1
        for w in cleaned.split(" ")
        if w not in stop_words.ENGLISH_STOP_WORDS and len(w) > 3
    )
# Store the description word count as a new column on both tables.
for frame in (train, test):
    frame['desc_len'] = frame['item_description'].apply(wordCount)
train.head()
train_id | name | item_condition_id | category_name | brand_name | price | shipping | item_description | general_cat | subcat_1 | subcat_2 | desc_len | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | MLB Cincinnati Reds T Shirt Size XL | 3 | Men/Tops/T-shirts | NaN | 10.0 | 1 | No description yet | Men | Tops | T-shirts | 1 |
1 | 1 | Razer BlackWidow Chroma Keyboard | 3 | Electronics/Computers & Tablets/Components & P... | Razer | 52.0 | 0 | This keyboard is in great condition and works ... | Electronics | Computers & Tablets | Components & Parts | 14 |
2 | 2 | AVA-VIV Blouse | 1 | Women/Tops & Blouses/Blouse | Target | 10.0 | 1 | Adorable top with a hint of lace and a key hol... | Women | Tops & Blouses | Blouse | 8 |
3 | 3 | Leather Horse Statues | 1 | Home/Home Décor/Home Décor Accents | NaN | 35.0 | 1 | New with tags. Leather horses. Retail for [rm]... | Home | Home Décor | Home Décor Accents | 14 |
4 | 4 | 24K GOLD plated rose | 1 | Women/Jewelry/Necklaces | NaN | 44.0 | 0 | Complete with certificate of authenticity | Women | Jewelry | Necklaces | 3 |
# Mean price for each description word count, as a two-column frame.
df = train.groupby('desc_len')['price'].mean().reset_index()
描述长短与价格有关吗
# Plot log(mean price + 1) against the description word count.
avg_log_price = np.log(df['price'] + 1)
trace1 = go.Scatter(x=df['desc_len'], y=avg_log_price,
                    mode='lines+markers', name='lines+markers')
layout = {
    'title': 'Average Log(Price) by Description Length',
    'yaxis': {'title': 'Average Log(Price)'},
    'xaxis': {'title': 'Description Length'},
}
fig = {'data': [trace1], 'layout': layout}
py.iplot(fig)
…
3.4671136297685132, 3.454598158389271, 3.4523128824675857, 3.740839193044647, 3.264486336120253, 3.3081069585961433, 2.909932283750658, 3.1179499062782403, 2.833213344056216, 2.515678308454754, 3.2217288938506075, 2.8526314299133175, 2.8716796248840124, 3.713572066704308, 2.7343675094195836, 3.056356895370426, 3.2121868367174042, 2.8791984572980396, 2.9444389791664403], “mode”: “lines+markers”, “name”: “lines+markers”}], {“title”: “Average Log(Price) by Description Length”, “yaxis”: {“title”: “Average Log(Price)”}, “xaxis”: {“title”: “Description Length”}}, {“showLink”: true, “linkText”: “Export to plot.ly”})});
# Count the rows with no description (the original run reported 4).
train.item_description.isnull().sum()
# Keep only the rows that do have a description.
has_description = train['item_description'].notnull()
train = train[has_description]
# Tokenize each category's concatenated descriptions with the NLTK punkt
# model. The model is bound to its own name so that it does not occupy the
# name `tokenize`, which this file also uses for a function.
# NOTE(review): the punkt model looks like a sentence splitter, so the items
# counted below may be sentences rather than single words - confirm.
punkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
cat_desc = dict()
for cat in general_cats:
    text = " ".join(train.loc[train['general_cat']==cat, 'item_description'].values)
    cat_desc[cat] = punkt_tokenizer.tokenize(text)

# Flatten the per-category lists and count every item.
flat_lst = [item for sublist in cat_desc.values() for item in sublist]
allWordsCount = Counter(flat_lst)
# Name kept for compatibility; it holds the 20 most common items.
all_top10 = allWordsCount.most_common(20)
x = [w[0] for w in all_top10]
y = [w[1] for w in all_top10]

# No `text=` labels here: the only `pct` in scope belongs to an earlier chart
# and does not describe these bars.
trace1 = go.Bar(x=x, y=y)
layout = dict(title='Word Frequency',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Word'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
文本数据处理
- 分词
- 去停用词
- 过滤筛选
# NLTK English stop words, held in a set for membership tests in tokenize().
stop = set(stopwords.words('english'))
def tokenize(text):
    """Turn a description into a list of lower-cased word tokens.

    Punctuation, digits and ``\\r``/``\\t``/``\\n`` are replaced by spaces,
    the text is segmented with ``sent_tokenize`` and ``word_tokenize``, and a
    token is dropped when its lower-cased form is in the module-level ``stop``
    set, when it contains no ASCII letter, or when it is shorter than three
    characters.

    Args:
        text: Description string.

    Returns:
        The remaining tokens, lower-cased. For a non-string input the value
        and the error are printed and an empty list is returned, so callers
        always receive a list.
    """
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    try:
        text = regex.sub(" ", text)  # remove punctuation
        tokens = [tok
                  for sentence in sent_tokenize(text)
                  for tok in word_tokenize(sentence)]
    except TypeError as e:
        print(text, e)
        return []
    return [
        tok.lower()
        for tok in tokens
        if tok.lower() not in stop
        and re.search('[a-zA-Z]', tok)
        and len(tok) >= 3
    ]
# Tokenize every item description, then renumber each table's rows from 0.
for frame in (train, test):
    frame['tokens'] = frame['item_description'].map(tokenize)
    frame.reset_index(drop=True, inplace=True)
结果打印
# Show each of the first rows' descriptions next to its token list.
preview = train[['item_description', 'tokens']].head()
for description, tokens in preview.itertuples(index=False):
    print('description:', description)
    print('tokens:', tokens)
    print()
description: No description yet
tokens: ['description', 'yet']
description: This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.
tokens: ['keyboard', 'great', 'condition', 'works', 'like', 'came', 'box', 'ports', 'tested', 'work', 'perfectly', 'lights', 'customizable', 'via', 'razer', 'synapse', 'app']
description: Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!
tokens: ['adorable', 'top', 'hint', 'lace', 'key', 'hole', 'back', 'pale', 'pink', 'also', 'available', 'white']
description: New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage
tokens: ['new', 'tags', 'leather', 'horses', 'retail', 'stand', 'foot', 'high', 'sold', 'pair', 'questions', 'please', 'ask', 'free', 'shipping', 'got', 'storage']
description: Complete with certificate of authenticity
tokens: ['complete', 'certificate', 'authenticity']
词云展示
# Map each main category to the tokens of all of its descriptions combined.
cat_desc = {}
for cat in general_cats:
    in_category = train['general_cat'] == cat
    text = " ".join(train.loc[in_category, 'item_description'].values)
    cat_desc[cat] = tokenize(text)

# The 100 most common tokens for four of the categories.
women100 = Counter(cat_desc['Women']).most_common(100)
beauty100 = Counter(cat_desc['Beauty']).most_common(100)
kids100 = Counter(cat_desc['Kids']).most_common(100)
electronics100 = Counter(cat_desc['Electronics']).most_common(100)
def generate_wordcloud(tup):
    """Build a word cloud from ``(word, count)`` pairs.

    Args:
        tup: Iterable of ``(word, count)`` pairs, e.g. the result of
            ``Counter.most_common``.

    Returns:
        The generated ``WordCloud`` (white background, at most 50 words,
        maximum font size 40, fixed random state).
    """
    cloud = WordCloud(background_color='white',
                      max_words=50, max_font_size=40,
                      random_state=42)
    # Feed the counts directly so that word size reflects frequency; rendering
    # the text of ``str(tup)`` would throw the counts away.
    return cloud.generate_from_frequencies(dict(tup))
# Draw the four word clouds on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(30, 15))
panels = [
    (women100, "Women Top 100"),
    (beauty100, "Beauty Top 100"),
    (kids100, "Kids Top 100"),
    (electronics100, "Electronic Top 100"),
]
# axes.flat visits the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
# Every panel uses the same interpolation so the four images are comparable.
for ax, (top_words, title) in zip(axes.flat, panels):
    ax.imshow(generate_wordcloud(top_words), interpolation="bilinear")
    ax.axis('off')
    ax.set_title(title, fontsize=30)
Text(0.5,1,‘Electronic Top 100’)
tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, using the custom tokenizer; terms must
# appear in at least 10 documents and the vocabulary is capped at 180000.
tfidf_params = dict(min_df=10,
                    max_features=180000,
                    tokenizer=tokenize,
                    ngram_range=(1, 2))
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the train and test descriptions together.
all_desc = np.concatenate([train['item_description'].values,
                           test['item_description'].values])
vz = vectorizer.fit_transform(list(all_desc))
vz.shape
(2175890, 180000)
# Map each vocabulary term to the IDF weight learned by the vectorizer and
# expose the mapping as a one-column frame indexed by term.
idf_by_term = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame.from_dict(idf_by_term, orient='index')
tfidf.columns = ['tfidf']
以下是 tfidf 得分(这里取的是向量器学到的 idf 权重)最低的 10 个标记。这并不令人惊讶:它们都是非常通用的词,无法用来区分一条描述与另一条描述。
# The 10 terms with the smallest weights.
tfidf.sort_values(by=['tfidf'], ascending=True).head(10)
tfidf | |
---|---|
new | 2.175653 |
size | 2.330674 |
brand | 2.755660 |
condition | 2.799306 |
brand new | 2.874418 |
free | 2.903426 |
shipping | 3.070592 |
worn | 3.107882 |
used | 3.165310 |
never | 3.276901 |
以下是tfidf分数最高的10个标记,其中包含很多具体的词,通过查看它们,我们可以猜出它们所属的分类:
# The 10 terms with the largest weights.
tfidf.sort_values(by=['tfidf'], ascending=False).head(10)
tfidf | |
---|---|
postnatal | 13.195054 |
subdrip rda | 13.195054 |
lmt | 13.195054 |
lbs length | 13.195054 |
place step | 13.195054 |
light volts | 13.195054 |
thumb point | 13.195054 |
wedgwood | 13.195054 |
novelty bill | 13.195054 |
colour brow | 13.195054 |
鉴于 tfidf 矩阵的维度很高,我们需要先用奇异值分解(SVD)降低它的维数(下面的代码降到 30 维)。为了把词汇可视化,接下来再用 t-SNE 将维度从 30 降到 2;t-SNE 更适合把数据降到 2 维或 3 维。
from sklearn.base import clone

# Tag each row with its origin before stacking the two tables.
trn = train.copy()
tst = test.copy()
trn['is_train'] = 1
tst['is_train'] = 0

sample_sz = 15000
combined_df = pd.concat([trn, tst])
# Seeded so that the sample, and every plot built from it, is reproducible.
combined_sample = combined_df.sample(n=sample_sz, random_state=42)

# Fit an unfitted copy (same parameters) on the sample, so that `vectorizer`
# keeps the vocabulary it learned when `vz` was built.
sample_vectorizer = clone(vectorizer)
vz_sample = sample_vectorizer.fit_transform(list(combined_sample['item_description']))
# Step 1: reduce the sampled TF-IDF matrix to n_comp dimensions with
# truncated SVD.
from sklearn.decomposition import TruncatedSVD
n_comp=30
svd = TruncatedSVD(n_components=n_comp, random_state=42)
svd_tfidf = svd.fit_transform(vz_sample)
# Step 2: embed the SVD output in 2-D with t-SNE. The same tsne_model object
# is reused by later cells.
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=42, n_iter=500)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15000 samples in 0.021s...
[t-SNE] Computed neighbors for 15000 samples in 7.434s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15000
[t-SNE] Computed conditional probabilities for sample 2000 / 15000
…
[t-SNE] Computed conditional probabilities for sample 14000 / 15000
[t-SNE] Computed conditional probabilities for sample 15000 / 15000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 89.433113
[t-SNE] Error after 500 iterations: 1.920846
# Render bokeh output inline in the notebook.
output_notebook()
# Interactive figure without axes for the 2-D embedding.
plot_tfidf = bp.figure(plot_width=700, plot_height=600,
title="tf-idf clustering of the item description",
tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
x_axis_type=None, y_axis_type=None, min_border=1)
# Renumber the sample from 0 so its columns line up row-for-row with the
# freshly built embedding frame.
combined_sample.reset_index(inplace=True, drop=True)
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['description'] = combined_sample['item_description']
tfidf_df['tokens'] = combined_sample['tokens']
tfidf_df['category'] = combined_sample['general_cat']
plot_tfidf.scatter(x='x', y='y', source=tfidf_df, alpha=0.7)
# Show description, tokens and category when hovering over a point.
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"description": "@description", "tokens": "@tokens", "category":"@category"}
show(plot_tfidf)
K-Means聚类
from sklearn.cluster import MiniBatchKMeans
num_clusters = 10 # need to be selected wisely
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters,
init='k-means++',
n_init=1,
init_size=1000, batch_size=1000, verbose=0, max_iter=1000)
# First fit: cluster the full TF-IDF matrix. fit() is called on kmeans_model
# itself, so the later fit below replaces this one on the same object.
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
# Feature indices of each centroid, ordered from largest to smallest weight.
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): `terms` is read from `vectorizer` as it was last fitted;
# confirm that this is the fit that produced `vz`, otherwise the indices in
# sorted_centroids will not line up with these terms.
terms = vectorizer.get_feature_names()
# repeat the same steps for the sample
# The full-data kmeans_clusters / kmeans_distances are overwritten here.
kmeans = kmeans_model.fit(vz_sample)
kmeans_clusters = kmeans.predict(vz_sample)
kmeans_distances = kmeans.transform(vz_sample)
# reduce dimension to 2 using tsne
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15000 samples in 0.013s...
[t-SNE] Computed neighbors for 15000 samples in 1.513s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15000
[t-SNE] Computed conditional probabilities for sample 2000 / 15000
[t-SNE] Computed conditional probabilities for sample 3000 / 15000
[t-SNE] Computed conditional probabilities for sample 4000 / 15000
[t-SNE] Computed conditional probabilities for sample 5000 / 15000
[t-SNE] Computed conditional probabilities for sample 6000 / 15000
[t-SNE] Computed conditional probabilities for sample 7000 / 15000
[t-SNE] Computed conditional probabilities for sample 8000 / 15000
[t-SNE] Computed conditional probabilities for sample 9000 / 15000
[t-SNE] Computed conditional probabilities for sample 10000 / 15000
[t-SNE] Computed conditional probabilities for sample 11000 / 15000
[t-SNE] Computed conditional probabilities for sample 12000 / 15000
[t-SNE] Computed conditional probabilities for sample 13000 / 15000
[t-SNE] Computed conditional probabilities for sample 14000 / 15000
[t-SNE] Computed conditional probabilities for sample 15000 / 15000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 81.781433
[t-SNE] Error after 500 iterations: 1.791430
# 2-D embedding of the sample plus its cluster label, description and category.
kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y']).assign(
    cluster=kmeans_clusters,
    description=combined_sample['item_description'],
    category=combined_sample['general_cat'],
)

# Interactive figure without axes for the cluster scatter.
kmeans_fig_options = dict(
    plot_width=700, plot_height=600,
    title="KMeans clustering of the description",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1,
)
plot_kmeans = bp.figure(**kmeans_fig_options)

kmeans_clusters  # the original run showed array([8, 4, 6, ..., 5, 4, 5])

# Color name per cluster label, keyed by the label as a string.
palette = ['red', 'green', 'blue', 'black', 'yellow',
           'pink', 'purple', 'grey', 'brown', 'orange']
colormap = {str(label): name for label, name in enumerate(palette)}
# Color name for each label 0-9.
_CLUSTER_COLORS = {
    0: 'red',
    1: 'green',
    2: 'blue',
    3: 'black',
    4: 'yellow',
    5: 'pink',
    6: 'purple',
    7: 'grey',
    8: 'brown',
    9: 'orange',
}


def get_color(num):
    """Return the plot color name for a label number.

    Args:
        num: Label number.

    Returns:
        The color name for labels 0-9, or ``None`` for any other value.
    """
    return _CLUSTER_COLORS.get(num)
# One color name per sampled document, derived from its cluster label.
color = pd.Series(kmeans_clusters).apply(get_color)

# Everything the scatter and the hover tool need, in one data source.
source = ColumnDataSource(data=dict(x=kmeans_df['x'], y=kmeans_df['y'],
                                    color=color,
                                    description=kmeans_df['description'],
                                    category=kmeans_df['category'],
                                    cluster=kmeans_df['cluster']))
plot_kmeans.scatter(x='x', y='y', color='color', source=source)

# Show description, category and cluster when hovering over a point.
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"description": "@description", "category": "@category", "cluster":"@cluster" }
show(plot_kmeans)
LDA模型
LDA 的输入必须是词袋(bag of words)形式的词频矩阵
# Bag-of-words counts (unigrams and bigrams) for the sampled descriptions.
count_params = dict(min_df=4,
                    max_features=180000,
                    tokenizer=tokenize,
                    ngram_range=(1, 2))
cvectorizer = CountVectorizer(**count_params)
cvz = cvectorizer.fit_transform(combined_sample['item_description'])

# Fit a 10-topic LDA model; X_topics holds one row of topic weights per
# sampled document.
lda_model = LatentDirichletAllocation(n_components=10,
                                      learning_method='online',
                                      max_iter=20,
                                      random_state=42)
X_topics = lda_model.fit_transform(cvz)

# Print the n_top_words highest-weighted terms of every topic.
n_top_words = 10
topic_summaries = []
topic_word = lda_model.components_ # get the topic words
vocab = cvectorizer.get_feature_names()
vocab_arr = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    strongest_first = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_arr[strongest_first]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' | '.join(topic_words)))
Topic 0: green | silver | pink | blue | one | white | matte | black | purple | set
Topic 1: inches | makeup | made | dunn | care | rae | rae dunn | bag | long | packaging
Topic 2: case | iphone | plus | size | picture | quality | know | high | note | let
Topic 3: like | new | price | size | bundle | like new | firm | dress | long | price firm
Topic 4: condition | great | size | good | used | worn | free | great condition | home | good condition
Topic 5: new | brand | brand new | used | never | never used | box | color | new never | authentic
Topic 6: description | yet | description yet | gold | charger | apple | comes | bracelet | disney | included
Topic 7: please | shipping | free | bundle | items | price | item | ask | new | ship
Topic 8: size | new | tags | brand | worn | black | small | brand new | pink | medium
Topic 9: shipping | free | free shipping | great | fast | price | game | clean | scratches | includes
# Embed the per-document topic weights in 2-D, reusing the t-SNE model
# configured earlier.
tsne_lda = tsne_model.fit_transform(X_topics)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15000 samples in 0.013s...
[t-SNE] Computed neighbors for 15000 samples in 1.870s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15000
[t-SNE] Computed conditional probabilities for sample 2000 / 15000
…
[t-SNE] Computed conditional probabilities for sample 11000 / 15000
[t-SNE] Computed conditional probabilities for sample 12000 / 15000
[t-SNE] Computed conditional probabilities for sample 13000 / 15000
[t-SNE] Computed conditional probabilities for sample 14000 / 15000
[t-SNE] Computed conditional probabilities for sample 15000 / 15000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 86.734573
[t-SNE] Error after 500 iterations: 2.042183
# Normalise each document's topic weights so that every row sums to 1.
# A plain ndarray is used; keepdims keeps the row sums as a column so the
# division broadcasts row by row.
unnormalized = np.asarray(X_topics)
doc_topic = unnormalized / unnormalized.sum(axis=1, keepdims=True)

# Index of the highest-weighted topic for every document, as Python ints.
lda_keys = doc_topic.argmax(axis=1).tolist()
# 2-D embedding of the sample plus its description, category and main topic.
lda_df = pd.DataFrame(tsne_lda, columns=['x','y']).assign(
    description=combined_sample['item_description'],
    category=combined_sample['general_cat'],
    topic=lda_keys,
)
lda_df['topic'] = lda_df['topic'].map(int)

# Interactive figure without axes for the topic scatter.
lda_fig_options = dict(
    plot_width=700,
    plot_height=600,
    title="LDA topic visualization",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1,
)
plot_lda = bp.figure(**lda_fig_options)

# Color name per topic number, keyed by the number as a string.
palette = ['red', 'green', 'blue', 'black', 'yellow',
           'pink', 'purple', 'grey', 'brown', 'orange']
colormap = {str(label): name for label, name in enumerate(palette)}
# Color name for each label 0-9.
_TOPIC_COLORS = {
    0: 'red',
    1: 'green',
    2: 'blue',
    3: 'black',
    4: 'yellow',
    5: 'pink',
    6: 'purple',
    7: 'grey',
    8: 'brown',
    9: 'orange',
}


def get_color(num):
    """Return the plot color name for a label number.

    Args:
        num: Label number.

    Returns:
        The color name for labels 0-9, or ``None`` for any other value.
    """
    return _TOPIC_COLORS.get(num)
# One color name per sampled document, derived from its dominant topic.
color = pd.Series(lda_keys).apply(get_color)

# Everything the scatter and the hover tool need, in one data source.
source = ColumnDataSource(data=dict(x=lda_df['x'], y=lda_df['y'],
                                    color=color,
                                    description=lda_df['description'],
                                    topic=lda_df['topic'],
                                    category=lda_df['category']))
plot_lda.scatter(source=source, x='x', y='y', color='color')

# Configure the hover tool of this figure only.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips={"description":"@description",
                "topic":"@topic", "category":"@category"}
show(plot_lda)