Visualizing Text Processing Results

## Visualizing Product Information and Text Processing Results
# package imports
# launch with: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import nltk
import string
import re
import numpy as np
import pandas as pd
import pickle
# import lda

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="white")

from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import plotly.offline as py

py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls


import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
# from bokeh.transform import factor_cmap

import warnings

warnings.filterwarnings('ignore')
import logging

logging.getLogger("lda").setLevel(logging.WARNING)
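# The NLTK tokenizers and stopword lists used below are downloaded corpora;
# if they are missing in your environment, fetch them once with:
# nltk.download('punkt'); nltk.download('stopwords')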
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')
# sizes of the training and test sets
print(train.shape)
print(test.shape)
# different data types in the dataset: categorical (strings) and numeric
train.dtypes
train.head()
# the target we will predict is price; we model it with a log transform
train.price.describe()
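# np.log(price + 1) is the same as np.log1p(price); the +1 keeps zero-priced
# items finite. An added quick check (not in the original) of how strongly
# the transform reduces the right skew:
print('skew before: %.2f, after: %.2f'
      % (train['price'].skew(), np.log1p(train['price']).skew()))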
# price distribution before vs. after the transform
plt.subplot(1, 2, 1)
(train['price']).plot.hist(bins=50, figsize=(20, 10), edgecolor='white', range=[0, 250])
plt.xlabel('price', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Price Distribution - Training Set', fontsize=17)

plt.subplot(1, 2, 2)
np.log(train['price'] + 1).plot.hist(bins=50, figsize=(20, 10), edgecolor='white')
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Log(Price) Distribution - Training Set', fontsize=17)
plt.show()
# shipping: roughly 55% of items have the shipping fee paid by the seller
train.shipping.value_counts() / len(train)
# how prices differ depending on who pays shipping
prc_shipBySeller = train.loc[train.shipping == 1, 'price']
prc_shipByBuyer = train.loc[train.shipping == 0, 'price']
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(np.log(prc_shipBySeller + 1), color='#8CB4E1', alpha=1.0, bins=50,
        label='Price when Seller pays Shipping')
ax.hist(np.log(prc_shipByBuyer + 1), color='#007D00', alpha=0.7, bins=50,
        label='Price when Buyer pays Shipping')
plt.legend()
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.title('Price Distribution by Shipping Type', fontsize=17)
plt.tick_params(labelsize=15)
plt.show()

# interestingly, the average price appears lower when the seller pays the shipping...
# item categories
print("There are %d unique values in the category column." % train['category_name'].nunique())
# TOP 5 RAW CATEGORIES
train['category_name'].value_counts()[:5]
# missing categories
print("There are %d items that do not have a label." % train['category_name'].isnull().sum())
# split the category into its three levels


def split_cat(text):
    # category_name is of the form "general/subcat_1/subcat_2"
    try:
        return text.split("/")
    except AttributeError:  # missing (NaN) category
        return ("No Label", "No Label", "No Label")


train['general_cat'], train['subcat_1'], train['subcat_2'] = \
    zip(*train['category_name'].apply(lambda x: split_cat(x)))
train.head()
# repeat the same step for the test set
test['general_cat'], test['subcat_1'], test['subcat_2'] = \
    zip(*test['category_name'].apply(lambda x: split_cat(x)))
print("There are %d unique first sub-categories." % train['subcat_1'].nunique())
print("There are %d unique second sub-categories." % train['subcat_2'].nunique())
# Overall we have 7 main categories (114 first-level and 871 second-level
# sub-categories): women's and beauty items are the two most popular (over
# 50% of observations), followed by kids and electronics.
#
# distribution of the main categories:
x = train['general_cat'].value_counts().index.values.astype('str')
y = train['general_cat'].value_counts().values
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))]

trace1 = go.Bar(x=x, y=y, text=pct)
layout = dict(title='Number of Items by Main Category',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Category'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
# distribution of subcat_1 (top 15)
x = train['subcat_1'].value_counts().index.values.astype('str')[:15]
y = train['subcat_1'].value_counts().values[:15]
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))][:15]
trace1 = go.Bar(x=x, y=y, text=pct,
                marker=dict(
                    color=y, colorscale='Portland', showscale=True,
                    reversescale=False
                ))
layout = dict(title='Number of Items by Sub Category (Top 15)',
              yaxis=dict(title='Count'),
              xaxis=dict(title='SubCategory'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
# athletic apparel, makeup, and women's blouses lead the sub-categories
# price distribution per main category:
general_cats = train['general_cat'].unique()
x = [train.loc[train['general_cat'] == cat, 'price'] for cat in general_cats]
data = [go.Box(x=np.log(x[i] + 1), name=general_cats[i]) for i in range(len(general_cats))]
layout = dict(title="Price Distribution by General Category",
              yaxis=dict(title='Frequency'),
              xaxis=dict(title='Category'))
fig = dict(data=data, layout=layout)
py.iplot(fig)
# brand names
print("There are %d unique brand names in the training dataset." % train['brand_name'].nunique())
x = train['brand_name'].value_counts().index.values.astype('str')[:10]
y = train['brand_name'].value_counts().values[:10]
trace1 = go.Bar(x=x, y=y,
                marker=dict(
                    color=y, colorscale='Portland', showscale=True,
                    reversescale=False
                ))
layout = dict(title='Top 10 Brands by Number of Items',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Brand Name'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
# item description
# Since the description is unstructured data, parsing it is more challenging.
# Does a longer, more detailed description lead to a higher price? We remove
# all punctuation, common English stop words (redundant words such as "a",
# "the", etc.), and any word of three characters or fewer:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


def wordCount(text):
    try:
        # convert to lower case and strip punctuation, digits and whitespace
        text = text.lower()
        regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
        txt = regex.sub(" ", text)
        # keep only non-stop-words longer than three characters
        words = [w for w in txt.split(" ")
                 if w not in ENGLISH_STOP_WORDS and len(w) > 3]
        return len(words)
    except AttributeError:
        # non-string (e.g. NaN) descriptions count as zero words
        return 0


# add a column of word counts to both the training and test set
train['desc_len'] = train['item_description'].apply(lambda x: wordCount(x))
test['desc_len'] = test['item_description'].apply(lambda x: wordCount(x))
train.head()
df = train.groupby('desc_len')['price'].mean().reset_index()
# is the description length related to price?
trace1 = go.Scatter(
    x=df['desc_len'],
    y=np.log(df['price'] + 1),
    mode='lines+markers',
    name='lines+markers'
)
layout = dict(title='Average Log(Price) by Description Length',
              yaxis=dict(title='Average Log(Price)'),
              xaxis=dict(title='Description Length'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
train.item_description.isnull().sum()
# remove missing values in item description
train = train[pd.notnull(train['item_description'])]
# create a dictionary of description sentences for each category
# (the Punkt model splits text into sentences; renamed here so it is not
# shadowed by the word-level tokenize() function defined later)

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

cat_desc = dict()
for cat in general_cats:
    text = " ".join(train.loc[train['general_cat'] == cat, 'item_description'].values)
    cat_desc[cat] = sent_tokenizer.tokenize(text)

# flat list of all sentence tokens combined
flat_lst = [item for sublist in list(cat_desc.values()) for item in sublist]
allWordsCount = Counter(flat_lst)
all_top20 = allWordsCount.most_common(20)
x = [w[0] for w in all_top20]
y = [w[1] for w in all_top20]
trace1 = go.Bar(x=x, y=y)
layout = dict(title='Word Frequency',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Word'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
# Text preprocessing:
#
# - tokenization
# - stop-word removal
# - filtering
stop = set(stopwords.words('english'))


def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try:
        regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
        text = regex.sub(" ", text)  # remove punctuation

        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search('[a-zA-Z]', w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]

        return filtered_tokens

    except TypeError as e:
        print(text, e)


# apply the tokenizer to the item_description column
train['tokens'] = train['item_description'].map(tokenize)
test['tokens'] = test['item_description'].map(tokenize)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
# print a few tokenized examples
for description, tokens in zip(train['item_description'].head(),
                               train['tokens'].head()):
    print('description:', description)
    print('tokens:', tokens)
    print()
# word clouds
# build a dictionary with key=category and values=all related descriptions
cat_desc = dict()
for cat in general_cats:
    text = " ".join(train.loc[train['general_cat'] == cat, 'item_description'].values)
    cat_desc[cat] = tokenize(text)

# find the most common words for the top 4 categories
women100 = Counter(cat_desc['Women']).most_common(100)
beauty100 = Counter(cat_desc['Beauty']).most_common(100)
kids100 = Counter(cat_desc['Kids']).most_common(100)
electronics100 = Counter(cat_desc['Electronics']).most_common(100)


def generate_wordcloud(tup):
    # tup is a list of (word, count) pairs; use the counts as frequencies
    wordcloud = WordCloud(background_color='white',
                          max_words=50, max_font_size=40,
                          random_state=42
                          ).generate_from_frequencies(dict(tup))
    return wordcloud


fig, axes = plt.subplots(2, 2, figsize=(30, 15))

ax = axes[0, 0]
ax.imshow(generate_wordcloud(women100), interpolation="bilinear")
ax.axis('off')
ax.set_title("Women Top 100", fontsize=30)

ax = axes[0, 1]
ax.imshow(generate_wordcloud(beauty100))
ax.axis('off')
ax.set_title("Beauty Top 100", fontsize=30)

ax = axes[1, 0]
ax.imshow(generate_wordcloud(kids100))
ax.axis('off')
ax.set_title("Kids Top 100", fontsize=30)

ax = axes[1, 1]
ax.imshow(generate_wordcloud(electronics100))
ax.axis('off')
ax.set_title("Electronic Top 100", fontsize=30)
tf - idf
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=10,
                             max_features=180000,
                             tokenizer=tokenize,
                             ngram_range=(1, 2))
all_desc = np.append(train['item_description'].values, test['item_description'].values)
vz = vectorizer.fit_transform(list(all_desc))
vz.shape
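# An added illustrative check (not in the original): the nonzero tf-idf
# weights of the first description, to make the sparse rows concrete.
row = vz[0].tocoo()
for idx, weight in zip(row.col, row.data):
    print(vectorizer.get_feature_names()[idx], round(float(weight), 3))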
# create a dictionary mapping each token to its idf weight
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame.from_dict(tfidf, orient='index', columns=['tfidf'])
# Below are the 10 tokens with the lowest tfidf scores: unsurprisingly, very
# generic words that cannot distinguish one description from another.
tfidf.sort_values(by=['tfidf'], ascending=True).head(10)
# Below are the 10 tokens with the highest tfidf scores; these are much more
# specific words, and just by looking at them we can guess the categories they belong to:
tfidf.sort_values(by=['tfidf'], ascending=False).head(10)
# Given the high dimensionality of the tfidf matrix, we first reduce it with
# truncated singular value decomposition (SVD). To visualize the vocabulary we
# then use t-SNE to bring the 30 SVD components down to 2; t-SNE works best
# when reducing to 2 or 3 dimensions.
trn = train.copy()
tst = test.copy()
trn['is_train'] = 1
tst['is_train'] = 0

sample_sz = 15000

combined_df = pd.concat([trn, tst])
combined_sample = combined_df.sample(n=sample_sz)
vz_sample = vectorizer.fit_transform(list(combined_sample['item_description']))
from sklearn.decomposition import TruncatedSVD

n_comp = 30
svd = TruncatedSVD(n_components=n_comp, random_state=42)
svd_tfidf = svd.fit_transform(vz_sample)
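# An added sanity check (not in the original): how much tf-idf variance the
# 30 SVD components retain.
print('explained variance: %.1f%%' % (100 * svd.explained_variance_ratio_.sum()))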
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=42, n_iter=500)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600,
                       title="tf-idf clustering of the item description",
                       tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                       x_axis_type=None, y_axis_type=None, min_border=1)

combined_sample.reset_index(inplace=True, drop=True)
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['description'] = combined_sample['item_description']
tfidf_df['tokens'] = combined_sample['tokens']
tfidf_df['category'] = combined_sample['general_cat']
plot_tfidf.scatter(x='x', y='y', source=tfidf_df, alpha=0.7)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips = {"description": "@description", "tokens": "@tokens", "category": "@category"}
show(plot_tfidf)
# K-Means clustering
from sklearn.cluster import MiniBatchKMeans

num_clusters = 10  # need to be selected wisely
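# "Selected wisely": one common heuristic is the elbow method. A minimal
# added sketch (not part of the original analysis) that plots the k-means
# inertia for a few candidate k values on the smaller sample matrix:
inertias = []
ks = list(range(2, 16, 2))
for k in ks:
    mbk = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=1,
                          init_size=1000, batch_size=1000, random_state=42)
    inertias.append(mbk.fit(vz_sample).inertia_)
plt.plot(ks, inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.title('Elbow check for the number of clusters')
plt.show()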
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters,
                               init='k-means++',
                               n_init=1,
                               init_size=1000, batch_size=1000, verbose=0, max_iter=1000)
# fit k-means on the sample matrix (the tf-idf vectorizer was refit on the
# sample above, so vz_sample matches vectorizer.get_feature_names())
kmeans = kmeans_model.fit(vz_sample)
kmeans_clusters = kmeans.predict(vz_sample)
kmeans_distances = kmeans.transform(vz_sample)
# vocabulary indices of each centroid, sorted by descending weight
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
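# an added sketch (not in the original): print the top 10 keywords that
# characterize each cluster centroid
for i in range(num_clusters):
    top_terms = [terms[j] for j in sorted_centroids[i, :10]]
    print('Cluster %d: %s' % (i, ' | '.join(top_terms)))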
# reduce dimension to 2 using tsne
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
# combined_sample.reset_index(drop=True, inplace=True)
kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
kmeans_df['description'] = combined_sample['item_description']
kmeans_df['category'] = combined_sample['general_cat']
# kmeans_df['cluster']=kmeans_df.cluster.astype(str).astype('category')
plot_kmeans = bp.figure(plot_width=700, plot_height=600,
                        title="KMeans clustering of the description",
                        tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                        x_axis_type=None, y_axis_type=None, min_border=1)
kmeans_clusters
# map each of the 10 cluster ids to a fixed color
colormap = {0: 'red', 1: 'green', 2: 'blue', 3: 'black', 4: 'yellow',
            5: 'pink', 6: 'purple', 7: 'grey', 8: 'brown', 9: 'orange'}
color = pd.Series(kmeans_clusters).map(colormap)
source = ColumnDataSource(data=dict(x=kmeans_df['x'], y=kmeans_df['y'],
                                    color=color,
                                    description=kmeans_df['description'],
                                    category=kmeans_df['category'],
                                    cluster=kmeans_df['cluster']))

plot_kmeans.scatter(x='x', y='y', color='color', source=source)
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips = {"description": "@description", "category": "@category", "cluster": "@cluster"}
show(plot_kmeans)
# LDA model: the input must be bag-of-words counts
cvectorizer = CountVectorizer(min_df=4,
                              max_features=180000,
                              tokenizer=tokenize,
                              ngram_range=(1, 2))
cvz = cvectorizer.fit_transform(combined_sample['item_description'])
lda_model = LatentDirichletAllocation(n_components=10,
                                      learning_method='online',
                                      max_iter=20,
                                      random_state=42)
X_topics = lda_model.fit_transform(cvz)
n_top_words = 10
topic_summaries = []

topic_word = lda_model.components_  # get the topic words
vocab = cvectorizer.get_feature_names()

for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' | '.join(topic_words)))
# reduce dimension to 2 using tsne
tsne_lda = tsne_model.fit_transform(X_topics)
# normalize each document's topic distribution and take the dominant topic
doc_topic = X_topics / X_topics.sum(axis=1, keepdims=True)
lda_keys = [doc_topic[i].argmax() for i in range(len(combined_sample))]
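# an added check (not in the original): how the sample spreads across topics
print(pd.Series(lda_keys).value_counts())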

lda_df = pd.DataFrame(tsne_lda, columns=['x', 'y'])
lda_df['description'] = combined_sample['item_description']
lda_df['category'] = combined_sample['general_cat']
lda_df['topic'] = lda_keys
lda_df['topic'] = lda_df['topic'].map(int)
plot_lda = bp.figure(plot_width=700,
                     plot_height=600,
                     title="LDA topic visualization",
                     tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                     x_axis_type=None, y_axis_type=None, min_border=1)
# reuse the same 10-color mapping for the topic ids
colormap = {0: 'red', 1: 'green', 2: 'blue', 3: 'black', 4: 'yellow',
            5: 'pink', 6: 'purple', 7: 'grey', 8: 'brown', 9: 'orange'}
color = pd.Series(lda_keys).map(colormap)
source = ColumnDataSource(data=dict(x=lda_df['x'], y=lda_df['y'],
                                    color=color,
                                    description=lda_df['description'],
                                    topic=lda_df['topic'],
                                    category=lda_df['category']))

plot_lda.scatter(source=source, x='x', y='y', color='color')
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"description": "@description",
                  "topic": "@topic", "category": "@category"}
show(plot_lda)


