# Product-information visualization and text-processing visualization.
#
# Launch the notebook with:
#   jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

# --- Imports -----------------------------------------------------------------
import logging
import pickle
import re
import string
import warnings
from collections import Counter

import bokeh.plotting as bp
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py
import plotly.tools as tls
import seaborn as sns
from bokeh.models import BoxSelectTool, ColumnDataSource, HoverTool
from bokeh.plotting import figure, output_notebook, show
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS,
    CountVectorizer,
    TfidfVectorizer,
)
from wordcloud import WordCloud

sns.set(style="white")
py.init_notebook_mode(connected=True)
warnings.filterwarnings('ignore')
logging.getLogger("lda").setLevel(logging.WARNING)

# Characters replaced by a space before tokenizing: punctuation, digits and
# CR / TAB / LF.  Compiled once and shared by wordCount() and tokenize().
_NON_WORD_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')

# Placeholder used when an item has no (or an incomplete) category path.
_NO_LABEL = "No Label"

# --- Load data ---------------------------------------------------------------
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

# Size of the training and test sets.
print(train.shape)
print(test.shape)

# Column dtypes (a mix of string and numeric columns) and a first look.
print(train.dtypes)
print(train.head())

# --- Target variable: price --------------------------------------------------
# The price we want to suggest is analysed on a log scale.
print(train.price.describe())

# Price distribution before (left) and after (right) the log transform.
plt.subplot(1, 2, 1)
train['price'].plot.hist(bins=50, figsize=(20, 10), edgecolor='white',
                         range=[0, 250])
plt.xlabel('price+', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Price Distribution - Training Set', fontsize=17)

plt.subplot(1, 2, 2)
np.log(train['price'] + 1).plot.hist(bins=50, figsize=(20, 10),
                                     edgecolor='white')
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Log(Price) Distribution - Training Set', fontsize=17)
plt.show()

# --- Shipping ----------------------------------------------------------------
# Share of listings per shipping flag.  (Original note: roughly 55% of sellers
# cover shipping -- verify against the proportions printed here.)
print(train.shipping.value_counts() / len(train))

# Compare the price distribution for the two shipping arrangements.
prc_shipBySeller = train.loc[train.shipping == 1, 'price']
prc_shipByBuyer = train.loc[train.shipping == 0, 'price']

fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(np.log(prc_shipBySeller + 1), color='#8CB4E1', alpha=1.0, bins=50,
        label='Price when Seller pays Shipping')
ax.hist(np.log(prc_shipByBuyer + 1), color='#007D00', alpha=0.7, bins=50,
        label='Price when Buyer pays Shipping')
plt.legend()
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.title('Price Distribution by Shipping Type', fontsize=17)
plt.tick_params(labelsize=15)
plt.show()
# Original observation: the average price appears lower when the buyer pays
# for shipping than when the seller covers it.

# --- Item category -----------------------------------------------------------
print("There are %d unique values in the category column."
      % train['category_name'].nunique())

# Top 5 raw categories.
print(train['category_name'].value_counts()[:5])

# Missing categories.
print("There are %d items that do not have a label."
      % train['category_name'].isnull().sum())


def split_cat(text):
    """Split a ``"general/sub1/sub2"`` category path into exactly three levels.

    Args:
        text: The raw ``category_name`` value.  Anything that is not a string
            (e.g. the NaN pandas uses for a missing value) is treated as
            unlabeled.

    Returns:
        A 3-tuple ``(general_cat, subcat_1, subcat_2)``.  Levels beyond the
        third are dropped and missing levels are filled with ``"No Label"``,
        so every row unpacks cleanly into the three target columns.
    """
    if not isinstance(text, str):
        return (_NO_LABEL, _NO_LABEL, _NO_LABEL)
    parts = text.split("/")[:3]
    parts += [_NO_LABEL] * (3 - len(parts))
    return tuple(parts)


# Break the category path into its three levels.
train['general_cat'], train['subcat_1'], train['subcat_2'] = \
    zip(*train['category_name'].apply(split_cat))
print(train.head())

# Repeat the same step for the test set.
test['general_cat'], test['subcat_1'], test['subcat_2'] = \
    zip(*test['category_name'].apply(split_cat))

print("There are %d unique first sub-categories."
      % train['subcat_1'].nunique())
print("There are %d unique second sub-categories."
      % train['subcat_2'].nunique())
# Original observation: there are 7 main categories (114 first-level and 871
# second-level sub-categories); Women and Beauty are the two most popular
# (over 50% of observations), followed by Kids and Electronics.

# Distribution of the main categories.
general_counts = train['general_cat'].value_counts()
x = general_counts.index.values.astype('str')
y = general_counts.values
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))]

trace1 = go.Bar(x=x, y=y, text=pct)
layout = dict(title='Number of Items by Main Category',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Category'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)

# Distribution of the 15 most frequent first-level sub-categories.
subcat_counts = train['subcat_1'].value_counts()[:15]
x = subcat_counts.index.values.astype('str')
y = subcat_counts.values
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))]

trace1 = go.Bar(x=x, y=y, text=pct,
                marker=dict(color=y, colorscale='Portland', showscale=True,
                            reversescale=False))
layout = dict(title='Number of Items by Sub Category (Top 15)',
              yaxis=dict(title='Count'),
              xaxis=dict(title='SubCategory'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
# Original observation: athletic apparel, makeup and blouses lead the list.

# Log-price distribution per main category (horizontal box plots: the x axis
# carries log(price+1), one box per category along the y axis).
general_cats = train['general_cat'].unique()
data = [
    go.Box(x=np.log(train.loc[train['general_cat'] == cat, 'price'] + 1),
           name=cat)
    for cat in general_cats
]
layout = dict(title="Price Distribution by General Category",
              yaxis=dict(title='Category'),
              xaxis=dict(title='log(price+1)'))
fig = dict(data=data, layout=layout)
py.iplot(fig)

# --- Brand name --------------------------------------------------------------
print("There are %d unique brand names in the training dataset."
      % train['brand_name'].nunique())

brand_counts = train['brand_name'].value_counts()[:10]
x = brand_counts.index.values.astype('str')
y = brand_counts.values
trace1 = go.Bar(x=x, y=y,
                marker=dict(color=y, colorscale='Portland', showscale=True,
                            reversescale=False))
# Brands run along the x axis and item counts along the y axis.
layout = dict(title='Top 10 Brand by Number of Items',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Brand Name'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)

# --- Item description --------------------------------------------------------
# The description is unstructured text, which makes it harder to parse.  Does
# a longer, more detailed description go with a higher price?  Punctuation is
# stripped, English stop words (redundant words such as "a", "the") are
# dropped, and so is any other word shorter than 3 characters.


def wordCount(text):
    """Count the informative words in an item description.

    The text is lower-cased, punctuation / digits / line breaks are replaced
    by spaces, and the remaining space-separated words are counted, skipping
    scikit-learn's English stop words and words shorter than 3 characters
    (the same length threshold ``tokenize`` applies).

    Args:
        text: The description.  Non-string values (e.g. NaN for a missing
            description) count as zero words.

    Returns:
        The number of words kept, as an ``int``.
    """
    if not isinstance(text, str):
        return 0
    cleaned = _NON_WORD_RE.sub(" ", text.lower())
    return sum(
        1 for w in cleaned.split(" ")
        if len(w) >= 3 and w not in ENGLISH_STOP_WORDS
    )


# Add a word-count column to both the training and the test set.
train['desc_len'] = train['item_description'].apply(wordCount)
test['desc_len'] = test['item_description'].apply(wordCount)
print(train.head())

# Is the description length related to the price?
df = train.groupby('desc_len')['price'].mean().reset_index()
trace1 = go.Scatter(
    x=df['desc_len'],
    y=np.log(df['price'] + 1),
    mode='lines+markers',
    name='lines+markers',
)
layout = dict(title='Average Log(Price) by Description Length',
              yaxis=dict(title='Average Log(Price)'),
              xaxis=dict(title='Description Length'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)

print(train.item_description.isnull().sum())
# Remove rows with a missing item description.
train = train[pd.notnull(train['item_description'])]

# Build, per category, the list of text units found in its descriptions.
# NOTE(review): 'tokenizers/punkt/english.pickle' is NLTK's Punkt *sentence*
# tokenizer, so the "Word Frequency" chart below counts sentence-level units
# rather than single words -- confirm that this is intended.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
cat_desc = {}
for cat in general_cats:
    text = " ".join(
        train.loc[train['general_cat'] == cat, 'item_description'].values)
    cat_desc[cat] = sent_tokenizer.tokenize(text)

# Flat list of all units combined, and the 20 most common of them.
flat_lst = [item for sublist in cat_desc.values() for item in sublist]
allWordsCount = Counter(flat_lst)
all_top10 = allWordsCount.most_common(20)
x = [w[0] for w in all_top10]
y = [w[1] for w in all_top10]

# No hover text here: `pct` still holds the sub-category percentages.
trace1 = go.Bar(x=x, y=y)
layout = dict(title='Word Frequency',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Word'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)

# --- Text processing ---------------------------------------------------------
# - tokenize
# - remove stop words
# - filter the remaining tokens
stop = set(stopwords.words('english'))


def tokenize(text):
    """Turn a description into a list of cleaned, lower-cased word tokens.

    Punctuation, digits and line breaks are replaced by spaces, the text is
    segmented with ``sent_tokenize`` and each sentence is split with
    ``word_tokenize``.  NLTK English stop words, tokens without any ASCII
    letter and tokens shorter than 3 characters are discarded.

    Args:
        text: The description.  Non-string values (e.g. NaN) yield an empty
            list instead of ``None`` so downstream consumers always receive a
            list.

    Returns:
        A list of lower-cased tokens.
    """
    if not isinstance(text, str):
        return []
    text = _NON_WORD_RE.sub(" ", text)  # remove punctuation and digits
    tokens = [
        token
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
    ]
    return [
        token.lower()
        for token in tokens
        if token.lower() not in stop
        and re.search('[a-zA-Z]', token)
        and len(token) >= 3
    ]


# Apply the tokenizer to the item description column.
train['tokens'] = train['item_description'].map(tokenize)
test['tokens'] = test['item_description'].map(tokenize)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

# Print a few descriptions next to their tokens.
for description, tokens in zip(train['item_description'].head(),
                               train['tokens'].head()):
    print('description:', description)
    print('tokens:', tokens)
    print()

# --- Word clouds -------------------------------------------------------------
# Build a dictionary with key=category and value=all the related descriptions.
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE


def _feature_names(vec):
    """Return the fitted vocabulary of a scikit-learn text vectorizer.

    Uses ``get_feature_names_out`` when the installed scikit-learn provides
    it and falls back to the older ``get_feature_names`` otherwise.
    """
    getter = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
    return getter()


# One token list per main category, from all of its descriptions joined.
cat_desc = {}
for cat in general_cats:
    text = " ".join(
        train.loc[train['general_cat'] == cat, 'item_description'].values)
    cat_desc[cat] = tokenize(text)

# The 100 most common words for the top 4 categories.
women100 = Counter(cat_desc['Women']).most_common(100)
beauty100 = Counter(cat_desc['Beauty']).most_common(100)
kids100 = Counter(cat_desc['Kids']).most_common(100)
electronics100 = Counter(cat_desc['Electronics']).most_common(100)


def generate_wordcloud(tup):
    """Render a word cloud from ``(word, count)`` pairs.

    Args:
        tup: An iterable of ``(word, count)`` pairs, e.g. the result of
            ``Counter.most_common``.

    Returns:
        A ``WordCloud`` sized by the given counts (white background, at most
        50 words, fixed random state so the layout is repeatable).
    """
    cloud = WordCloud(background_color='white',
                      max_words=50,
                      max_font_size=40,
                      random_state=42)
    # Pass the counts directly; rendering the stringified list would treat
    # every word as occurring once and lose the frequencies.
    return cloud.generate_from_frequencies(dict(tup))


fig, axes = plt.subplots(2, 2, figsize=(30, 15))
panels = [
    ("Women Top 100", women100),
    ("Beauty Top 100", beauty100),
    ("Kids Top 100", kids100),
    ("Electronic Top 100", electronics100),
]
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, (title, top_words) in zip(axes.flat, panels):
    ax.imshow(generate_wordcloud(top_words), interpolation="bilinear")
    ax.axis('off')
    ax.set_title(title, fontsize=30)
plt.show()

# --- tf-idf ------------------------------------------------------------------
vectorizer = TfidfVectorizer(min_df=10,
                             max_features=180000,
                             tokenizer=tokenize,
                             ngram_range=(1, 2))

all_desc = np.append(train['item_description'].values,
                     test['item_description'].values)
vz = vectorizer.fit_transform(list(all_desc))
print(vz.shape)

# Map every token to its idf weight.
tfidf = pd.DataFrame.from_dict(
    dict(zip(_feature_names(vectorizer), vectorizer.idf_)), orient='index')
tfidf.columns = ['tfidf']

# The 10 tokens with the lowest score: unsurprisingly, very generic words that
# do not help to tell one description from another.
print(tfidf.sort_values(by=['tfidf'], ascending=True).head(10))

# The 10 tokens with the highest score: much more specific words, from which
# the category they belong to can often be guessed.
print(tfidf.sort_values(by=['tfidf'], ascending=False).head(10))

# The tf-idf matrix is high-dimensional, so singular value decomposition (SVD)
# is used to reduce it first.  To visualize the vocabulary, t-SNE then takes
# the n_comp SVD components down to 2 dimensions; t-SNE is better suited to
# reducing to 2 or 3 dimensions.
trn = train.copy()
tst = test.copy()
trn['is_train'] = 1
tst['is_train'] = 0

sample_sz = 15000

combined_df = pd.concat([trn, tst])
# Seeded like every model below so the sample (and the plots) are repeatable.
combined_sample = combined_df.sample(n=sample_sz, random_state=42)
# transform(), not fit_transform(): the sample must live in the feature space
# fitted on the full corpus, otherwise `terms` below would no longer line up
# with the centroids computed on `vz`.
vz_sample = vectorizer.transform(list(combined_sample['item_description']))

n_comp = 30
svd = TruncatedSVD(n_components=n_comp, random_state=42)
svd_tfidf = svd.fit_transform(vz_sample)

tsne_model = TSNE(n_components=2, verbose=1, random_state=42, n_iter=500)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600,
                       title="tf-idf clustering of the item description",
                       tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                       x_axis_type=None, y_axis_type=None, min_border=1)

# Positional index so the sample rows align with the t-SNE output rows.
combined_sample.reset_index(inplace=True, drop=True)

tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['description'] = combined_sample['item_description']
tfidf_df['tokens'] = combined_sample['tokens']
tfidf_df['category'] = combined_sample['general_cat']

plot_tfidf.scatter(x='x', y='y', source=tfidf_df, alpha=0.7)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips = {"description": "@description",
                  "tokens": "@tokens",
                  "category": "@category"}
show(plot_tfidf)

# --- K-Means clustering ------------------------------------------------------
num_clusters = 10  # needs to be selected wisely
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters,
                               init='k-means++',
                               n_init=1,
                               init_size=1000,
                               batch_size=1000,
                               verbose=0,
                               max_iter=1000)

# Fit on the full corpus to rank the terms of each centroid.
kmeans = kmeans_model.fit(vz)
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = _feature_names(vectorizer)

# Refit on the sample; these assignments and distances feed the plot.
kmeans = kmeans_model.fit(vz_sample)
kmeans_clusters = kmeans.predict(vz_sample)
kmeans_distances = kmeans.transform(vz_sample)

# Reduce dimension to 2 using t-SNE.
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)

kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
kmeans_df['description'] = combined_sample['item_description']
kmeans_df['category'] = combined_sample['general_cat']

plot_kmeans = bp.figure(plot_width=700, plot_height=600,
                        title="KMeans clustering of the description",
                        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                        x_axis_type=None, y_axis_type=None, min_border=1)

# One color per cluster / topic index 0-9.
_PALETTE = ('red', 'green', 'blue', 'black', 'yellow',
            'pink', 'purple', 'grey', 'brown', 'orange')
_COLOR_BY_INDEX = dict(enumerate(_PALETTE))
# Same palette keyed by the index as a string ('0' .. '9').
colormap = {str(index): name for index, name in _COLOR_BY_INDEX.items()}


def get_color(num):
    """Return the plot color for cluster index ``num``.

    Args:
        num: A cluster index.  Any number comparing equal to an integer in
            0-9 is accepted.

    Returns:
        The color name, or ``None`` when ``num`` is outside 0-9.
    """
    return _COLOR_BY_INDEX.get(num)


color = pd.Series(kmeans_clusters).apply(get_color)

source = ColumnDataSource(data=dict(x=kmeans_df['x'],
                                    y=kmeans_df['y'],
                                    color=color,
                                    description=kmeans_df['description'],
                                    category=kmeans_df['category'],
                                    cluster=kmeans_df['cluster']))

plot_kmeans.scatter(x='x', y='y', color='color', source=source)
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips = {"description": "@description",
                  "category": "@category",
                  "cluster": "@cluster"}
show(plot_kmeans)

# --- LDA model ---------------------------------------------------------------
# LDA takes bag-of-words counts as input, hence a CountVectorizer here.
cvectorizer = CountVectorizer(min_df=4,
                              max_features=180000,
                              tokenizer=tokenize,
                              ngram_range=(1, 2))
cvz = cvectorizer.fit_transform(combined_sample['item_description'])

lda_model = LatentDirichletAllocation(n_components=10,
                                      learning_method='online',
                                      max_iter=20,
                                      random_state=42)
X_topics = lda_model.fit_transform(cvz)
n_top_words = 10
topic_summaries = []

topic_word = lda_model.components_  # per-topic word weights
# get_feature_names() is gone from recent scikit-learn releases; prefer the
# replacement and fall back for older installations.
try:
    vocab = cvectorizer.get_feature_names_out()
except AttributeError:
    vocab = cvectorizer.get_feature_names()
vocab = np.array(vocab)

for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so walk it backwards to take the heaviest words.
    topic_words = vocab[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' | '.join(topic_words)))

# Reduce dimension to 2 using t-SNE.
tsne_lda = tsne_model.fit_transform(X_topics)

# Normalize each document's topic weights to sum to 1, then take the
# dominant topic of every document in one vectorized call.
unnormalized = np.asarray(X_topics)
doc_topic = unnormalized / unnormalized.sum(axis=1, keepdims=True)
lda_keys = doc_topic.argmax(axis=1).tolist()

lda_df = pd.DataFrame(tsne_lda, columns=['x', 'y'])
lda_df['description'] = combined_sample['item_description']
lda_df['category'] = combined_sample['general_cat']
lda_df['topic'] = lda_keys

plot_lda = bp.figure(plot_width=700, plot_height=600,
                     title="LDA topic visualization",
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One color per cluster / topic index 0-9.
_PALETTE = ('red', 'green', 'blue', 'black', 'yellow',
            'pink', 'purple', 'grey', 'brown', 'orange')
_COLOR_BY_INDEX = dict(enumerate(_PALETTE))
# Same palette keyed by the index as a string ('0' .. '9').
colormap = {str(index): name for index, name in _COLOR_BY_INDEX.items()}


def get_color(num):
    """Return the plot color for topic index ``num``.

    Args:
        num: A topic index.  Any number comparing equal to an integer in
            0-9 is accepted.

    Returns:
        The color name, or ``None`` when ``num`` is outside 0-9.
    """
    return _COLOR_BY_INDEX.get(num)


color = pd.Series(lda_keys).apply(get_color)

source = ColumnDataSource(data=dict(x=lda_df['x'],
                                    y=lda_df['y'],
                                    color=color,
                                    description=lda_df['description'],
                                    topic=lda_df['topic'],
                                    category=lda_df['category']))

plot_lda.scatter(source=source, x='x', y='y', color='color')
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"description": "@description",
                  "topic": "@topic",
                  "category": "@category"}
show(plot_lda)