淘宝商品口红数据爬取与分析

最新推荐文章于 2024-06-30 12:28:37 发布

进步小白

最新推荐文章于 2024-06-30 12:28:37 发布

阅读量2.6k

点赞数 11

分类专栏：爬虫、数据分析　文章标签：python、数据分析、大数据、爬虫

本文链接：https://blog.csdn.net/am_student/article/details/122018655

版权

爬虫同时被 2 个专栏收录

6 篇文章 2 订阅

订阅专栏

数据分析

6 篇文章 2 订阅

订阅专栏

数据来源：

数据为自行爬取所得，网盘中包含爬取到的数据与停用词库。

百度网盘（提取码：6666）

处理过程

导入数据：

import pandas as pd
# Load the scraped lipstick listings from the Excel workbook and preview them.
data1 = pd.read_excel("kouhong_good.xlsx")
data1.head()

# Discard the comment_url column in place.
data1.drop(columns=['comment_url'], inplace=True)

将数据店铺分类：

def store(e):
    """Map a raw shop name to a coarse shop-type label.

    The keywords are tested in order and the first one contained in ``e``
    decides the label; a name containing none of them is labelled
    '自营店铺'.
    """
    keyword_to_label = (
        ('天猫', '天猫店铺'),
        ('旗舰店', '旗舰店'),
        ('专营店', '专营店'),
        ('企业店', '企业店铺'),
    )
    for keyword, label in keyword_to_label:
        if keyword in e:
            return label
    return '自营店铺'

# Derive a shop-type label from every shop name, then drop the raw name column.
data1['store_type'] = data1['store'].apply(store)
data1.drop(['store'],axis = 1,inplace = True)
data1.head()

处理销量与价格：

import re
def delete(e):
    """Strip the '人收货' suffix from a raw sales string.

    A string that does not contain the suffix is returned unchanged rather
    than falling through to an implicit ``None``, so the result can always
    be passed on to ``price``.
    """
    return e.replace('人收货', '')
def price(e):
    """Convert a sales-count string such as '1.5万+', '200+' or '35' to a float.

    A trailing '+' is ignored and '万' multiplies the number by 10000.
    Every branch returns a ``float`` so the resulting column has a single
    numeric type.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    text = e.replace('+', '').strip()
    if '万' in text:
        return float(text.replace('万', '')) * 10000
    return float(text)
# Clean the raw sales text with delete() and price(), store the result as
# store_sales, then drop the raw sales column.
data1['store_sales'] = data1['sales'].apply(delete).apply(price)
data1.drop(['sales'],axis = 1,inplace = True)
data1.head()

品牌分类：

def classify(e):
    """Return the brand label for a product title.

    Brands are checked in order (MAC, Dior, Givenchy) and the first brand
    with a keyword contained in ``e`` is returned. Each keyword is tested
    with its own ``in`` check, so a title matching no keyword is labelled
    'Others'.
    """
    brand_keywords = (
        ('MAC', ('Mac', '魅可')),
        ('Dior', ('Dior',)),
        ('Givenchy', ('Givenchy', '纪梵希')),
    )
    for brand, keywords in brand_keywords:
        if any(keyword in e for keyword in keywords):
            return brand
    return 'Others'
# Label every listing with its brand, derived from the title.
data1['brand'] = data1['title'].apply(classify)
data1.head(20)

处理商铺地点：

def location(e):
    """Return the part of ``e`` that precedes the first space.

    When ``e`` contains no space the whole string is returned.
    """
    head, _sep, _tail = e.partition(' ')
    return head
# Keep only the text before the first space of each location value, then
# drop the raw location column.
data1['store_location'] = data1['location'].apply(location)
data1.drop(['location'],axis = 1,inplace = True)
data1.head(5)

处理价格，删去不合理的价格（单价低于 51 元的商品）：

# Listings priced below this threshold are treated as unreasonable.
MIN_PRICE = 51

# Collect the row labels of the under-priced listings. The name avoids
# shadowing the built-in ``list``.
low_price_index = data1[data1['price'] < MIN_PRICE].index.tolist()
print(low_price_index)

# Drop exactly the rows found above, so the filter stays correct when the
# input data changes.
data1.drop(low_price_index, inplace=True)
data1

data1['store_sales'] = data1['store_sales'].astype(int)

# Estimated revenue per listing: unit price times units sold.
data1['sales_money'] = data1['price']*data1['store_sales']
data1

品牌占比：

# Share of listings per brand.
# NOTE(review): a (labels), b (values) and m (total) are derived here from
# the brand column; confirm this matches how they were originally computed.
brand_counts = data1['brand'].value_counts()
a = brand_counts.index.tolist()
m = int(brand_counts.sum())
b = [count / m for count in brand_counts.tolist()]
print(b)

from pyecharts import Pie
pie = Pie("口红品牌比例",width = 600,height = 400)
pie.add("", a, b, is_label_show=True)
pie.render('1.html')

# Group the prices by brand once and reuse the grouping for each statistic.
price_by_brand = data1['price'].groupby(data1['brand'])
price_by_brand.sum()

# Mean price per brand, rounded to one decimal place.
brand_mean = price_by_brand.mean().round(1)
brand_mean

# Median price per brand.
brand_median = price_by_brand.median()
brand_median

不同品牌价格概况：

from pyecharts import Bar
# Take the x-axis categories from the grouped result itself so labels and
# values stay aligned whatever brands are present.
name = brand_mean.index.tolist()
bar = Bar("不同品牌价格概况",width = 600,height = 400)
bar.add('平均价格',name,brand_mean.tolist(),is_label_show = True,
        xaxis_label_textsize = 25,yaxis_label_textsize = 15)
# Reindex the medians to the same brand order as the means.
bar.add('中位价格',name,brand_median.reindex(brand_mean.index).tolist(),is_label_show = True,
        xaxis_label_textsize = 25,yaxis_label_textsize = 15)
bar.render('2.html')

# Total sales_money per brand, rounded to one decimal place.
all_sale = round(data1['sales_money'].groupby(data1['brand']).sum(),1)
all_sale

# Pivot sales_money by (brand, store_type).
# NOTE(review): no aggfunc is passed, so pandas' documented default (mean)
# applies — confirm an average rather than a total is intended here.
data2 = data1.pivot_table(values=['sales_money'], index=['brand', 'store_type'])
data2

def transform(e):
    """Return ``e`` converted with the built-in ``int``."""
    as_int = int(e)
    return as_int
# Store an integer copy of sales_money as sale_money and drop the original
# column from the pivot table.
data2['sale_money'] = data2['sales_money'].apply(transform)
data2.drop(['sales_money'],axis = 1,inplace = True)

# Brand categories used on the x-axis of both charts below.
name2 =["Dior", "Givenchy", "MAC"]

# Bar chart of the per-brand sales totals computed in all_sale.
bar1=Bar('不同品牌销售额概况',width = 800,height = 500)
bar1.add('',name2,all_sale,is_label_show = True)
bar1.render('3.html')

# One series per shop type, each listing a value for Dior, Givenchy and MAC.
# NOTE(review): the numbers are hard-coded literals, presumably copied from
# the data2 pivot output — they will not follow changes in the data; confirm.
bar2=Bar('不同品牌之不同店铺销售额',width = 600,height = 400)
bar2.add('专营店',name2,[561673,1683,0],is_label_show = True)
bar2.add('企业店铺',name2,[74099,152569,16789],is_label_show = True)
bar2.add('天猫店铺',name2,[224795,0,0],is_label_show = True)
bar2.add('旗舰店',name2,[165455,1103268,2657514],is_label_show = True)
bar2.add('自营店铺',name2,[19276,12157,10958],is_label_show = True)
bar2.render('4.html')

地点处理：

# Number of listings per shop location, most frequent first.
location_counts = data1['store_location'].value_counts()
location_counts

# Feed the map from the computed counts so the chart follows the data.
# NOTE(review): this rebinds the module-level name ``location`` (previously
# the helper function) to a list, as the literal list did before.
location = location_counts.index.tolist()
number = location_counts.values.tolist()

from pyecharts import Map
map0 = Map("店铺地址分布图",width=800, height=600)
# Upper bound of the colour scale is the largest count present.
map0.add("", location, number, visual_range=[0, max(number, default=0)],maptype="china", is_visualmap=True, visual_text_color='#000',is_label_show=True)
map0.render("5.html")

# Cast the title column to str before it is segmented.
data1['title'] = data1['title'].astype(str)
import jieba
import jieba.analyse

def cut_word(text):
    """Segment ``text`` with ``jieba.cut`` and join the tokens with spaces.

    ``text`` is converted with ``str`` first and ``cut_all`` is disabled.
    """
    tokens = jieba.cut(str(text), cut_all=False)
    return ' '.join(tokens)

# Space-separated segmentation of every title.
data1['new_title'] = data1['title'].apply(cut_word)
data1

# Read the text used for keyword extraction. The ``with`` block closes the
# file on exit, so no explicit close is needed.
# NOTE(review): nothing in this script writes words.txt — presumably it was
# exported from the new_title column beforehand; confirm.
with open('words.txt', 'r', encoding='utf-8') as f:
    words = f.read()
print(words)

关键词占比：

# Register the stop-word file, then request the top 20 keywords with weights.
jieba.analyse.set_stop_words('stoplist.txt')
new_words = jieba.analyse.textrank(words, topK=20, withWeight=True)
print(new_words)

# Split the (word, weight) pairs into two parallel lists. Iterating over
# the result itself avoids an IndexError when fewer than 20 pairs come back.
last_words = [word for word, _weight in new_words]
print(last_words)
last_rank = [weight for _word, weight in new_words]
print(last_rank)

制作词云：

from pyecharts import WordCloud
# Render the title keywords as a word cloud, using their weights as values.
wordcloud=WordCloud(width=600,height=400)
wordcloud.add('',last_words,last_rank,word_size_range=[20,100])
wordcloud.render("7.html")

# Load the three per-brand review CSV files and print them.
# Note: data2 is rebound here and no longer refers to the pivot table.
data2 = pd.read_csv('Dior_kouhong_data.csv')
data3 = pd.read_csv('Givenchy_kouhong_data.csv')
data4 = pd.read_csv('mac_kouhong_data.csv')
print(data2)
print(data3)
print(data4)

from snownlp import SnowNLP
# Stack the three review tables and renumber the rows.
data = pd.concat([data2, data3, data4], axis=0)
data = data.reset_index(drop=True)
data

# Remove the placeholder text for reviews left empty by the user. The
# .copy() makes the result an independent frame, so later column
# assignments do not hit pandas' SettingWithCopyWarning.
data = data[~data['口红评价'].isin(['此用户没有填写评论!'])].copy()
the_data = data
data

def length(e):
    """Flag ``e`` by length: '0' for at most 10 characters, '1' otherwise."""
    return '1' if len(e) > 10 else '0'

# Flag every review as short ('0') or long ('1'). assign() builds a new
# frame instead of writing into what may be a filtered slice.
data = data.assign(num=data['口红评价'].apply(length))
data

# Keep only the long reviews, discard the helper column and renumber the
# rows, without any in-place mutation of a slice.
data = data[data['num'] != '0'].drop(columns=['num']).reset_index(drop=True)
data

# Space-separated segmentation of every review text.
data['评价'] = data['口红评价'].apply(cut_word)
data

# Read the review text used for keyword extraction. The ``with`` block
# closes the file on exit, so no explicit close is needed.
# NOTE(review): nothing in this script writes words2.txt — presumably it
# was exported from the segmented review column beforehand; confirm.
with open('words2.txt', 'r', encoding='utf-8') as f:
    words2 = f.read()
print(words2)

# Extract the top 20 weighted keywords from the review text.
# NOTE(review): the parameters mirror the title-keyword extraction; confirm
# they are the ones intended for the reviews.
new_words2 = jieba.analyse.textrank(words2, topK=20, withWeight=True)

# Split the (word, weight) pairs into two parallel lists. Iterating over
# the result itself avoids an IndexError when fewer than 20 pairs come back.
last_words2 = [word for word, _weight in new_words2]
print(last_words2)
last_rank2 = [weight for _word, weight in new_words2]
print(last_rank2)

云图：

# Render the review keywords as a word cloud, using their weights as values.
wordcloud=WordCloud(width=600,height=400)
wordcloud.add('',last_words2,last_rank2,word_size_range=[20,100])
wordcloud.render("8.html")

# Copy of the review table without the segmented-text column.
new_data = data.drop(['评价'],axis = 1)
new_data

按关键词查找评论（代码中虽然导入了 re 模块，但实际使用的是字符串包含判断）：

import re
keyword = input('请输入想查找的关键词：')

# Literal substring match on every review in one vectorised pass. Each
# matching row is selected by its own index, so reviews with identical text
# are all returned. Missing values count as non-matching.
keyword_mask = new_data['口红评价'].str.contains(keyword, regex=False, na=False)
k_list = new_data.loc[keyword_mask, '口红评价'].tolist()
last_list = new_data.index[keyword_mask].tolist()
new_data.loc[last_list]

# Stack the three raw review tables again (unfiltered) and renumber the rows.
last_data = pd.concat([data2,data3,data4],axis = 0)
last_data = last_data.reset_index(drop=True)
last_data

def the_brand(e):
    """Return the first of 'Dior', 'Givenchy', 'Mac' that occurs in ``e``.

    Returns ``None`` when none of them occurs.
    """
    for brand in ('Dior', 'Givenchy', 'Mac'):
        if brand in e:
            return brand
    return None
def the_del(e):
    """Remove the '<brand>:' tag from ``e``.

    The brands 'Dior', 'Givenchy' and 'Mac' are checked in that order; for
    the first one contained in ``e``, every occurrence of that brand
    followed by a colon is removed. A value containing none of the brands
    is returned unchanged instead of being replaced by ``None``.
    """
    for brand in ('Dior', 'Givenchy', 'Mac'):
        if brand in e:
            return e.replace(brand + ':', '')
    return e

# the_brand() already yields the bare brand name (or None when no brand is
# found), so its result is stored directly; chaining the_del() onto it
# would fail on None.
last_data['the_brand'] = last_data['颜色Color'].apply(the_brand)
# Colour value with the brand tag stripped.
last_data['Color'] = last_data['颜色Color'].apply(the_del)
#last_data.drop(['颜色Color'],axis = 1,inplace = True)
last_data

last_data['the_brand'].value_counts()

# Ten most frequent values of the raw colour column, with their counts.
top_colors = last_data['颜色Color'].value_counts().head(10)
m = top_colors.index.tolist()
n = top_colors.values.tolist()
print(m)
print(n)

# Bar chart of those ten values, drawn with is_convert enabled.
bar = Bar('不同色号售卖概况', width=1000, height=500, title_text_size=25)
bar.add('', m, n, is_convert=True)
bar.render('10.html')

再次引入，准备情感分析：

# Reload the three per-brand review files into fresh frames for the
# sentiment analysis below.
df1 = pd.read_csv('Dior_kouhong_data.csv')
df2= pd.read_csv('Givenchy_kouhong_data.csv')
df3 = pd.read_csv('mac_kouhong_data.csv')

# Sentiment score of every filtered review, from SnowNLP's ``sentiments``.
data['emotion'] = data['口红评价'].apply(lambda x:SnowNLP(x).sentiments)
data.head()

# Same score for each brand's raw reviews; preview the frame just scored.
df1['emotion'] = df1['口红评价'].apply(lambda x:SnowNLP(x).sentiments)
df1.head()

df2['emotion'] =df2['口红评价'].apply(lambda x:SnowNLP(x).sentiments)
df3['emotion'] = df3['口红评价'].apply(lambda x:SnowNLP(x).sentiments)
print(df2['emotion'])
print(df3['emotion'])

# Summary statistics of the Dior sentiment scores.
df1['emotion'].describe()
# The mean of emotion is 0.871 (value recorded by the author).

import matplotlib.pyplot as plt
import numpy as np

# Font setup for Chinese labels, and keep the minus sign renderable.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False


def _plot_emotion_hist(scores, title):
    """Show a histogram of ``scores`` with bin edges 0.0, 0.1, ..., 1.0."""
    bins = np.arange(0, 1.1, 0.1)
    plt.hist(scores, bins, color='#4F94CD', alpha=0.9)
    plt.xlim(0, 1)
    plt.xlabel('情感分析')
    plt.ylabel('数量')
    plt.title(title)
    plt.show()


# One histogram per brand, in the order Dior, Givenchy, MAC.
for _frame, _title in (
    (df1, '迪奥情感分析直方图'),
    (df2, '纪梵希情感分析直方图'),
    (df3, '魅可情感分析直方图'),
):
    _plot_emotion_hist(_frame['emotion'], _title)

# Reviews scoring at least 0.5 count as positive; every other row counts
# as negative.
emotion_scores = df1['emotion']
pos_nlp1 = int((emotion_scores >= 0.5).sum())
neg_nlp1 = len(emotion_scores) - pos_nlp1
print('积极评论，消极评论数目分别为：',pos_nlp1,neg_nlp1)

查看占比：

# Pie chart of the positive/negative review counts with percentage labels.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pie_labels = ('positive', 'negative')
plt.pie([pos_nlp1,neg_nlp1],labels=pie_labels,autopct='%1.1f%%',shadow=True)
plt.show()

其他品牌（纪梵希、魅可）的处理方式相同。

# The length of a review can indicate how seriously the reviewer wrote it.
import seaborn as sns
# Character count of every Dior review.
df1['认真程度'] = df1['口红评价'].str.len()
fig2, ax2=plt.subplots()
# Scatter plot of review length against sentiment score; y-axis capped at 300.
sns.scatterplot(x='emotion',y='认真程度',data=df1, ax=ax2)
ax2.set_ylim(0,300)

进行特征处理，对模型进行评分：

# Load the stop-word list: one entry per line, surrounding whitespace removed.
with open(r'stoplist.txt', encoding='utf-8') as file:
    word_list = [line.strip() for line in file]

def SetLabel(score):
    """Binarise a sentiment score: 1 when it is at least 0.6, otherwise 0."""
    return int(score >= 0.6)
# Replace the continuous sentiment score with its 0/1 label.
data['emotion'] = data['emotion'].map(SetLabel)

# Split the data set into corpus and labels.
# NOTE(review): the raw review text is vectorised here, not the
# jieba-segmented column — confirm that is intended, since the token
# pattern below splits on word characters only.
terms = data['口红评价'].tolist()
y = data['emotion'].tolist()
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
# TF-IDF over unigrams and bigrams with the stop words removed. The on/off
# options are given as real booleans, which is the documented parameter type.
tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
           ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words=word_list)

tfv.fit(terms)
X_all = tfv.transform(terms)

# Feature selection
from sklearn.feature_selection import SelectKBest, chi2
select_feature_model = SelectKBest(chi2, k=100)    
## Use the chi-squared test to keep the 100 best features.
# NOTE(review): the selector is fitted on all rows before the train/test
# split below, so the held-out rows have influenced which features are kept.
X_all = select_feature_model.fit_transform(X_all, y)

# Hold out 25% of the rows as a test set, with a fixed random seed.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_all, y, random_state=0, test_size=0.25)

from sklearn.naive_bayes import MultinomialNB as MNB
# Multinomial naive Bayes with default parameters, fitted on the training split.
model_NB = MNB()
model_NB.fit(x_train, y_train)

from sklearn.model_selection import cross_val_score
# Evaluate predictive performance: mean ROC-AUC over 20-fold
# cross-validation on the training split.
print("贝叶斯分类器20折交叉验证得分: ", np.mean(cross_val_score(model_NB, x_train, y_train, cv=20, scoring='roc_auc')))

贝叶斯分类器20折交叉验证得分:  0.675626876876877

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV
# Logistic regression fitted on the training split, then scored by mean
# ROC-AUC over 20-fold cross-validation.
model_LR = LogisticRegression(C=.01) # C is the regularization coefficient (scikit-learn documents it as the inverse of regularization strength).
model_LR.fit(x_train, y_train)
print("20折交叉验证得分: ", np.mean(cross_val_score(model_LR, x_train, y_train, cv=20, scoring='roc_auc')))

20折交叉验证得分:  0.7045076326326327

from sklearn.svm import LinearSVC
# Linear SVM fitted on the training split, then scored by mean ROC-AUC over
# 20-fold cross-validation.
model_SVM = LinearSVC(C=.01) # C is the regularization coefficient (scikit-learn documents it as the inverse of regularization strength).
model_SVM.fit(x_train, y_train)
print("20折交叉验证得分: ", np.mean(cross_val_score(model_SVM, x_train, y_train, cv=20, scoring='roc_auc')))

20折交叉验证得分:  0.701445820820821

欢迎大家订正与修改讨论，总过程代码放在项目中可以下载学习。

进步小白

关注

11
点赞
踩
75

收藏

觉得还不错? 一键收藏
打赏
2
评论
淘宝商品口红数据爬取与分析

淘宝分析口红，选择最优结果，来自程序员的浪漫~
复制链接

扫一扫

专栏目录

淘宝商品口红数据爬取与分析

数据来源：

处理过程

“相关推荐”对你有帮助么？