这个单子那大哥人确实不怎么样,先是,我说我看下淘宝代码,他就要我便宜20,同意,之后给我们拉单子,一个单子要了10元,之后我和同学吃火锅,没有来得及回消息,就说我做的东西没有达到他的要求,要求我退一些钱,要不是为了店长的好评,我就翻脸了,没办法,谁叫是小菜鸡呢,那个晚上喝了点酒,心情有点不好,居然哭了,呵呵,不说了,看这个代码吧
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait#延迟等待
from selenium.webdriver.support import expected_conditions as EC#查找元素
from selenium.webdriver.common.by import By
import time, re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import UnexpectedAlertPresentException
from time import sleep
import pandas as pd
def search(driver,wait):
try:
Input=wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="q"]')))#等待输入框显示
Input.send_keys('口红')
submit=wait.until(EC.element_to_be_clickable((By.XPATH,'//*[@id="J_TSearchForm"]/div[1]/button')))
submit.click()#点击按钮
tot_page=wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="mainsrp-pager"]/div/div/div/div[1]'))).text
print(tot_page)#100页
page_number=int(re.compile('(\d+)').search(tot_page).group(1))
return 44
except:
driver.get('https://login.taobao.com/member/login.jhtml')
sleep(6)
driver.find_element_by_xpath('//*[@class="forget-pwd J_Quick2Static"]').click()
sleep(2)
driver.find_element_by_xpath('//*[@class="weibo-login"]').click()
sleep(3)
driver.find_element_by_name('username').send_keys('1428759813@qq.com')
sleep(5)
driver.find_element_by_name('password').send_keys('aS1233211234567')
sleep(40)
#driver.find_element_by_xpath('//*[@class="btn_tip"]/a/span').click()
search(driver, wait)
return 44
def get_products(browse):
divs = browse.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq "]') # 获取标签
for div1 in divs:
price = div1.find_element_by_xpath('.//div[@class="row row-1 g-clearfix"]/div[@class="price g_price g_price-highlight"]').text
all_price.append(price)
info = div1.find_element_by_xpath('.//div[@class="row row-2 title"]/a[@class="J_ClickStat"]').text
all_info.append(info)
producer = div1.find_element_by_xpath('.//div[@class="row row-3 g-clearfix"]/div[@class="location"]').text
all_producer.append(producer)
deal = div1.find_element_by_xpath('.//div[@class="row row-1 g-clearfix"]/div[@class="deal-cnt"]').text
all_deal.append(deal)
dic={'商品信息':info,'销售价格':price,'发货地':producer,'销售额':deal}
print(dic)
def digging(driver,page):
num=24
while num!=page-1:
driver.get('https://s.taobao.com/search?q=口红&s={}'.format(44*num))
driver.implicitly_wait(10)
get_products(driver)
num += 1
sleep(7)
if __name__ == "__main__":
try:
all_info = []
all_deal = []
all_price = []
all_producer = []
driver=webdriver.Chrome()
wait = WebDriverWait(driver, 15)
driver.get('https://www.taobao.com/')
driver.maximize_window()
page = search(driver,wait)
print(type(page))
digging(driver,page)
print(all_info,all_price,all_producer,all_deal)
lipstick=pd.DataFrame()
lipstick['商品信息']=all_info
lipstick['销售价格']=all_price
lipstick['发货地']=all_producer
lipstick['销售额']=all_deal
finally:
lipstick.to_excel('lipstick.xls2',index=False)
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False
test=pd.read_excel('lipstick.xls')
test.head()
info | price | producer | sales | |
---|---|---|---|---|
0 | 【圣诞礼物】MAC/魅可子弹头断货王口红唇膏 chili/牛血色/mac646 | ¥170.00 | 浙江 杭州 | 10万+人付款 |
1 | 【圣诞礼物】MAC/魅可尤雾弹唇膏哑光口红316/923人间水蜜桃新款 | ¥170.00 | 浙江 杭州 | 10万+人付款 |
2 | Dior/迪奥圣诞星空限量口红套装烟花礼盒蓝金唇膏口红999 772 080 | ¥1368.00 | 福建 福州 | 209人付款 |
3 | Christian Louboutin萝卜丁女王限量口红3支装001/001s/001m 3.5g | ¥2508.00 | 福建 福州 | 26人付款 |
4 | ChristianLouboutin萝卜丁进口女王权杖口红唇膏多色滋润保湿持久 | ¥788.00 | 浙江 杭州 | 1057人付款 |
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 4 columns):
info 1059 non-null object
price 1059 non-null object
producer 1059 non-null object
sales 1053 non-null object
dtypes: object(4)
memory usage: 33.2+ KB
#缺失值
test.isnull().sum()
test.dropna(inplace=True)
数据预处理
test['price']=test['price'].apply(lambda x: x.split('¥')[1])
import re
test['sales'] = test['sales'].apply(lambda x: re.findall(r"\d+\.?\d*",str(x)))
for i in test.index:
test.loc[i,'sales']=test['sales'][i][0]
词云
import codecs
import jieba
import pickle
# test['info'].to_csv('名称.txt', sep='\t', index=False)
# fin = codecs.open('名称.txt',mode = 'r', encoding = 'utf-8')
# # print (fin.read())
# #第一次运行程序时将分好的词存入文件
# text = ''
# with open('名称.txt',encoding = 'utf-8') as fin:
# for line in fin.readlines():
# line = line.strip('\n')
# text += ' '.join(jieba.cut(line))
# text += ' '
# fout = open('text.txt','wb')
# pickle.dump(text,fout)
# fout.close()
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
fr = open('text.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('lipstick.jpg')
wc = WordCloud( background_color = 'white', # 设置背景颜色
mask = backgroud_Image, # 设置背景图片
max_words = 200, # 设置最大现实的字数
stopwords = STOPWORDS, # 设置停用词
font_path = 'simfang.ttf',# 设置字体格式,如不设置显示不了中文
max_font_size = 200, # 设置字体最大值
random_state = 8, # 设置有多少种随机生成状态,即有多少种配色方案
)
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(20,30))
plt.imshow(wc)
plt.axis('off')
plt.show()
商品价格对销售额的影响
test['sales']=test['sales'].astype('float')
test['sales']=test['sales'].astype('float')
test.head()
info | price | producer | sales | |
---|---|---|---|---|
0 | 【圣诞礼物】MAC/魅可子弹头断货王口红唇膏 chili/牛血色/mac646 | 170.00 | 浙江 杭州 | 10.0 |
1 | 【圣诞礼物】MAC/魅可尤雾弹唇膏哑光口红316/923人间水蜜桃新款 | 170.00 | 浙江 杭州 | 10.0 |
2 | Dior/迪奥圣诞星空限量口红套装烟花礼盒蓝金唇膏口红999 772 080 | 1368.00 | 福建 福州 | 209.0 |
3 | Christian Louboutin萝卜丁女王限量口红3支装001/001s/001m 3.5g | 2508.00 | 福建 福州 | 26.0 |
4 | ChristianLouboutin萝卜丁进口女王权杖口红唇膏多色滋润保湿持久 | 788.00 | 浙江 杭州 | 1057.0 |
tes1t=test.sort_values(by='price',ascending=True)
tes1t.head()
info | price | producer | sales | |
---|---|---|---|---|
795 | 正品MAC魅可棒棒糖唇釉唇彩染唇液镜面口红 18新款泫雅色106/108 | 101.00 | 上海 | 3077.0 |
789 | 正品MAC魅可棒棒糖唇釉唇彩染唇液镜面口红 18新款泫雅色106/108 | 101.00 | 上海 | 3077.0 |
347 | 【双旦礼遇季】珂莱欧炫彩丝绒雾面唇釉唇彩持久滋润显色口红新品 | 102.00 | 上海 | 204.0 |
1013 | 魅可MAC圣诞星空限定口红923牛血色316车厘子色646小辣椒大牌正品 | 102.00 | 山东 济南 | 2641.0 |
140 | 第二支10元意大利公主钻石品质魔镜口红ROSEPRETTY贵族限量版 | 1029.90 | 江苏 南京 | 1283.0 |
import seaborn as sns
sns.relplot(x="price", y="sales", data=test,
kind='scatter', # ['scatter','line']
# hue='day', # 设置按颜色分类的第三变量
# style='day', # 设置形状分类
palette='husl',s=60, # 设置调色盘类型和散点大小
aspect=2.5,height=6 # 设置图像大小和横纵比
)
<seaborn.axisgrid.FacetGrid at 0x21e7d38deb8>
import seaborn as sns
sns.relplot(x="sales", y="price", data=test,
kind='scatter', # ['scatter','line']
# hue='day', # 设置按颜色分类的第三变量
# style='day', # 设置形状分类
palette='husl',s=60, # 设置调色盘类型和散点大小
aspect=2.5,height=6 # 设置图像大小和横纵比
)
<seaborn.axisgrid.FacetGrid at 0x21e03a96860>
不同价格区间的商品平均销量分布
test['sales'].describe()
count 1053.000000
mean 1334.559354
std 1910.790570
min 0.000000
25% 136.000000
50% 526.000000
75% 1645.000000
max 9500.000000
Name: sales, dtype: float64
def function(x):
if x<136:
return 1
elif 526>x>136:
return 2
elif 1645>x>526:
return 3
elif x>1645:
return 4
test['label'] = test['sales'].apply(lambda x: function(x))
tips = sns.load_dataset("tips")
ax = sns.stripplot(x="label", y="sales", data=test)
for i in range(1,5):
print('第{}类的商品平均销量为{}'.format(i,int((test['sales'][test['label']==i]).mean())))
第1类的商品平均销量为44
第2类的商品平均销量为306
第3类的商品平均销量为958
第4类的商品平均销量为4027
商品的价格分布情况分析
#这是一个密度图,你看两边都趋近零,但是最高的是0-1000中间,说明主要分布在这里
test['sales'].dropna().plot(kind='kde', xlim=(-2000,7500))
<matplotlib.axes._subplots.AxesSubplot at 0x21e027fc630>
labels = ['1','2','3','4']
sizes=[]
for i in range(1,5):
sizes.append(int((test['sales'][test['label']==i]).count()))
explode = (0,0,0.1,0)
plt.pie(sizes,explode=explode,labels=labels,autopct='%1.1f%%',shadow=False,startangle=150)
plt.title("商品的价格分布")
plt.show()