一、介绍
- 本爬虫功能主要是模拟浏览器操作,获取淘宝ajax数据,爬取一些基本的口红销量与价格。
- 附加功能1:将数据绘制成三种图像(饼状图,柱状图,拟合曲线)。
- 附加功能2:将数据导出为文本和表格。
- 需要手动操作1:第一次爬取数据时需要扫码登录淘宝,之后再次运行就不需要了。
- 需要手动操作2:需要自己下载Edge浏览器的驱动(msedgedriver),下载网址见我之前关于爬虫模拟浏览器登录的文章。
二、代码
import os
from lxml import etree
from selenium import webdriver
import time
import json
from xlwt import *
import numpy as np
from scipy import interpolate
import matplotlib.pyplot as plt
# Selenium Edge driver shared by every function below. It is created when the
# module is loaded, using the driver binary expected next to the script.
bro = webdriver.Edge('./msedgedriver.exe')
# File in which the login cookies are cached between runs (written by
# set_cookies, read by get_cookies).
cookie_path = "./cookies.txt"
# Search URL; the query is the URL-encoded keyword "口红" (lipstick). The main
# script appends the result offset after "s=".
# NOTE(review): "sort=sale-desc" presumably orders results by sales, descending.
url = "https://s.taobao.com/search?q=%E5%8F%A3%E7%BA%A2&sort=sale-desc&s="
# One dict per scraped product, with the keys "名称", "价格" and "销量".
product_list = []
# Number of result pages the main script fetches.
total_page = 5
# Distinct prices, and the summed sales volume for the price at the same index
# (both filled by analysis_data).
product_price = []
product_sales_volume = []
def set_cookies():
    """Save the browser's current cookies to ``cookie_path`` as JSON.

    The cookies are read from the module-level ``bro`` driver, so the browser
    must already be logged in when this is called.

    :return: None
    """
    cookies = bro.get_cookies()
    # Write with the same explicit encoding get_cookies uses to read the file,
    # instead of relying on the platform default.
    with open(cookie_path, 'w', encoding='utf8') as f:
        json.dump(cookies, f)
    print('cookies保存成功')
def get_cookies(url):
    """Load the cookies cached in ``cookie_path`` into the browser and reload.

    The browser must already be on a page of the target site when this is
    called, because the cookies are added to the current session of ``bro``.

    :param url: page to open once the cookies have been added
    :return: None
    """
    with open(cookie_path, 'r', encoding='utf8') as f:
        list_cookies = json.load(f)
    print(len(list_cookies))
    for cookie in list_cookies:
        # NOTE(review): this echoes complete session cookies to stdout.
        print(str(cookie))
        # Cookies whose domain is exactly "s.taobao.com" are not re-added.
        if cookie["domain"] != "s.taobao.com":
            bro.add_cookie(cookie)
    # Short pauses around the reload so the page can settle.
    time.sleep(2)
    bro.get(url)
    time.sleep(3)
def parse_label():
    """Extract name, price and raw sales text of every product on the page.

    Reads the module-level ``tree`` (the parsed page) and appends one dict per
    product to ``product_list`` with the keys "名称", "价格" and "销量". The
    sales entry is stored as the raw list of text nodes; analysis_data turns
    it into a number later.

    :return: None
    """
    div_list = tree.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')
    for div in div_list:
        price_nodes = div.xpath("./div[2]/div[1]/div[1]/strong/text()")
        if not price_nodes:
            # A card without a price node cannot be analysed (the price is
            # needed as a float later); skip it instead of raising IndexError
            # and aborting the whole page.
            continue
        product_list.append({
            "名称": div.xpath("string(./div[2]/div[2]/a)"),
            "价格": price_nodes[0],
            "销量": div.xpath("./div[2]/div[1]/div[2]/text()"),
        })
def get_index_arr(arr, e):
    """Return the position of the first item of ``arr`` equal to ``e``.

    :param arr: iterable to search
    :param e: element to look for
    :return: zero-based index of the first match, or -1 if there is none
    """
    return next((index for index, value in enumerate(arr) if e == value), -1)
def _parse_sales_volume(raw):
    """Convert a scraped sales label such as "1.5万+人收货" to an int."""
    if not raw:
        # No sales text was found for this product.
        return 0
    if isinstance(raw, (list, tuple)):
        raw = raw[0]
    # Drop the "+..." tail and the "人收货" suffix, leaving e.g. "523" or "1.5万".
    text = str(raw).split("+")[0].replace("人收货", "").strip()
    if text.endswith("万"):
        # "万" means ten thousand. Multiply numerically so that a decimal
        # value such as "1.5万" becomes 15000 instead of the invalid "1.50000".
        return round(float(text[:-1]) * 10000)
    return int(text)


def analysis_data():
    """Normalise the scraped products and total the sales volume per price.

    Every entry of ``product_list`` is cleaned in place: whitespace is removed
    from the name and the sales text is replaced by a numeric string.
    ``product_price`` / ``product_sales_volume`` are rebuilt from scratch on
    each call, because the whole ``product_list`` is walked every time and
    keeping the old totals would count earlier products twice.

    :return: None
    :raises ValueError: if a price or sales text is not numeric
    """
    product_price.clear()
    product_sales_volume.clear()
    for product in product_list:
        product["名称"] = product["名称"].replace("\n", "").replace(" ", "")
        volume = _parse_sales_volume(product["销量"])
        # Stored back as a string, the form the export functions receive.
        product["销量"] = str(volume)
        price = float(product["价格"])
        index = get_index_arr(product_price, price)
        if index == -1:
            product_price.append(price)
            product_sales_volume.append(volume)
        else:
            product_sales_volume[index] += volume
        print(product)
def save_txt():
    """Write every product dict to ``./product.txt``, one per line.

    Any previous file is replaced. The file is opened once for the whole
    export; mode "w" truncates existing content, so no separate delete is
    needed. With an empty ``product_list`` an empty file is written.

    :return: None
    """
    with open("./product.txt", "w", encoding='utf-8') as fp:
        fp.writelines(str(product) + "\n" for product in product_list)
def save_excel():
    """Export ``product_list`` to the sheet "product" of ``./product.xlsx``.

    Row 0 holds the column titles; each following row holds one product. The
    title row is written even when there are no products.

    :return: None
    """
    if os.path.exists("./product.xlsx"):
        os.remove("./product.xlsx")
    # The column titles are also the keys of each product dict.
    table_title = ["名称", "价格", "销量"]
    workbook = Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('product')
    for col, title in enumerate(table_title):
        sheet.write(0, col, title)
    for row, product in enumerate(product_list, start=1):
        for col, title in enumerate(table_title):
            sheet.write(row, col, product[title])
    # NOTE(review): Workbook comes from xlwt, which writes the legacy .xls
    # format; confirm that the ".xlsx" extension opens as intended.
    workbook.save('./product.xlsx')
def print_curve():
    """Plot a cubic interpolation of sales volume over price and show it.

    Uses the module-level ``product_price`` / ``product_sales_volume``. The
    plot is skipped when there are fewer than four price points, because a
    cubic interpolation cannot be built from less.

    :return: None
    """
    if len(product_price) < 4:
        print("价格种类不足4个,无法绘制三次插值曲线")
        return
    x = np.array(product_price)
    y = np.array(product_sales_volume)
    # Sample the curve every 0.5 price units between the lowest and highest price.
    x_new = np.arange(x.min(), x.max(), 0.5)
    # Other kinds accepted by interp1d include nearest, zero, slinear and quadratic.
    func = interpolate.interp1d(x, y, kind='cubic')
    plt.plot(x_new, func(x_new))
    plt.show()
def print_histogram():
    """Draw the sales volume per price as green bars and show the figure.

    Prices from ``product_price`` are the bar positions and the matching
    entries of ``product_sales_volume`` the bar heights; each bar is 5 wide.

    :return: None
    """
    plt.bar(product_price, product_sales_volume, 5, color="green")
    plt.xlabel("X-price")
    plt.ylabel("Y-sales_volume")
    plt.title("price--sales_volume")
    plt.show()
def print_pie_chart():
    """Draw the share of sales volume per price as a pie chart and show it.

    Wedges are labelled with the prices from ``product_price`` and sized by
    ``product_sales_volume``; the first three wedges are pulled out slightly.

    :return: None
    """
    plt.figure(1, figsize=(6, 6))
    # Offset of each wedge from the centre: 0.1 for the first three, 0 for the rest.
    offsets = [0.1 if position < 3 else 0 for position in range(len(product_price))]
    palette = ["blue", "red", "coral", "green", "yellow", "orange"]
    plt.pie(
        product_sales_volume,
        explode=offsets,
        colors=palette,
        labels=product_price,
        autopct='%1.1f%%',
        pctdistance=0.8,
        shadow=True,
    )
    plt.title('Pie chart of lipstick prices and sales', bbox={'facecolor': '0.8', 'pad': 5})
    plt.show()
if __name__ == "__main__":
    # Fetch every result page, then analyse, export and plot the collected
    # products. The browser is closed in ``finally`` so that a failure in any
    # step does not leave the Edge process running.
    # NOTE(review): the loop boundary is reconstructed - fetching and parsing
    # run once per page, everything from the analysis on runs once afterwards.
    try:
        for page in range(total_page):
            # The offset grows by 44 per page.
            get_url = url + str(page * 44)
            bro.get(get_url)
            if page == 0:
                if os.path.exists(cookie_path):
                    # Reuse the cached login; the page is reloaded afterwards.
                    get_cookies(get_url)
                else:
                    # First run: leave time for the manual QR-code login,
                    # then cache the resulting cookies.
                    time.sleep(10)
                    set_cookies()
            page_text = bro.page_source
            # ``tree`` must stay a module-level name: parse_label reads it.
            tree = etree.HTML(page_text)
            print("\n\n第" + str(page + 1) + "次爬取网页,分析信息")
            print("开始解析标签")
            parse_label()
        print("开始分析数据")
        analysis_data()
        print("开始存储数据到Excel")
        save_excel()
        print("开始存储数据到txt")
        save_txt()
        print("画曲线")
        print_curve()
        print("画柱状图")
        print_histogram()
        print("画饼状图")
        print_pie_chart()
        print("\n\n\n数据爬完了\n\n\n")
    finally:
        bro.quit()