python爬虫 -- 模拟浏览器获取淘宝商品数据

墨_风

已于 2023-09-13 15:25:18 修改

阅读量1k

点赞数 3

分类专栏：学习笔记　文章标签：python、爬虫、开发语言

于 2022-02-13 15:33:37 首次发布

本文链接：https://blog.csdn.net/mocoll/article/details/122909596

版权

学习笔记同时被 2 个专栏收录

207 篇文章 1 订阅

订阅专栏

编程语言

98 篇文章 1 订阅

订阅专栏

本文介绍了如何使用Python爬虫模拟浏览器获取淘宝口红的销量和价格数据，并展示了数据的饼图、柱状图和拟合曲线，同时提供了数据导出为文本和表格的功能。首次爬取需扫码登录，后续自动使用已存储的cookie。

摘要由CSDN通过智能技术生成

一、介绍

本爬虫功能主要是模拟浏览器操作，获取淘宝ajax数据，爬取一些基本的口红销量与价格。
附加功能1：将数据绘制成三种图像（饼状图，柱状图，拟合曲线）。
附加功能2：将数据导出为文本和表格。
需要手动操作1：第一次爬取数据时需要扫码登录一下淘宝，之后就不用了。
需要手动操作2：需要自己下载 Edge 浏览器的驱动（msedgedriver），下载网址见我之前那篇关于爬虫模拟浏览器登录的文章。

二、代码

import os
from lxml import etree
from selenium import webdriver
import time
import json
from xlwt import *

import numpy as np
# 实现插值的模块
from scipy import interpolate
# 画图的模块
import matplotlib.pyplot as plt
# ---- Crawl settings ----
# Local file in which the logged-in session cookies are stored.
cookie_path = './cookies.txt'
# Search URL (query "口红", sorted by sales); the result offset is appended
# after the trailing "s=".
url = 'https://s.taobao.com/search?q=%E5%8F%A3%E7%BA%A2&sort=sale-desc&s='
# Number of result pages to crawl.
total_page = 5

# ---- Shared state filled in while the script runs ----
# One dict per scraped product.
product_list = []
# Distinct prices, used for plotting.
product_price = []
# Summed sales per price, index-aligned with product_price.
product_sales_volume = []

# Edge WebDriver, created from the driver executable at the given path.
bro = webdriver.Edge('./msedgedriver.exe')


def set_cookies():
    """
    Save the browser's current cookies to the local cookie file.

    Reads every cookie from the module-level driver ``bro``, serializes the
    list as JSON and writes it to ``cookie_path``.

    :return: None
    """
    session_cookies = bro.get_cookies()  # list of cookie dicts
    # Use the same explicit encoding that the reader (get_cookies) uses,
    # rather than the platform default.
    with open(cookie_path, 'w', encoding='utf8') as f:
        json.dump(session_cookies, f)
    print('cookies保存成功')


def get_cookies(url):
    """
    Load cookies from the local cookie file into the browser and open a page.

    Every stored cookie whose domain is not exactly ``"s.taobao.com"`` is
    added to the module-level driver ``bro``; afterwards ``url`` is loaded so
    the request is made with those cookies.

    :param url: page to load after the cookies have been added
    :return: None
    """
    # Read the whole file up front so the handle is closed before the
    # sleeps and browser round-trips below.
    with open(cookie_path, 'r', encoding='utf8') as f:
        list_cookies = json.load(f)
    print(len(list_cookies))
    for cookie in list_cookies:
        print(str(cookie))
        # .get() tolerates a cookie entry that has no "domain" key.
        if cookie.get("domain") != "s.taobao.com":
            bro.add_cookie(cookie)
    # Brief pauses around the reload, kept from the original timing.
    time.sleep(2)
    bro.get(url)
    time.sleep(3)


def parse_label():
    """
    Extract name, price and raw sales text for every product box on the page.

    Reads the module-level parsed document ``tree`` and appends one dict with
    the keys "名称", "价格" and "销量" to the module-level ``product_list``
    for each box. "销量" is stored as the raw xpath result (a list, possibly
    empty). Boxes without a price node are skipped.

    :return: None
    """
    # Locate every box that holds one product's information.
    item_boxes = tree.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')
    for div in item_boxes:
        # string(...) yields the concatenated text of the node and its
        # descendants as a single string.
        name = div.xpath("string(./div[2]/div[2]/a)")
        # text() returns a list of the node's direct text children.
        price_nodes = div.xpath("./div[2]/div[1]/div[1]/strong/text()")
        if not price_nodes:
            # Indexing [0] on an empty result would raise IndexError and
            # abort the whole crawl; a box without a price is skipped.
            continue
        sales_volume = div.xpath("./div[2]/div[1]/div[2]/text()")
        product_list.append(
            {"名称": name, "价格": price_nodes[0], "销量": sales_volume}
        )


def get_index_arr(arr, e):
    """
    Find the position of the first element of ``arr`` equal to ``e``.

    :param arr: sequence to search
    :param e: element to look for (compared with ``==``)
    :return: index of the first match, or -1 when there is none
    """
    return next((pos for pos, item in enumerate(arr) if e == item), -1)


def _parse_sales_volume(raw):
    """Convert a raw sales value (xpath list or string) to an integer count."""
    import re  # local import: the module's import block does not provide re

    # The scraper stores the xpath result, i.e. a list that may be empty.
    if isinstance(raw, (list, tuple)):
        raw = raw[0] if raw else ""
    # Leading number (optionally decimal) followed by an optional "万" unit.
    match = re.search(r"(\d+(?:\.\d+)?)(万)?", str(raw))
    if match is None:
        return 0
    number = float(match.group(1))
    if match.group(2):
        number *= 10000  # "万" means ten thousand
    # round() absorbs float noise such as 2.3 * 10000 -> 22999.999...
    return int(round(number))


def analysis_data():
    """
    Clean the scraped products and aggregate sales per price.

    For every dict in the module-level ``product_list``:

    * newlines and spaces are removed from "名称";
    * "销量" is replaced by the sales count as a decimal string, e.g.
      ``['6万+人下单']`` -> ``"60000"``, ``['1.5万+人付款']`` -> ``"15000"``,
      ``['43人收货']`` -> ``"43"``, ``[]`` -> ``"0"``;
    * the count is added to the entry of ``product_sales_volume`` that
      corresponds to the product's price in ``product_price`` (a new entry
      is appended for a price not seen before).

    Each cleaned product is printed.

    :return: None
    :raises ValueError: if a product's "价格" cannot be converted to float
    """
    # Price -> index in product_price, so each lookup is O(1) instead of a
    # linear scan. setdefault keeps the first index if a price repeats.
    price_slots = {}
    for slot, known_price in enumerate(product_price):
        price_slots.setdefault(known_price, slot)

    for product in product_list:
        product["名称"] = product["名称"].replace("\n", "").replace(" ", "")

        volume = _parse_sales_volume(product["销量"])
        # Kept as a string, matching the type downstream exporters receive.
        product["销量"] = str(volume)

        price = float(product["价格"])
        slot = price_slots.get(price)
        if slot is None:
            price_slots[price] = len(product_price)
            product_price.append(price)
            product_sales_volume.append(volume)
        else:
            product_sales_volume[slot] += volume
        print(product)


def save_txt():
    """
    Write every product to ``./product.txt``, one ``str(dict)`` per line.

    Any previous file is replaced. The file is opened once in write mode
    (which truncates existing content) instead of being deleted and then
    reopened in append mode for every product. With an empty
    ``product_list`` an empty file is produced.

    :return: None
    """
    with open("./product.txt", "w", encoding='utf-8') as fp:
        fp.writelines(str(product) + "\n" for product in product_list)


def save_excel():
    """
    Write every product to a single-sheet workbook at ``./product.xlsx``.

    Row 0 holds the column titles "名称", "价格", "销量"; each following row
    holds one entry of the module-level ``product_list``. The title row is
    written even when there are no products, so the sheet always has a
    header.

    :return: None
    """
    # NOTE(review): the workbook is built with xlwt, which is documented as
    # producing the legacy .xls format, while the file is named .xlsx.
    # Confirm the intended format before relying on the extension.
    if os.path.exists("./product.xlsx"):
        os.remove("./product.xlsx")
    table_title = ["名称", "价格", "销量"]
    file = Workbook(encoding='utf-8')
    table = file.add_sheet('product')

    for col, title in enumerate(table_title):
        table.write(0, col, title)

    # Data rows start at 1 because row 0 is the header.
    for row, product in enumerate(product_list, start=1):
        for col, title in enumerate(table_title):
            table.write(row, col, product[title])
    file.save('./product.xlsx')


def print_curve():
    """
    Plot an interpolated price/sales curve.

    Uses the module-level ``product_price`` as x values and
    ``product_sales_volume`` as y values, interpolates y on a grid from the
    smallest to the largest price with step 0.5, and shows the curve.

    A cubic interpolation needs at least four points. With three or two
    points a quadratic or linear interpolation is used instead, and with
    fewer than two points nothing is drawn.

    :return: None
    """
    # x axis: prices; y axis: summed sales for each price.
    x = np.array(product_price)
    y = np.array(product_sales_volume)

    if x.size < 2:
        # min()/max() of an empty array and interpolation through a single
        # point both fail, so there is no curve to draw.
        print("数据点不足，无法绘制曲线")
        return

    # Highest interpolation order the number of points supports.
    kind = {2: 'linear', 3: 'quadratic'}.get(x.size, 'cubic')

    # Evaluation grid: from min price up to (not including) max price.
    x_new = np.arange(min(x), max(x), 0.5)
    func = interpolate.interp1d(x, y, kind=kind)
    # One interpolated y value per grid point.
    y_new = func(x_new)

    plt.plot(x_new, y_new)
    plt.show()


def print_histogram():
    """
    Show a bar chart of sales volume per price.

    Bars are positioned at the values in the module-level ``product_price``
    with heights from ``product_sales_volume``; each bar is 5 units wide and
    green. Axis labels and title are in English.

    :return: None
    """
    plt.bar(product_price, product_sales_volume, width=5, color="green")
    # Chart title and axis captions.
    plt.title("price--sales_volume")
    plt.xlabel("X-price")
    plt.ylabel("Y-sales_volume")
    plt.show()


def print_pie_chart():
    """
    Show a pie chart of sales volume share per price.

    Wedge sizes come from the module-level ``product_sales_volume`` and are
    labelled with the matching entries of ``product_price``. The first three
    wedges (in list order) are pulled 0.1 away from the centre.

    :return: None
    """
    # Square figure so the pie is drawn as a circle.
    plt.figure(1, figsize=(6, 6))
    # Offset from the centre for each wedge: 0.1 for the first three, else 0.
    explode_offsets = [0.1 if slot < 3 else 0 for slot in range(len(product_price))]
    # Wedge colours; reused cyclically when there are more wedges than colours.
    palette = ["blue", "red", "coral", "green", "yellow", "orange"]
    plt.pie(
        product_sales_volume,
        explode=explode_offsets,
        colors=palette,
        labels=product_price,
        autopct='%1.1f%%',  # percentage text format inside each wedge
        pctdistance=0.8,
        shadow=True,
    )
    plt.title('Pie chart of lipstick prices and sales', bbox={'facecolor': '0.8', 'pad': 5})
    plt.show()


if __name__ == "__main__":
    # try/finally guarantees the browser is closed even when crawling,
    # parsing, exporting or plotting raises.
    try:
        # Crawl total_page result pages.
        for page in range(0, total_page):
            # Each page is requested by appending an offset of page * 44
            # to the search URL (paging rule noted by the original author).
            get_url = url + str(page * 44)
            bro.get(get_url)

            # Identity has to be established on the first page only:
            # reuse stored cookies if present, otherwise wait for a manual
            # QR-code login and then store the cookies.
            if page == 0:
                if os.path.exists(cookie_path):
                    get_cookies(get_url)
                else:
                    # Pause 10 seconds so the user can scan the login code.
                    time.sleep(10)
                    set_cookies()

            # Parse the rendered page so elements can be located by xpath.
            # `tree` is a module-level name read by parse_label().
            page_text = bro.page_source
            tree = etree.HTML(page_text)

            print("\n\n第" + str(page + 1) + "次爬取网页，分析信息")

            print("开始解析标签")
            parse_label()

        # All pages have been crawled; process the collected data.
        print("开始分析数据")
        analysis_data()

        # Export.
        print("开始存储数据到Excel")
        save_excel()

        print("开始存储数据到txt")
        save_txt()

        # Charts.
        print("画曲线")
        print_curve()

        print("画柱状图")
        print_histogram()

        print("画饼状图")
        print_pie_chart()

        print("\n\n\n数据爬完了\n\n\n")
    finally:
        bro.quit()