Scraping Taobao Product Data

import re

import pymongo
import pymysql
import requests
import xlwt
from fake_useragent import UserAgent
from redis import ConnectionPool, StrictRedis

# Create the .xls workbook
workbook = xlwt.Workbook(encoding='utf-8')
sheettitle = ['Title', 'Price', 'Seller', 'Buyers', 'Location', 'Image', 'Detail URL']

'''
Two known limitations so far:
1. Taobao has anti-scraping measures: frequent requests trigger re-verification,
   after which a new cookie has to be obtained.
2. Some items have no "view_sales" (number of buyers), so the regex matches
   nothing for them and the extracted field lists end up with mismatched lengths.
'''
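
# Addition (not in the original script): one common way to soften the first
# limitation is to pause for a random interval between page requests so the
# access pattern is less regular. The delay bounds below are assumptions.
import random
import time


def polite_sleep(low=2.0, high=5.0):
    # Sleep for a random interval; call at the top of each loop in get_data
    time.sleep(random.uniform(low, high))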


class Taobao:
    # Initializer
    def __init__(self):
        # Request headers: pose as a regular browser. The cookie below is tied
        # to a session; replace it with one copied from your own browser.
        self.headers = {
            'user-agent': UserAgent().random,
            'cookie': 'enc=O7JEXyuV57KPiyUGheycM90K1NgZ2ZYpJ4RNw6BL%2BMwlT%2BYAsVkraKhbRGNC3uSkNtHQARfyKRti7rQIUI1e5A%3D%3D; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; _uab_collina=161424111007981574129975; _m_h5_tk=6a3f137662d7fe6b3998f1fc102685a1_1614476040971; _m_h5_tk_enc=5e0eb0ed5c03ee23bd89b3fd84d5eef2; cookie2=1327862b6d9e0b7688c44e475567f4f3; t=507da5cf68f9c19989078df7c8d60777; _tb_token_=ed9e347dee373; xlly_s=1; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _samesite_flag_=true; mt=ci=0_0; tracknick=; cna=PKd5GGvVtT8CAd7VQOCn4Wde; x5sec=7b227365617263686170703b32223a226535303637376564323164353935383064343130653833383538383633613036434a717436344547454d377a6f6447387659624457686f4d4d6a59784d4463314d4463794d7a73784b414977703457436e767a2f2f2f2f2f41513d3d227d; JSESSIONID=070D00A2B8711561FBD59D84BFEAF256; l=eBNx1-GmOqxmdsnEBOfanurza77O7IRYmuPzaNbMiOCPOx1p5elNW6gwvcY9CnGVhstpR3u7hqDaBeYBqIv4n5U62j-laTkmn; isg=BDAwbrssJAkw68fL1CZJSmfpAf6CeRTDfjlIPiqB6Qte5dCP0ovmU2rXPe2F8cyb; tfstk=cSZPBIcWVgIzgumCW0ieNrM9RFmRZ6cn1nkZryQ2eTiDIY3lipRKntOK3vlOm4f..'
        }
        # MySQL configuration and connection
        try:
            mysql_configuration = {
                'host': 'localhost',
                'user': 'admin',
                'password': 'Root110qwe',
                'port': 3306,
                'database': 'Taobaodb',
                'charset': 'utf8',
            }
            self.conn = pymysql.connect(**mysql_configuration)
            self.cursor = self.conn.cursor()
            print('Connected to MySQL')
        except Exception as e:
            print('MySQL connection failed, check the settings and try again:', e)
            return
        # Create the table if it does not exist yet
        try:
            create_sql = '''
                create table taobao(
                    id int primary key auto_increment,
                    search varchar(20),
                    title varchar(255),
                    price float,
                    nick varchar(20),
                    view_sale int,
                    item_loc varchar(20),
                    pic_url text,
                    detail_url text
                ) default CHARSET=utf8;
            '''
            self.cursor.execute(create_sql)
            print('Table created')
        except Exception:
            print('Table already exists, proceeding to insert data')
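
    # Note (an addition, not in the original post): MySQL's 'utf8' charset is
    # the 3-byte variant, so product titles containing emoji or other 4-byte
    # characters would fail to insert; 'utf8mb4' avoids that.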

    def send_request(self, url, params):
        # Send the GET request
        response = requests.get(url, headers=self.headers, params=params)

        # Print the final request URL
        print(response.url)

        # Let requests guess the encoding from the page content
        response.encoding = response.apparent_encoding
        # Return the response body as text
        text = response.text
        return text
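
    # Added sketch (not in the original post): a heuristic for detecting the
    # verification page from limitation 1. The URL substrings are assumptions
    # about where blocked sessions get redirected, not confirmed behavior;
    # it could be called on the response inside send_request before returning.
    def looks_blocked(self, response):
        return 'login.taobao.com' in response.url or 'captcha' in response.url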

    def get_data(self, search, startpage, stoppage):
        num = 6
        number = 0

        for index in range(startpage, stoppage + 1):
            url = 'https://s.taobao.com/search'
            # Build the query parameters for this page
            params = {
                'q': search,
                'bcoffset': num,
                'ntoffset': num,
                'p4ppushleft': '1,48',
                's': number * 44
            }
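            # Assumed semantics (inferred from the constructed URLs rather
            # than any documentation): 's' is the result offset, 44 organic
            # results per page; 'bcoffset' and 'ntoffset' appear to control
            # how sponsored results are interleaved.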
            number += 1
            num -= 3

            text = self.send_request(url, params)

            # Product titles
            titles = re.findall('"raw_title":"(.*?)"', text)

            # Prices
            prices = re.findall('"view_price":"(.*?)"', text)

            # Detail-page URLs
            detail_urls = re.findall('"detail_url":"(.*?)"', text)

            # Seller names (capped at 48, the matches beyond one page of results)
            nicks = re.findall('"nick":"(.*?)"', text)[0:48]

            # Number of buyers ("view_sales"); may be missing for some items
            view_sales = re.findall('"view_sales":"(.*?)"', text)

            # Seller locations
            item_locs = re.findall('"item_loc":"(.*?)"', text)

            # Product image URLs
            pic_urls = re.findall('"pic_url":"(.*?)"', text)

            new_detail_urls = []

            # Normalize the detail URLs: the page escapes '=' and '&' as
            # \u003d and \u0026, and ad links already carry a scheme
            for detail_url in detail_urls:
                if detail_url.find('https://click.simba.taobao.com') != -1:
                    detail_url = detail_url.replace(r'\u003d', '=').replace(r'\u0026', '&')
                else:
                    detail_url = 'https:' + detail_url.replace(r'\u003d', '=').replace(r'\u0026', '&')
                new_detail_urls.append(detail_url)

            self.save_excel(index, search, titles, prices, nicks, view_sales, item_locs, pic_urls, new_detail_urls)
            self.save_mysql(search, titles, prices, nicks, view_sales, item_locs, pic_urls, new_detail_urls)
            self.save_redis(search, titles, prices, nicks, view_sales, item_locs, pic_urls, new_detail_urls)
            self.save_mongodb(search, titles, prices, nicks, view_sales, item_locs, pic_urls, new_detail_urls)

    def save_excel(self, index, search, titles, prices, nicks, view_sales, item_locs, pic_urls, new_detail_urls):
        worksheet = workbook.add_sheet(sheetname='{} page {}'.format(search, index))
        # Header row
        for col, heading in enumerate(sheettitle):
            worksheet.write(0, col, heading)
        # One row per product; zip stops at the shortest list, which sidesteps
        # the IndexError the original column-by-column loop hit when some
        # items lacked "view_sales"
        rows = zip(titles, prices, nicks, view_sales, item_locs,
                   ('https:' + pic_url for pic_url in pic_urls),
                   new_detail_urls)
        for row_num, row in enumerate(rows, start=1):
            for col, value in enumerate(row):
                worksheet.write(row_num, col, value)

        print('Worksheet for page {} saved'.format(index))
        workbook.save('Taobao-products-{}.xls'.format(search))
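
    # Note (an addition, not in the original post): the legacy .xls format
    # written by xlwt is capped at 65536 rows and 256 columns per sheet;
    # larger dumps would need a library such as openpyxl (.xlsx).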

    def save_mysql(self, search, titles, prices, nicks, view_sales, item_locs, pic_urls, new_detail_urls):

        for title, price, nick, view_sale, item_loc, pic_url, new_detail_url in zip(titles, prices, nicks,
                                                                                    view_sales, item_locs, pic_urls,
                                                                                    new_detail_urls):

            # Keep only the digits from the buyers string (e.g. '100+人付款'),
            # guarding against items where the regex matches nothing
            matches = re.findall(r'\d+', view_sale)
            view_sale = matches[0] if matches else '0'
            pic_url = 'https:' + pic_url

            try:
                insert_sql = '''
                    insert into taobao(search, title, price, nick, view_sale, item_loc, pic_url, detail_url) values(%s,%s,%s,%s,%s,%s,%s,%s)
                '''
                data = (search, title, price, nick, view_sale, item_loc, pic_url, new_detail_url)
                # Insert one row
                self.cursor.execute(insert_sql, data)
                # Commit the transaction
                self.conn.commit()
                print('"{}" inserted'.format(title))
            except Exception as e:
                print('Insert failed, rolling back:', e)
                self.conn.rollback()

    def save_redis(self, search, titles, prices, nicks, view_sales, item_locs, pic_urls, new_detail_urls):
        # Use a connection pool to avoid the overhead of repeatedly opening
        # and closing connections; decode_responses belongs on the pool, so
        # that values read back come out as str rather than bytes
        pool = ConnectionPool(host='localhost', port=6379, db=0, password='admin', decode_responses=True)
        # Client backed by the pool
        redis = StrictRedis(connection_pool=pool)

        # Store title -> detail URL pairs in a hash keyed by the search term
        datas = dict(zip(titles, new_detail_urls))

        if datas:
            # hset with mapping= replaces the deprecated hmset
            redis.hset(search, mapping=datas)
            print('Data written to Redis')
        else:
            print('No data to write')
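
    # Reading the hash back (a usage sketch, not in the original post):
    #   pool = ConnectionPool(host='localhost', port=6379, db=0,
    #                         password='admin', decode_responses=True)
    #   redis = StrictRedis(connection_pool=pool)
    #   redis.hgetall(search)  # -> {title: detail_url, ...}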

    def save_mongodb(self, search, titles, prices, nicks, view_sales, item_locs, pic_urls, new_detail_urls):

        lists = []
        for title, price, nick, view_sale, item_loc, pic_url, new_detail_url in zip(titles, prices, nicks,
                                                                                    view_sales, item_locs, pic_urls,
                                                                                    new_detail_urls):
            # Same digit extraction and URL fix-up as in save_mysql
            matches = re.findall(r'\d+', view_sale)
            view_sale = matches[0] if matches else '0'
            pic_url = 'https:' + pic_url

            lists.append({
                'search': search,
                'title': title,
                'price': price,
                'nick': nick,
                'view_sale': view_sale,
                'item_loc': item_loc,
                'pic_url': pic_url,
                'detail_url': new_detail_url,
            })

        # insert_many raises on an empty list, so bail out early
        if not lists:
            print('No data to write')
            return

        try:
            # Connect to the local MongoDB server
            client = pymongo.MongoClient(host='localhost', port=27017)

            # The database and collection are created lazily: they only come
            # into existence once the first document is inserted
            db = client["Taobaodb"]
            collection = db["taobao"]

            collection.insert_many(lists)

            print('Data written to MongoDB')
        except Exception as e:
            print(e)
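
    # Example query (a sketch, not in the original post):
    #   for doc in client["Taobaodb"]["taobao"].find({'search': search}):
    #       print(doc['title'], doc['price'])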


if __name__ == '__main__':
    taobao = Taobao()

    search = input('Product to search for: ')
    startpage = int(input('First page to download: '))
    stoppage = int(input('Last page to download: '))

    taobao.get_data(search, startpage, stoppage)
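
The second limitation comes from scraping each field with a separate findall, so a single item without "view_sales" shifts every later list out of alignment. A more robust approach is to parse the JSON blob the search page embeds and read the fields item by item, so a missing key only affects that one item. The sketch below is an addition: the g_page_config variable name and the mods/itemlist/data/auctions path are assumptions about the page structure, not something the original script relies on.

import json
import re


def parse_items(text):
    # Grab the embedded page-config JSON (assumed variable name)
    match = re.search(r'g_page_config = ({.*?});', text, re.S)
    if not match:
        return []
    config = json.loads(match.group(1))
    items = []
    # Assumed path to the per-item records
    for item in config.get('mods', {}).get('itemlist', {}).get('data', {}).get('auctions', []):
        items.append({
            'title': item.get('raw_title', ''),
            'price': item.get('view_price', ''),
            'nick': item.get('nick', ''),
            'view_sale': item.get('view_sales', ''),  # empty string when missing
            'item_loc': item.get('item_loc', ''),
            'pic_url': item.get('pic_url', ''),
            'detail_url': item.get('detail_url', ''),
        })
    return items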
