E-commerce Site Crawler Case Study (2)

Today's e-commerce site:

https://www.divatress.com/

A US-based wig e-commerce site.

The homepage (screenshot omitted) contains a large number of first-level and second-level menus, which shows that the site's products and categories are unusually rich for this kind of site.

There are 12 first-level categories, well over a hundred second-level categories, and some categories have a third level as well.

Our goal is to scrape all of the site's category and product information.

The site is fairly crawler-friendly, so no proxy is needed, though (from mainland China) you will need a VPN to reach it.

The overall approach:

1. Fetch the categories

# -*- coding: utf-8 -*-
# author: lihaizhen
# date:
# description: done 2020-09-17

import requests
import time
from lxml import etree
from competitor_product.utils import connections

conn = connections.mysql_conn()

class DivaTress(object):
    def __init__(self):
        self.proxies = None
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
            'cache-control': 'max-age=0',
            'cookie': '__cfduid=d7ce5378cc638aa0d1dc8c9556eb2e4991582689415; geoip_processed=1; _gcl_au=1.1.439470082.1582689427; _ga=GA1.2.369600989.1582689427; _fbp=fb.1.1582689427175.2008176297; __zlcmid=wwiwPdQqBI4cdn; __atuvc=1%7C10; frontend=je8s9fsjdcfng9f4uv20brk705; frontend_cid=2v5zla7g5Ri7iWEX; productlist=; googlecategory=; _gid=GA1.2.573730289.1583721678; _dc_gtm_UA-89269615-1=1; _hjid=6b8a213c-e597-49fb-ac6c-db4277787d35',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
        }
        self.host_url = 'https://www.divatress.com/'

    def get_web_id(self):
        # Look up this site's id in the `web` table
        select_sql = """select id from web WHERE url='{}'""".format(self.host_url)
        cur = conn.cursor()
        cur.execute(select_sql)
        row = cur.fetchone()
        if row:
            return row[0]
        return False

    def req_divatress(self, create_time):
        web_id = self.get_web_id()
        response = requests.get(url=self.host_url, headers=self.headers)
        html = etree.HTML(response.text)
        # Locate the top-level entries in the sticky desktop nav bar
        element_list = html.xpath('//*[@id="navbarStickyDesktop"]/div[2]/ul/li[position()<12]')
        data_list = []
        for element in element_list:
            # Level-1 menu; double single quotes so they survive the SQL insert
            menu_1 = element.xpath('./a/text()')[0].replace("'", "''")
            menu_1_url = element.xpath('./a/@href')[0]
            print('Ⅰ  ' + menu_1)
            menu_2_list = element.xpath('./ul/li/div/div')
            if not menu_2_list:
                # No sub-menu: store the level-1 entry on its own
                data_list.append("('{}','{}','{}','{}','{}',{})".format(
                    self.host_url + menu_1_url, menu_1, '', '', create_time, web_id))
                continue
            for m2 in menu_2_list:
                try:
                    menu_2 = m2.xpath('./h5/a/text()')[0].replace('  ', '').replace('\n', '').replace('\r', '').replace('\t', '').replace("'", "''")
                    menu_2_url = m2.xpath('./h5/a/@href')[0]
                    print('Ⅱ ' + menu_2)
                    menu_3_list = m2.xpath('./ul/li')
                    if not menu_3_list:
                        # Leaf at level 2
                        data_list.append("('{}','{}','{}','{}','{}',{})".format(
                            self.host_url + menu_2_url, menu_1, menu_2, '', create_time, web_id))
                        continue
                    for m3 in menu_3_list:
                        menu_3 = m3.xpath('./a/text()')[0].replace("'", "''")
                        menu_3_url = m3.xpath('./a/@href')[0]
                        print('Ⅲ ' + menu_3)
                        data_list.append("('{}','{}','{}','{}','{}',{})".format(
                            self.host_url + menu_3_url, menu_1, menu_2, menu_3, create_time, web_id))
                except Exception:
                    continue
        return data_list

    def save_data(self, data_list):
        # Bulk-insert every menu row in a single statement
        sql = """insert into menu (url,first_menu,second_menu,third_menu,create_time,web_id) VALUES {}""".format(','.join(data_list))
        cur = conn.cursor()
        cur.execute(sql)
        conn.commit()

    def run(self):
        create_time = time.strftime('%Y-%m-%d', time.localtime())
        data = self.req_divatress(create_time)
        if data:
            self.save_data(data)

if __name__ == '__main__':
    d = DivaTress()
    d.run()
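A side note: the connections helper imported above comes from the author's own competitor_product.utils package and is never shown. A minimal, hypothetical stand-in (host, credentials, and database name are placeholders, not the author's values) might look like this:

# competitor_product/utils/connections.py -- hypothetical stand-in;
# hosts, credentials and the database name are placeholders.
import pymysql
import redis

def mysql_conn():
    # A plain pymysql connection, matching the conn.cursor()/conn.commit() usage above
    return pymysql.connect(host='127.0.0.1', user='root', password='secret',
                           database='competitor_product', charset='utf8mb4')

def local_redis(db):
    # A connection pool, matching redis.Redis(connection_pool=...) in the next script;
    # decode_responses=True makes spop() return str instead of bytes
    return redis.ConnectionPool(host='127.0.0.1', port=6379, db=db, decode_responses=True)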

With that, we have all of the site's category data, just as we hoped.
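One step the post doesn't show: the product crawler in step 2 pops "menu_id|menu_url" tasks from a Redis set named divatress_spu_url, so that set has to be seeded from the menu table first. A minimal sketch of that glue, assuming the menu table has an auto-increment id alongside the columns inserted above:

# Hypothetical glue step: queue every category as a "menu_id|menu_url" task
# for the product crawler below to consume.
import redis
from competitor_product.utils import connections

conn = connections.mysql_conn()
r = redis.Redis(connection_pool=connections.local_redis(0))

cur = conn.cursor()
cur.execute("select id, url from menu")
for menu_id, menu_url in cur.fetchall():
    r.sadd('divatress_spu_url', '{}|{}'.format(menu_id, menu_url))
print('queued {} menu tasks'.format(r.scard('divatress_spu_url')))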

2. Fetch the products

# -*- coding: utf-8 -*-
# author: lihaizhen
# date:
# description: done
import sys
sys.path.append("..")

import math
import time

import redis
import requests
from lxml import etree
from competitor_product.utils import connections, get_res, save_data, get_web_id


class DivatressSpu(object):
    def __init__(self):
        self.conn = connections.mysql_conn()
        self.poor = connections.local_redis(0)

        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
        }

        self.cur = self.conn.cursor()
        self.session = requests.Session()

        # Empty by default -- fill in if the site starts blocking direct requests
        self.proxies = {}

    # Proxy-aware fallback fetcher (not used in the direct-request flow below)
    def get_response(self, url, method, **kwgs):
        try:
            if method == 'get':
                res = requests.get(url=url, headers=self.headers, proxies=self.proxies, timeout=30)
                return res.text
            elif method == 'post':
                res = self.session.post(url=url, headers=self.headers, data=kwgs['data'], proxies=self.proxies,
                                        timeout=30)
                return res
        except requests.exceptions.ConnectionError:
            # Proxy failed: switch proxy and retry (keep kwgs so a post retains its data)
            print('proxy failed, switching proxy and retrying')
            return self.get_response(url, method, **kwgs)

    # Paginated fetch via the shared get_res util (alternative helper, also unused below)
    def get_res(self, u, p):
        if p == 1:
            url = u
        else:
            # Append the page number, respecting any existing query string
            url = u + ('&p={}'.format(p) if '?' in u else '?p={}'.format(p))
            print(url)
        response, self.proxies = get_res.get_response(proxies=self.proxies, url=url, headers=self.headers,
                                                      method='get')
        return response

    def get_items_number(self, url):
        # Read the total item count shown at the top of the category page
        res = requests.get(url=url, headers=self.headers).text
        html = etree.HTML(res)
        try:
            items = html.xpath('//*[@id="category-sticky-products"]/div[1]/div/p/text()')[0].split(' ')[0]
        except Exception:
            items = 0
        print('total_items:{}'.format(items))
        return int(items)

    def get_spu_per_page(self, menu_id, menu_url, create_time):
        url = menu_url
        items = self.get_items_number(url)
        # 24 products per page, rounding up: e.g. 100 items -> 5 pages
        total_page = math.ceil(items / 24)
        print('total_page:{}'.format(total_page))
        for i in range(1, total_page + 1):
            print(url)
            res = requests.get(url=url, headers=self.headers).text
            html = etree.HTML(res)
            items_list = html.xpath('//*[@id="category-sticky-products"]/div[2]/div/div[2]')
            for j, item in enumerate(items_list):
                print('page {}({})/{}th({})'.format(i, total_page, j, len(items_list)))
                name = item.xpath('./div[2]/h5/text()')[0].replace("'", "''")
                p_url = 'https://www.divatress.com/' + item.xpath('./a/@href')[0]
                # De-duplicate products by the md5 of their URL
                md5 = get_web_id.get_md5(p_url)
                save_data.save_spu(menu_id, p_url, name, create_time, md5, self.conn, self.cur)
            # Use the current page as referer, then move on to the next page
            self.headers['referer'] = url
            url = menu_url + '?page={}'.format(i + 1)

    def run(self, key):
        create_time = time.strftime('%Y-%m-%d', time.localtime())
        r = redis.Redis(connection_pool=self.poor)
        keyword = "{}_spu_url".format(key)
        # Consume "menu_id|menu_url" tasks from the Redis set until it is empty
        while r.scard(keyword) > 0:
            message = r.spop(keyword)
            if isinstance(message, bytes):
                # redis-py returns bytes unless the pool sets decode_responses=True
                message = message.decode()
            menu_id, menu_url = message.split('|', 1)
            try:
                self.get_spu_per_page(menu_id, menu_url, create_time)
            except Exception:
                # Push the task back so a later run can retry it
                print('rollback')
                r.sadd(keyword, message)

if __name__ == '__main__':
    key = 'divatress'
    d = DivatressSpu()
    d.run(key)

This gives us the product list under each category.

3. Product attributes

Next, from the product list we can fetch all of the SKU information.

Code: omitted.
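Since the code is omitted, here is a rough sketch of what step 3 could look like. The XPath selectors below are hypothetical placeholders, not the author's actual code; the real selectors have to be read off a product detail page:

# Hypothetical sketch of step 3 -- the selectors are placeholders.
import requests
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}

def get_skus(product_url):
    # Fetch the product detail page and pull out each variant's name and price
    html = etree.HTML(requests.get(product_url, headers=headers, timeout=30).text)
    skus = []
    for option in html.xpath('//select[contains(@class, "product-option")]/option'):  # placeholder XPath
        spec = option.xpath('./text()')[0].strip()
        price = option.xpath('./@data-price')[0]  # placeholder attribute
        skus.append((spec, price))
    return skus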

The result is price information for each of a product's variants.
