今日电商网站:
https://www.divatress.com/
一家美国假发电商网站
主页如图:
首页包含大量的一级菜单和二级菜单
说明该网站的产品和分类非常的丰富,同类网站中属于少见的优秀
其中一级类目有12个
二级类目更是多达上百个
部分还有三级类目
我们的目的是获取该网站的所有类目信息和商品信息;
该网站相对来说比较友善,因此可以不用代理,但是需要翻墙
整体思路是:
1、获取分类
# -*-coding:utf-8
# author:lihaizhen
# date:
# description:done 2020-09-17
import pymysql
import requests
import time
from lxml import etree
from competitor_product.utils import connections
# Module-level shared handles from the project's connection helpers:
# one MySQL connection and one local Redis connection pool (db 0).
# NOTE(review): "poor" is presumably a typo for "pool" — verify before renaming.
conn = connections.mysql_conn()
poor = connections.local_redis(0)
class Diva_tress(object):
    """Scrape the three-level category (menu) tree of divatress.com into MySQL.

    ``req_shes_happy_hair`` walks the desktop navigation bar of the home page
    and builds one pre-formatted SQL VALUES tuple per deepest reachable menu
    entry; ``save_data`` bulk-inserts those rows into the ``menu`` table.
    """

    def __init__(self):
        # The site is crawl-friendly, so no proxy is configured.
        self.proxies = None
        # Browser-like headers (including a recorded cookie) so the server
        # returns the same markup a desktop browser would receive.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
            'cache-control': 'max-age=0',
            'cookie': '__cfduid=d7ce5378cc638aa0d1dc8c9556eb2e4991582689415; geoip_processed=1; _gcl_au=1.1.439470082.1582689427; _ga=GA1.2.369600989.1582689427; _fbp=fb.1.1582689427175.2008176297; __zlcmid=wwiwPdQqBI4cdn; __atuvc=1%7C10; frontend=je8s9fsjdcfng9f4uv20brk705; frontend_cid=2v5zla7g5Ri7iWEX; productlist=; googlecategory=; _gid=GA1.2.573730289.1583721678; _dc_gtm_UA-89269615-1=1; _hjid=6b8a213c-e597-49fb-ac6c-db4277787d35',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
        }
        self.host_url = 'https://www.divatress.com/'

    def get_web_id(self):
        """Return ``web.id`` for this site's URL, or ``False`` when not found.

        The query is parameterized (pymysql ``%s`` placeholder) instead of
        string-interpolated, which removes the SQL-injection / quoting hazard
        of the original ``format()``-built statement.
        """
        cur = conn.cursor()
        cur.execute("select id from web WHERE url=%s", (self.host_url,))
        row = cur.fetchone()  # renamed from ``id`` to avoid shadowing the builtin
        if row:
            return row[0]
        return False

    def req_shes_happy_hair(self, create_time):
        """Download the home page and collect every menu entry (3 levels deep).

        Args:
            create_time: 'YYYY-MM-DD' date stamp stored with every row.

        Returns:
            list[str]: pre-formatted ``(...)`` VALUES tuples consumed verbatim
            by :meth:`save_data`.
        """
        web_id = self.get_web_id()
        response = requests.get(url=self.host_url, headers=self.headers)
        html = etree.HTML(response.text)
        # First-level menu items live in the sticky desktop navbar.
        element_list = html.xpath('//*[@id="navbarStickyDesktop"]/div[2]/ul/li[position()<12]')
        data_list = []
        for element in element_list:
            # Single quotes are doubled because save_data builds the SQL by hand.
            menu_1 = element.xpath('./a/text()')[0].replace("'", "''")
            menu_1_url = element.xpath('./a/@href')[0]
            print('Ⅰ ' + menu_1)
            menu_2_list = element.xpath('./ul/li/div/div')
            if not menu_2_list:
                # Leaf at level 1: store it with empty level-2/3 names.
                data_list.append("('{}','{}','{}','{}','{}',{})".format(
                    self.host_url + menu_1_url, menu_1, '', '', create_time, web_id))
                continue
            for m2 in menu_2_list:
                try:
                    # Strip whitespace/newlines, then double quotes for SQL.
                    menu_2 = m2.xpath('./h5/a/text()')[0].replace(' ', '').replace('\n', '').replace('\r', '').replace('\t', '').replace("'", "''")
                    menu_2_url = m2.xpath('./h5/a/@href')[0]
                    print('Ⅱ ' + menu_2)
                    menu_3_list = m2.xpath('./ul/li')
                    if not menu_3_list:
                        # Leaf at level 2: store it with an empty level-3 name.
                        data_list.append("('{}','{}','{}','{}','{}',{})".format(
                            self.host_url + menu_2_url, menu_1, menu_2, '', create_time, web_id))
                        continue
                    for m3 in menu_3_list:
                        menu_3 = m3.xpath('./a/text()')[0].replace("'", "''")
                        menu_3_url = m3.xpath('./a/@href')[0]
                        print('Ⅲ ' + menu_3)
                        data_list.append("('{}','{}','{}','{}','{}',{})".format(
                            self.host_url + menu_3_url, menu_1, menu_2, menu_3, create_time, web_id))
                except Exception as e:
                    # A column without the expected <h5><a> markup; log and skip
                    # instead of swallowing the error silently.
                    print('skip level-2 block:', e)
                    continue
        return data_list

    def save_data(self, data_list):
        """Bulk-insert the collected menu rows into the ``menu`` table.

        NOTE(review): the VALUES clause is string-built by the caller; the
        quote-doubling above mitigates injection, but a parameterized
        ``executemany`` would be safer if the row format can be changed.
        """
        sql = """insert into menu (url,first_menu,second_menu,third_menu,create_time,web_id) VALUES {}""".format(','.join(data_list))
        print(sql)
        cur = conn.cursor()
        cur.execute(sql)
        conn.commit()

    def run(self):
        """Fetch the full category tree and persist it, stamped with today's date."""
        create_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        data = self.req_shes_happy_hair(create_time)
        self.save_data(data)
if __name__ == '__main__':
    # Entry point: scrape and store the complete category tree.
    d = Diva_tress()
    d.run()
我们如愿地得到了该网站的所有类目数据
2、获取商品
# author:lihaizhen
# date:
# description:done
import sys
import redis
import requests
sys.path.append("..")
import time
from lxml import etree
from competitor_product.utils import connections,get_res,save_data,get_web_id
class Divatress_spu(object):
    """Crawl product (SPU) listing pages for each divatress.com category.

    Category URLs are consumed from a Redis set named ``<key>_spu_url`` whose
    members are ``"menu_id|menu_url"`` strings; every product found is saved
    through the project's ``save_data.save_spu`` helper.
    """

    def __init__(self):
        self.conn = connections.mysql_conn()
        self.poor = connections.local_redis(0)  # Redis connection pool, db 0
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
        }
        self.cur = self.conn.cursor()
        self.session = requests.Session()
        self.proxies = {
        }
        self.spu_queue = None

    def get_response(self, url, method, **kwgs):
        """GET (text) or POST (Response) *url*; retry recursively on connection errors.

        NOTE(review): the retry has no depth limit — a permanently dead proxy
        would recurse until RecursionError. Confirm whether a cap is wanted.
        """
        try:
            if method == 'get':
                res = requests.get(url=url, headers=self.headers, proxies=self.proxies, timeout=30)
                return res.text
            elif method == 'post':
                res = self.session.post(url=url, headers=self.headers, data=kwgs['data'], proxies=self.proxies,
                                        timeout=30)
                return res
        except requests.exceptions.ConnectionError:
            print('代理失效,切換代理重試')
            return self.get_response(url, method)

    def get_res(self, u, p):
        """Fetch page *p* of listing URL *u* via the shared get_res helper.

        Page 1 uses the bare URL; later pages append ``p=<n>`` with '&' or '?'
        depending on whether *u* already carries a query string.
        """
        if p == 1:
            response, self.proxies = get_res.get_response(proxies=self.proxies, url=u, headers=self.headers,
                                                          method='get')
        else:
            sep = '&' if '?' in u else '?'
            url = u + '{}p={}'.format(sep, p)
            print(url)
            response, self.proxies = get_res.get_response(proxies=self.proxies, url=url, headers=self.headers,
                                                          method='get')
        return response

    def get_items_number(self, url):
        """Return the total product count shown on a category page (0 on failure)."""
        res = requests.get(url=url, headers=self.headers).text
        html = etree.HTML(res)
        try:
            # "<count> Items" text in the sticky product header.
            items = html.xpath('//*[@id="category-sticky-products"]/div[1]/div/p/text()')[0].split(' ')[0]
        except Exception:
            # Best-effort: a missing/changed counter element just means 0 items.
            items = 0
        print('total_items:{}'.format(items))
        return int(items)

    def get_spu_per_page(self, menu_id, menu_url, create_time):
        """Walk every listing page of one category and persist each product.

        Args:
            menu_id: menu-table row id the products belong to.
            menu_url: category listing URL.
            create_time: 'YYYY-MM-DD' stamp stored with each product.
        """
        url = menu_url
        items = self.get_items_number(url)
        # Ceiling division, 24 products per page. The original float check
        # ("'.' in str(items / 24)") always fired under Python 3 and requested
        # one extra empty page whenever items was a multiple of 24.
        total_page = (items + 23) // 24
        print('total_page:{}'.format(total_page))
        for i in range(1, total_page + 1):
            print(url)
            res = requests.get(url=url, headers=self.headers).text
            html = etree.HTML(res)
            items_list = html.xpath('//*[@id="category-sticky-products"]/div[2]/div/div[2]')
            for j, item in enumerate(items_list):
                print('page {}({})/{}th({})'.format(i, total_page, j, len(items_list)))
                # Doubled quotes for the hand-built SQL in save_spu.
                name = item.xpath('./div[2]/h5/text()')[0].replace("'", "''")
                p_url = 'https://www.divatress.com/' + item.xpath('./a/@href')[0]
                md5 = get_web_id.get_md5(p_url)
                save_data.save_spu(menu_id, p_url, name, create_time, md5, self.conn, self.cur)
            self.headers['referer'] = url
            # Use '&' when the category URL already has a query string — the
            # original always appended '?page=', producing a malformed URL then.
            sep = '&' if '?' in menu_url else '?'
            url = menu_url + '{}page={}'.format(sep, i + 1)

    def run(self, key):
        """Drain the Redis set '<key>_spu_url'; re-queue an entry if it fails."""
        create_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        r = redis.Redis(connection_pool=self.poor)
        keyword = "{}_spu_url".format(key)
        while r.scard(keyword) > 0:
            message = r.spop(keyword)
            # redis-py returns bytes unless decode_responses=True; normalize so
            # the '|' split below works either way.
            text = message.decode() if isinstance(message, bytes) else message
            msg = text.split('|')
            menu_id = msg[0]
            menu_url = msg[1]
            try:
                self.get_spu_per_page(menu_id, menu_url, create_time)
            except Exception as e:
                # Log the cause (previously swallowed) and put the URL back so
                # another pass can retry it.
                print('rollback', e)
                r.sadd(keyword, message)
if __name__ == '__main__':
    # Entry point: consume the 'divatress_spu_url' Redis queue filled by the
    # category scraper and store every product found.
    key = 'divatress'
    d = Divatress_spu()
    d.run(key)
这里我们可以得到每个类目下的产品列表
3、商品属性
下一步根据产品列表可以获取所有sku信息
代码:略
我们得到了产品不同规格的价格信息