从今天开始,将陆陆续续把最近做的二十多家电商平台的爬虫分批进行发布
今天的电商网站:
https://www.alipearlhair.com/
一家面向全球的国外假发网站
主页如图
可以看到,这家网站包含了7个一级导航菜单,放上鼠标还会发现多个2级3级菜单
我们第一步要做的就是获取所有菜单的地址和名称,为下一步获取菜单内部商品做准备;
如果网站较多,为方便调度,最好将每个网站单独入库,以备调用
这里用到了代理,因为是经常跑,如果只是测试,不需要
我们使用准备好的4张表来存储数据,分别是web、menu、spu、sku表对应网站、导航、商品、规格
# -*-coding:utf-8
# author:lihaizhen
# date:
# description:
import requests
import time
from lxml import etree
from competitor_product.utils import connections
conn = connections.mysql_conn()
poor = connections.local_redis(0)
cur = conn.cursor()
class Alipearlhair_menu(object):
    """Scrape the 1st/2nd/3rd-level navigation menus of alipearlhair.com and
    persist one row per menu leaf into the `menu` table, linked to the `web`
    table via web_id."""

    def __init__(self):
        # proxies are only needed for long-running scheduled runs; None for ad-hoc tests
        self.proxies = None
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
            'cache-control': 'max-age=0',
            'cookie': '__cfduid=d75c9a68fe4ad215c631b9d6c64bb11661600390054; PHPSESSID=geh7bi0rgestoo7t89q28agvi0; _ga=GA1.2.1243901573.1600390060; _gid=GA1.2.110095319.1600390060; cc80df2044f9acef895f69c126d69935=6ISXnXrwlGg%3DjsMlt260T0I%3DiPBdqLCIy70%3DQLp6eKzzHCc%3DKWUMRmCFrOk%3DT%2FmtcKAqrGQ%3DkWTgn%2Fz7x3Y%3Dj0r%2BtEVWJpI%3DBDrvzaV1NaY%3DQhlTCl5Bcdw%3DjeEaLgyY7j4%3D%2BqSRgmPtYKs%3DNjdlU7YUnuQ%3DMPJPGY8FpVE%3Dc%2Fg0GyZy7Fk%3D; _gat=1; __atuvc=11%7C38; __atuvs=5f6403ab798e96ec00a',
            'referer': 'https://www.evawigs.com',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
        }
        self.host_url = 'https://www.alipearlhair.com/'

    def get_web_id(self):
        """Return the `web` table id registered for self.host_url, or False
        when the site has not been inserted yet."""
        c = conn.cursor()
        # parameterized query instead of str.format (avoids SQL injection / quoting bugs)
        c.execute("select id from web WHERE url=%s", (self.host_url,))
        row = c.fetchone()  # `row`, not `id`: don't shadow the builtin
        return row[0] if row else False

    def req_alipearlhair(self, create_time):
        """Download the home page and walk the navigation tree.

        Returns a list of SQL VALUES fragments of the form
        ('url','menu1','menu2','menu3','create_time',web_id); single quotes in
        names are doubled for SQL-literal safety."""
        web_id = self.get_web_id()
        response = requests.get(url=self.host_url, headers=self.headers)
        html = etree.HTML(response.text)
        # the 7 top-level navigation <li> items (skip the first, stop before the 9th)
        element_list = html.xpath('//*[@id="bs-example-navbar-collapse-1"]/ul/li[position()>1 and position()<9]')
        data_list = []
        for element in element_list:
            menu_1 = element.xpath('./a/text()')[0].replace("'", "''").strip().replace("\n", "")
            menu_1_url = self.host_url + element.xpath('./a/@href')[0]
            print('Ⅰ '+menu_1)
            menu_2_list = element.xpath('./div/div/dl')
            if not menu_2_list:
                # leaf at level 1: store with empty level-2/3 columns
                data_list.append("('{}','{}','{}','{}','{}',{})".format(menu_1_url, menu_1, '', '', create_time, web_id))
                continue
            for m2 in menu_2_list:
                try:
                    menu_2 = m2.xpath('./dt/a/text()')[0].replace('\n', '').replace('\r', '').replace('\t', '').replace("'", "''").strip()
                    menu_2_url = m2.xpath('./dt/a/@href')[0]
                    print('Ⅱ ' + menu_2)
                    menu_3_list = m2.xpath('./dd')
                    if not menu_3_list:
                        # leaf at level 2
                        data_list.append("('{}','{}','{}','{}','{}',{})".format(menu_2_url, menu_1, menu_2, '', create_time, web_id))
                        continue
                    for m3 in menu_3_list:
                        menu_3 = m3.xpath('./a/text()')[0].replace("'", "''").replace("\n", "").replace(" ", "")
                        menu_3_url = m3.xpath('./a/@href')[0]
                        print('Ⅲ '+menu_3)
                        data_list.append("('{}','{}','{}','{}','{}',{})".format(menu_3_url, menu_1, menu_2, menu_3, create_time, web_id))
                except Exception as e:
                    # best-effort: skip a malformed entry, but don't hide the error completely
                    print('menu parse error:', e)
                    continue
        return data_list

    def save_data(self, data_list):
        """Bulk-insert the collected menu rows and commit.

        No-op on an empty list (the previous version emitted a malformed
        INSERT and crashed in that case).
        NOTE(review): values are interpolated into the SQL text; the fields are
        quote-escaped upstream, but a parameterized executemany would be safer."""
        if not data_list:
            return
        sql = """insert into menu (url,first_menu,second_menu,third_menu,create_time,web_id) VALUES {}""".format(','.join(data_list))
        print(sql)
        cur.execute(sql)
        conn.commit()

    def run(self):
        """Entry point: scrape today's menu snapshot and store it."""
        create_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        self.save_data(self.req_alipearlhair(create_time))
if __name__ == '__main__':
    # take today's snapshot of the navigation tree
    crawler = Alipearlhair_menu()
    crawler.run()
这里我们得到了以下数据
第二步:使用第一步得到的结果进行调度,分别进入每个导航页面获取商品列表(即 spu)
# author:lihaizhen
# date:
# description:done
import time
import redis
import requests
from lxml import etree
from competitor_product.utils import connections,save_data,get_web_id
class Alipearlhair_Spu(object):
    """Drain the '<key>_spu_url' redis set produced by the menu scraper and,
    for every menu URL, page through its product listing saving each product
    (spu) via save_data.save_spu."""

    def __init__(self):
        self.conn = connections.mysql_conn()
        self.poor = connections.local_redis(0)  # redis pool; member name kept for compatibility
        self.cur = self.conn.cursor()
        self.session = requests.Session()
        self.host = 'https://www.alipearlhair.com/'
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
            'cookie': '_y=a8931b19-e949-4ba8-9582-0aa4a94ea7f2; secure_customer_sig=; _shopify_y=a8931b19-e949-4ba8-9582-0aa4a94ea7f2; _shopify_fs=2020-09-22T03%3A09%3A25.905Z; _ga=GA1.2.52149075.1600744166; _hjid=6b8a213c-e597-49fb-ac6c-db4277787d35; _gcl_au=1.1.331237812.1609924414; _fbp=fb.1.1609924439842.734923350; _pin_unauth=dWlkPU1qVmxPR001WWpNdE5qWmlNQzAwWVdRMUxXSmxObVl0WmpCaU5UZ3paVEUxWkdZeg; _orig_referrer=; _landing_page=%2F%2Fcollections%2Fwigs-at-the-original-price; lkvw_20=www.hairvivi.com//collections/wigs-at-the-original-price; lkvw_02=v5; _hjTLDTest=1; _hjAbsoluteSessionInProgress=1; _sp_ses.8c34=*; KL_FORMS_MODAL={%22disabledForms%22:{%22TTXVBG%22:{%22lastCloseTime%22:1615513536%2C%22successActionTypes%22:[]}}%2C%22viewedForms%22:{%22TTXVBG%22:2199169}}; _gid=GA1.2.1243998432.1615513536; _ps_session=T1xCWrJH9J4M-DtMy8YLe; _g1597052385=VVNE; _s=83981dac-59fe-43ff-ac2e-7980bef490e3; _shopify_s=83981dac-59fe-43ff-ac2e-7980bef490e3; _shopify_sa_p=; epb_previous_pathname=//collections/wig-with-bangs; __kla_id=eyIkcmVmZXJyZXIiOnsidHMiOjE2MDA3NDQxNjksInZhbHVlIjoiIiwiZmlyc3RfcGFnZSI6Imh0dHBzOi8vd3d3LmhhaXJ2aXZpLmNvbS8ifSwiJGxhc3RfcmVmZXJyZXIiOnsidHMiOjE2MTU1MTgxMzEsInZhbHVlIjoiIiwiZmlyc3RfcGFnZSI6Imh0dHBzOi8vd3d3LmhhaXJ2aXZpLmNvbS8vY29sbGVjdGlvbnMvd2lnLXdpdGgtYmFuZ3M/cD0wIn19; _shopify_sa_t=2021-03-12T03%3A02%3A10.574Z; _sp_id.8c34=0bfda82db3ce3969.1600744168.12.1615518141.1614235782',
            'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
        }

    def get_spu_per_page(self, menu_id, menu_url, create_time):
        """Page through one menu's listing (?___store=en&p=N) and save every
        product found. Stops when a page yields no new products or fewer than
        12 tiles (apparently the last page)."""
        for page in range(1, 100):
            page_url = menu_url + '?___store=en&p={}'.format(page)
            print('{}-{}'.format(page, page_url))
            # use the shared Session (previously created but unused) so the
            # TCP connection and any server-set cookies are reused across pages
            response = self.session.get(url=page_url, headers=self.headers).text
            html = etree.HTML(response)
            items_list = html.xpath('//div[@class="category-products"]/ul/li')
            print(len(items_list))
            duplicate = False
            for el in items_list:
                try:
                    name = el.xpath('./div[1]/h2/a/text()')[0].replace(' ', '').replace('\r', '').replace('\n', '')
                    detail_url = el.xpath('./div[1]/h2/a/@href')[0]
                except:
                    # some tiles wrap the link in a second <div>
                    name = el.xpath('./div[2]/h2/a/text()')[0].replace(' ', '').replace('\r', '').replace('\n', '')
                    detail_url = el.xpath('./div[2]/h2/a/@href')[0]
                if 'www' not in detail_url:
                    detail_url = self.host + detail_url  # relative link -> absolute
                md5 = get_web_id.get_md5(detail_url)
                if save_data.save_spu(menu_id, detail_url, name, create_time, md5, self.conn, self.cur):
                    duplicate = True
            if not duplicate or len(items_list) < 12:
                break
            self.headers['referer'] = page_url
            print('\n')

    def run(self, key):
        """Entry point: pop 'menu_id|menu_url' members from redis set
        '<key>_spu_url' until empty; on failure, push the member back so a
        later run can retry it."""
        create_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        r = redis.Redis(connection_pool=self.poor)
        keyword = "{}_spu_url".format(key)
        while r.scard(keyword) > 0:
            message = r.spop(keyword)
            if isinstance(message, bytes):
                # redis-py returns bytes unless the pool sets decode_responses=True;
                # message.split('|') would crash on bytes
                message = message.decode()
            msg = message.split('|')
            menu_id = msg[0]
            menu_url = msg[1]
            try:
                self.get_spu_per_page(menu_id, menu_url, create_time)
            except Exception as e:
                print('rollback', e)  # surface the cause instead of swallowing it
                r.sadd(keyword, message)
if __name__ == '__main__':
    # drain the alipearlhair spu queue built by the menu step
    site_key = 'alipearlhair'
    Alipearlhair_Spu().run(site_key)
这里将得到每个商品的名称、详情页链接、所属导航
第三步:调度第二步得到的信息,获取sku
# coding:gbk
# author:lihaizhen
# date:
# description:doing
import hashlib
import re
from lxml import etree
import redis
import sys
import requests
sys.path.append("..")
import time
from utils import check_spu,connections,get_res,get_web_id,save_data
from itertools import product
import collections
from decimal import Decimal
class Alipearlhair_Sku(object):
    """For each product detail page, enumerate every option combination (sku),
    compute its price (base price + per-option surcharges) and persist it via
    save_data.save_sku."""

    def __init__(self):
        self.proxies = None  # proxy placeholder; unused in this script
        self.conn = connections.mysql_conn()
        self.cur = self.conn.cursor()
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
            'cache-control': 'max-age=0',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
        }
        self.pool = connections.local_redis(1)  # redis db 1
        self.r = redis.Redis(connection_pool=self.pool)

    def get_md5(self, src):
        """Return the hex MD5 digest of *src* (used to de-duplicate skus)."""
        digest = hashlib.md5()
        digest.update(src.encode())
        return digest.hexdigest()

    def get_sku_temp(self, url):
        """Fetch a product detail page and return
        (combinations, per-dimension price maps, product name, base price).

        - combinations: itertools.product of the option names of every dimension
        - per-dimension price maps: one OrderedDict {option name: '$surcharge' or '0'}
          per non-empty dimension, index-aligned with the combinations"""
        response = requests.get(url=url, headers=self.headers)
        html = etree.HTML(response.text)
        sku_list = html.xpath('//*[@id="product-options-wrapper"]/dl/dd')  # option dimensions
        goods_name = html.xpath('//*[@id="product_addtocart_form"]/div[3]/ul/li[1]/h1/text()')[0].replace('\n', '').replace(' ', '')
        # the price lives in one of two layouts depending on the page variant
        try:
            base_price = html.xpath('//dd[@class="pricebox"]/div/p[1]/span[2]/text()')[0].replace('$', '').replace('\n', '').replace(' ', '')
            if not base_price:
                base_price = html.xpath('//dd[@class="pricebox"]/div/span/span/text()')[0].replace('$', '').replace('\n', '').replace(' ', '')
        except:
            base_price = html.xpath('//dd[@class="pricebox"]/div/span/span/text()')[0].replace('$', '').replace('\n', '').replace(' ', '')
        li = []         # one list of option names per dimension
        temp_list = []  # one price map per dimension (same order as `li`)
        for s in sku_list:
            attribute_value_list = s.xpath("./div/select/option[@value!='']")  # dropdown options
            if not attribute_value_list:
                attribute_value_list = [s]  # dimension without a dropdown: the <dd> itself
            li1 = []
            temp1 = collections.OrderedDict()
            for attribute in attribute_value_list:
                name = attribute.xpath('./text()')[0].replace('\n', '').replace(' ', '')
                if 'select' in name.lower():
                    continue  # skip the "Select an option" placeholder entry
                if '$' not in name:
                    price = '0'
                else:
                    # surcharge embedded in the option label, e.g. "...+$10.00 "
                    price = re.search(r'(\$.*)[\s\S]', name).group(1)
                # plain `if` statements replace the original side-effecting
                # conditional expressions (`li1.append(name) if name else li1`)
                if name:
                    li1.append(name)
                    temp1[name] = price
            if temp1:
                temp_list.append(temp1)
            if li1:
                li.append(li1)
        lis = list(product(*li))  # cartesian product = every sku combination
        return lis, temp_list, goods_name, base_price

    def parse_zuhe_list(self, lis, temp_list, base_price, goods_name, url, create_time):
        """Expand every option combination into a sku row dict.
        Price = base price + the surcharge of each chosen option."""
        data_list = []
        for zuhe in lis:
            price = Decimal(base_price)
            for j, attribute_title in enumerate(zuhe):
                # temp_list[j] is the price map of the j-th dimension; the two
                # lists were appended in lock-step, so the indices align
                ajax_price = temp_list[j][attribute_title]
                price += Decimal(ajax_price.replace(' ', '').replace('$', ''))
            sku_name = ' / '.join(zuhe).replace("'", "''")
            total = {
                'price': price,
                'sku_name': sku_name,
                'url': url,
                'goods_name': goods_name.replace("'", "''"),
                'create_time': create_time,
                # md5 of (escaped sku name + raw goods name), same key as before
                'md5': self.get_md5(sku_name + goods_name),
            }
            data_list.append(total)
        return data_list

    def run(self, create_time, r_k):
        """Entry point: scrape the hard-coded detail pages one by one.
        NOTE(review): the description says these should be scheduled from the
        spu step; the URL list is still hard-coded (marked 'doing')."""
        detail_url_list = [
            'https://www.alipearlhair.com/alipearl-hair-peruvian-virgin-hair-straight-lace-frontal-wigs.html',
            'https://www.alipearlhair.com/long-wigs-human-lace-front-wigs-straight-frontal-wigs-24-40-inch.html',
            'https://www.alipearlhair.com/ly-lace-front-wigs-straight-wigs-invisible-lace-natural-hair-wigs.html',
            'https://www.alipearlhair.com/straight-hairstyles-13-6-lace-frontal-wig-high-quality-wigs.html',
            'https://www.alipearlhair.com/hd-lace-wigs-13x6-lace-front-wig-hd-transparent-lace-straight-wigs.html',
            'https://www.alipearlhair.com/alipearl-hair-new-arrival-cheap-human-hair-wigs-straight-360-lace-frontal-wigs.html',
            'https://www.alipearlhair.com/alipearl-brazilian-virgin-hair-straight-full-lace-wigs.html',
            'https://www.alipearlhair.com/brazilian-weave-straight-mink-human-hair-bundles-with-lace-frontal-closure.html',
            'https://www.alipearlhair.com/mink-hair-weave-brazilian-straight-human-hair-lace-frontal-with-bundles.html',
            'https://www.alipearlhair.com/straight-wavy-hair-mink-brazilian-black-weave-hairstyles-hair-bundle-deals.html',
            'https://www.alipearlhair.com/mink-brazilian-hair-weave-straight-weave-styles-weave-with-lace-closure.html',
            'https://www.alipearlhair.com/alipearl-hair-3-bundles-straight-brazilian.html',
            'https://www.alipearlhair.com/alipearl-hair-4-bundles-straight-malaysian.html'
        ]
        for i, url in enumerate(detail_url_list):
            print('spu:{}-{}-{}'.format(len(detail_url_list), i+1, url))
            lis, temp_list, goods_name, base_price = self.get_sku_temp(url)
            data_list = self.parse_zuhe_list(lis, temp_list, base_price, goods_name, url, create_time)
            save_data.save_sku(data_list, self.conn, self.cur, self.r, r_k)
            time.sleep(5)  # be polite to the server between products
if __name__ == '__main__':
    # note: the original also assigned an unused `host_url` here; removed
    ws = Alipearlhair_Sku()
    create_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    r_k = 'Alipearlhair_Sku'  # redis key passed through to save_data.save_sku
    ws.run(create_time, r_k)
这里将得到商品的名称、价格、属性、地址