直接上爬取逻辑代码,selenium的使用可以百度
先爬item_id,再爬item的具体信息以及sku信息
淘宝店铺分天猫、淘宝和极有家,爬虫处理有些许不同
20201203-新增1688
查找url规律
天猫店铺:https://xxx.tmall.com/
淘宝店铺:https://xxx123.taobao.com/
1688店铺:https://xxx.1688.com/
商品详情:https://xxx123.taobao.com/category.htm # 商品详情链接通用
1688商品详情:https://xxx.1688.com/page/offerlist.htm
天猫商品详情sku:https://detail.tmall.com/item.htm?id=123456&skuId=123456
淘宝商品详情sku:https://item.taobao.com/item.htm?id=123456
极有家商品详情sku:https://item.taobao.com/item.htm?id=123456
1688商品详情sku:https://detail.1688.com/offer/123456.html
先爬item_id
以下代码三种店铺通用>.< [终于从三个函数浓缩成一个函数了,哈哈哈]
def crawl_shop_item_ids(self, url):
    """Collect every item_id from a Tmall / Taobao / Jiyoujia shop listing.

    Opens the shop's ``category.htm`` page, dismisses the login popup if it
    appears, then pages through the listing collecting each item's
    ``data-id`` attribute until the "next page" link disappears.

    :param url: shop root url, e.g. ``https://xxx.tmall.com/``
    :return: de-duplicated list of item_id strings
    """
    # Browser setup (Google Chrome chosen here).
    chrom_driver = 'chromedriver存储地址'  # path to the chromedriver binary
    # NOTE(fix): the original snippet created a local `browser` that was never
    # used again — every later call went through self.driver. Assign the
    # driver to self.driver so the rest of the flow actually uses it.
    self.driver = webdriver.Chrome(executable_path=chrom_driver,
                                   chrome_options=chrome_options)
    # Open the shop's product-listing page.
    self.driver.get(url + 'category.htm')
    # Close the login dialog if it pops up; ignore when it does not appear.
    try:
        self.driver.implicitly_wait(5)
        self.driver.find_element_by_id('sufei-dialog-close').click()
        self.driver.implicitly_wait(20)
    except Exception:
        pass
    # Scroll like a human to trigger lazy loading.
    self.driver.execute_script("window.scrollBy(0,1000)")
    random_sleep()
    # Wait for the pagination controls to render.
    next_page = self.wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.J_SearchAsync')))
    # Number of pagination links (total pages).
    page_all = len(self.driver.find_elements_by_css_selector('.J_SearchAsync'))
    item_id = []  # collected item ids
    while True:
        try:
            self.driver.execute_script("window.scrollBy(0,1000)")
            random_sleep()
            # Parse the rendered page and pull every item's data-id.
            selector = Selector(text=self.driver.page_source)
            item_id += selector.css('.item::attr(data-id)').extract()
        except Exception:
            continue  # transient render failure: retry the same page
        try:
            # Advance to the next page.
            next_page_a_tag = self.driver.find_element_by_css_selector('.J_SearchAsync.next')
            next_page_a_tag.click()
        except Exception:
            break  # "next" link unclickable means last page: stop paging
    return list(set(item_id))
# 爬取结束后将item_id存到数据库GoodsItems中
# 1688
def crawl_1688_item_ids(self):
    """Collect every offer id from a 1688 shop's offer-list page.

    Assumes the driver is already on ``https://xxx.1688.com/page/offerlist.htm``.
    Pages through the listing collecting each offer's ``data-offerid``
    attribute until the "next page" link disappears.

    :return: de-duplicated list of item_id (offer id) strings
    """
    item_id = []  # collected offer ids
    while True:
        try:
            # Scroll like a human to trigger lazy loading.
            self.driver.execute_script("window.scrollBy(0,1000)")
            random_sleep()
            # Parse the rendered page and pull every offer's data-offerid.
            selector = Selector(text=self.driver.page_source)
            item_id += selector.css('.offer-list-row>li::attr(data-offerid)').extract()
        except Exception:
            continue  # transient render failure: retry the same page
        try:
            # Advance to the next page; extra sleeps pad the page transition.
            next_page_a_tag = self.driver.find_element_by_css_selector('.pagination>.next')
            random_sleep()
            random_sleep()
            next_page_a_tag.click()
            random_sleep()
            random_sleep()
        except Exception:
            break  # "next" link gone means last page: stop paging
    return list(set(item_id))
再根据item_id查找商品详细信息及sku信息
因为每种店铺的商品详情页面有很大不同,无法浓缩成一个函数,所以在数据库GoodsItems中应该添加一个店铺来源字段,用于判断商品的来源,从而决定执行哪个函数
天猫
# Open the Tmall item-detail page for this item_id.
self.driver.get('https://detail.tmall.com/item.htm?id={item_id}'.format(item_id=item_id))
# Close the login dialog if it pops up; ignore when it does not appear.
try:
self.driver.implicitly_wait(5)
self.driver.find_element_by_id('sufei-dialog-close').click()
self.driver.implicitly_wait(20)
except:
pass
# Scroll down and back up to mimic a human visitor.
self.driver.execute_script("window.scrollBy(0,1000)")
random_sleep()
self.driver.execute_script("window.scrollBy(0,-1000)")
random_sleep()
try:
# Wait for the price element to render.
price = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.tm-price')))
# Wait for the banner (carousel) image to render.
banner_images = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_ImgBooth')))
except:
# On timeout, crudely re-open the page and continue with whatever loads.
self.driver.get('https://detail.tmall.com/item.htm?id={item_id}'.format(item_id=item_id))
html = self.driver.page_source
selector = Selector(text=html)
try:
# "sold out" banner present means the item has been taken down.
down_text = self.driver.find_element_by_xpath('//div[@class="sold-out-left"]/strong')
if down_text:
print('此商品已下架')
return
except:
pass
data = []
# Each ul is one SKU property group (e.g. colour, size).
ul_list = self.driver.find_elements_by_css_selector('.tm-clear.J_TSaleProp')
for ul in ul_list:
data_value = ul.find_elements_by_css_selector('li')[0].get_attribute('data-value')
choice = ul.find_element_by_css_selector('li').get_attribute('class')
# Click the first option of the group unless it is already selected.
if len(choice) == 0 or 'tb-selected' not in choice:
ul.find_elements_by_css_selector('li')[0].click()
if 'tb-out-of-stock' in choice:
print('该分类库存不足')
return
data.append(data_value)
# Joined data-values identify the selected SKU combination in the page JSON.
data = ';'.join(data)
sku_prices = self.driver.find_elements_by_css_selector('.tm-price')
sku_price = sku_prices[0].text
postage_text = self.driver.find_element_by_xpath('//div[@id="J_PostageToggleCont"]').text # shipping fee
if postage_text == '':
postage = 0
else:
# Take the part after the first space (text after the carrier name).
postage = re.findall(r' (.*)', postage_text)[0]
print(postage)
try:
# A second .tm-price element, when present, is the promo price.
sku_promo_price = sku_prices[1].text
except:
sku_promo_price = ''
sku_name = ''
if data:
try:
# SKU option rendered as an image swatch: name lives in the span.
sku_name = selector.css('.tm-clear.J_TSaleProp.tb-img>li')[0].css('span::text')[0].extract()
except:
try:
# Text-only SKU option: name lives in a > span.
sku_name = selector.css('.tm-clear.J_TSaleProp>li')[0].css('a>span::text')[0].extract()
except:
pass
# Locate the skuId for the selected data-value combination in the page JSON.
json_pattren = re.compile(f'{data}.*?skuId":"(.*?)"', re.S)
else:
# No SKU property groups: grab the first skuId in the page JSON.
json_pattren = re.compile(r'skuId":"(.*?)","oversold', re.S)
sku_ids = re.findall(json_pattren, html)
if sku_ids:
sku_id = sku_ids[0]
else:
sku_id = ''
defaults = {
'item_id': item_id,
'name': sku_name,
'price': sku_price,
'promo_price': sku_promo_price,
}
filter_kwargs = {
'sku_id': sku_id,
}
print('商品sku信息爬取结束,数据入库')
print(defaults, filter_kwargs)
# Now scrape the item's basic information.
# Item title
title = selector.css('.tb-detail-hd>h1::text').extract_first().strip()
# Item description (all attribute-list text)
desc = selector.css('.attributes-list').xpath('string(.)').extract_first().strip()
# Item carousel images
images = []
banner_image_list = selector.css('#J_UlThumb>li>a>img::attr(src)').extract()
for detail_image in banner_image_list:
# Strip the thumbnail suffix to get the full-size image url.
images.append('https:' + detail_image.replace('_60x60q90.jpg', ''))
# Normalise whitespace in the description.
desc = re.sub(self.tab_replace_re_pattern, '\t', desc)
desc = re.sub('[ \f\r\t\v\xa0]', ' ', desc)
desc = re.sub(' +', ' ', desc)
defaults = {
'title': title,
'desc': desc,
'banner_images': images,
'detail_images': []
}
filter_kwargs = {
'item_id': item_id
}
print('商品信息爬取结束,数据入库')
print(defaults)
淘宝&极有家
# Open the Taobao item-detail page for this item_id (also used for Jiyoujia).
self.driver.get('https://item.taobao.com/item.htm?id={item_id}'.format(item_id=item_id))
# Close the login dialog if it pops up; ignore when it does not appear.
try:
self.driver.implicitly_wait(5)
self.driver.find_element_by_id('sufei-dialog-close').click()
self.driver.implicitly_wait(20)
except:
pass
# Scroll down and back up to mimic a human visitor.
self.driver.execute_script("window.scrollBy(0,1000)")
random_sleep()
self.driver.execute_script("window.scrollBy(0,-1000)")
random_sleep()
try:
# Wait for the price element to render.
price = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.tb-rmb-num')))
# Wait for the banner (carousel) image to render.
banner_images = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_ImgBooth')))
except:
self.driver.get('https://item.taobao.com/item.htm?id={item_id}'.format(item_id=item_id)) # selenium refresh/back/quit had no effect here (at least on my machine), so crudely re-open the page
html = self.driver.page_source
selector = Selector(text=html)
try:
# "sold out" hint present means the item has been taken down.
down_text = self.driver.find_element_by_xpath('//p[@class="tb-hint"]/strong')
if down_text:
print('此商品已下架')
return
except:
pass
data = []
# Each ul is one SKU property group (e.g. colour, size).
ul_list = self.driver.find_elements_by_css_selector('.J_TSaleProp.tb-clearfix')
for ul in ul_list:
data_value = ul.find_elements_by_css_selector('li')[0].get_attribute('data-value')
choice = ul.find_element_by_css_selector('li').get_attribute('class')
# Click the first option of the group unless it is already selected.
if len(choice) == 0 or 'tb-selected' not in choice:
ul.find_elements_by_css_selector('li')[0].click()
if 'tb-out-of-stock' in choice:
print('该分类库存不足')
return
data.append(data_value)
# Joined data-values identify the selected SKU combination in the page JSON.
data = ';'.join(data)
sku_price = self.driver.find_element_by_xpath('//strong[@id="J_StrPrice"]//em[last()]').text
postage_text = self.driver.find_element_by_xpath('//span[@id="J_WlServiceTitle"]').text # shipping fee
postages = postage_text.replace('快递 ', '')
if postages == '免运费':
postage = 0
else:
postage = postages.replace('¥', '')
print(postage)
try:
sku_promo_price = self.driver.find_element_by_xpath('//em[@id="J_PromoPriceNum"]').text
except:
sku_promo_price = ''
# Now scrape the item's basic information.
sku_name = ''
if data:
try:
# SKU option rendered as an image swatch: name lives in the span.
sku_name = selector.css('.J_TSaleProp.tb-img.tb-clearfix>li')[0].css('span::text')[0].extract()
except:
try:
# Text-only SKU option: name lives in a > span.
sku_name = selector.css('.J_TSaleProp.tb-clearfix>li')[0].css('a>span::text')[0].extract()
except:
pass
# Locate the skuId for the selected data-value combination in the page JSON.
json_pattren = re.compile(f'{data}.*?skuId":"(.*?)","oversold', re.S)
else:
# No SKU property groups: grab the first skuId in the page JSON.
json_pattren = re.compile(r'skuId":"(.*?)","oversold', re.S)
sku_ids = re.findall(json_pattren, html)
if sku_ids:
sku_id = sku_ids[0]
else:
sku_id = ''
defaults = {
'item_id': item_id,
'name': sku_name,
'price': sku_price,
'promo_price': sku_promo_price,
}
filter_kwargs = {
'sku_id': sku_id,
}
print('商品sku信息爬取结束,数据入库')
print(defaults, filter_kwargs)
# Item title
title = selector.css('.tb-title>h3::attr(data-title)').extract_first().strip()
# Item description (all attribute-list text)
desc = selector.css('.attributes-list').xpath('string(.)').extract_first().strip()
# Item carousel images
images = []
banner_image_list = selector.css('#J_UlThumb img::attr(data-src)').extract()
for detail_image in banner_image_list:
# Strip the thumbnail suffix; prepend the scheme when it is missing.
if re.match(r'https://', detail_image):
images.append(detail_image.replace('_50x50.jpg', ''))
else:
images.append('https:' + detail_image.replace('_50x50.jpg', ''))
# Normalise whitespace in the description.
desc = re.sub(self.tab_replace_re_pattern, '\t', desc)
desc = re.sub('[ \f\r\t\v\xa0]', ' ', desc)
desc = re.sub(' +', ' ', desc)
defaults = {
'title': title,
'desc': desc,
'banner_images': images,
'detail_images': [],
}
filter_kwargs = {
'item_id': item_id
}
print('商品信息爬取结束,数据入库')
print(defaults)
1688
# Product name; its absence means the offer has been taken down.
try:
goods_name = self.driver.find_element_by_xpath('//h1[@class="d-title"]').text
except:
AliItem.objects.filter(item_id=item_id).update(status=AliItem.STATUS_DOWN, note='此商品已下架')
return
# Spot (in-stock) price: the ladder selectors cover 1/2/3 price tiers.
try:
try:
cash_price_list = self.driver.find_element_by_css_selector('.ladder-1-1')
except:
try:
cash_price_list = self.driver.find_element_by_css_selector('.ladder-2-1') # two price tiers: take the first
except:
cash_price_list = self.driver.find_element_by_css_selector('.ladder-3-1') # three price tiers: take the first
except:
try:
cash_price_list = self.driver.find_element_by_css_selector('.obj-price') # group-buy price
except:
cash_price_list = self.driver.find_element_by_css_selector('.price-val')
cash_price_text = cash_price_list.text.replace('¥', '').replace(' ', '')
# "min-max" price range, or a single price.
if '-' in cash_price_text:
cash_price_min = cash_price_text.split('-')[0]
cash_price_max = cash_price_text.split('-')[1]
else:
cash_price_min = cash_price_max = cash_price_text
# Minimum-order-quantity info (several possible layouts).
try:
try:
batch_text = self.driver.find_element_by_xpath('//tr[@class="amount"]').text
except:
batch_text = self.driver.find_element_by_xpath('//tr[@class="price-begin-wrap"]').text
except:
batch_text = self.driver.find_element_by_xpath('//div[@class="mod-detail-info-minimum"]').text # group-buy layout
# Shipping fee; default to 0 when the element is absent.
try:
postage = self.driver.find_element_by_xpath('//div[@class="obj-carriage"]').text.replace('¥', '').split('\n')[1]
except:
postage = 0
# Product carousel images
goods_img_list = []
goods_imgs = self.driver.find_elements_by_xpath('//ul[@class="nav nav-tabs fd-clr"]/li/div/a/img')
for goods_img in goods_imgs:
# Strip the thumbnail suffix to get the full-size image url.
goods_img_list.append(goods_img.get_attribute('src').replace('.60x60', ''))
# Product detail attributes
goods_desc = self.driver.find_element_by_xpath('//div[@id="mod-detail-attributes"]').text
# Switch to the distribution / drop-shipping tab; bail out when absent.
try:
tag_click = self.driver.find_element_by_xpath('//li[@class="trade-type-has-tip trade-tab-long-text trade-tab-name-consign"]')
tag_click.click()
except:
return
sale_price_min = sale_price_max = 0
# Distribution price
try:
sale_price_list = self.driver.find_element_by_css_selector('.price-val')
sale_price_text = sale_price_list.text.replace('¥', '').replace(' ', '')
if '-' in sale_price_text:
sale_price_min = sale_price_text.split('--')[0]
sale_price_max = sale_price_text.split('--')[1]
else:
sale_price_min = sale_price_max = sale_price_text
except:
pass
# Drop-shipping price
try:
sale_price_list = self.driver.find_element_by_css_selector('.price-content')
sale_price_text = sale_price_list.text.split(' ')[0].replace('¥ ', '')
if ' ' in sale_price_text:
sale_price_min = sale_price_text.split(' ')[0]
sale_price_max = sale_price_text.split(' ')[1]
else:
sale_price_min = sale_price_max = sale_price_text
except:
pass
defaults = {
'title': goods_name,
'desc': goods_desc,
'banner_images': goods_img_list,
'detail_images': [],
'cash_price_min': cash_price_min,
'cash_price_max': cash_price_max,
'batch_text': batch_text,
'sale_price_min': sale_price_min,
'sale_price_max': sale_price_max,
'postage': postage,
}
filter_kwargs = {
'item_id': item_id
}
print('商品信息爬取结束,数据入库')
print(defaults, filter_kwargs)
# Product SKU info (best-effort: skip silently when the table is absent).
try:
sku_text = self.driver.find_element_by_xpath('//table[@class="table-sku"]')
try:
# SKU row with an image swatch
sku_name = self.driver.find_element_by_xpath('//div[@class="box-img"]/img').get_attribute('alt')
sku_price = sku_text.text.split('\n')[0].split(' ')[0].replace('元', '')
sku_desc = sku_text.text.split('\n')[0].split(' ')[1]
except:
# SKU row without an image
sku_name = sku_text.text.split('\n')[0].split(' ')[0]
sku_price = sku_text.text.split('\n')[0].split(' ')[1].replace('元', '')
sku_desc = sku_text.text.split('\n')[0].split(' ')[2]
html = self.driver.page_source
# Find this SKU's skuId in the page JSON, keyed by the SKU name.
json_pattren = re.compile(f'{sku_name}.*?skuId":(.*?),', re.S)
sku_id = re.findall(json_pattren, html)[0].replace('}', '')
defaults = {
'item_id': item_id,
'name': sku_name,
'price': sku_price,
'sku_desc': sku_desc,
}
filter_kwargs = {
'sku_id': sku_id,
}
print('商品sku信息爬取结束,数据入库')
print(defaults, filter_kwargs)
except:
pass
因为淘宝查看所有商品是需要登录的,所以,在爬取前需要跳过登录检测,网上方法挺多,我用的最简单的,扫码,其他的可以自己百度一下!
然后为了保存登录信息,使用终端起一个固定的浏览器窗口,命令/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 --user-data-dir=/Users/xy/Downloads/tmp/chrome_data
记得设置'remote_debug': True
,具体使用可以百度哈,我也是第一次用=.=
第一次爬淘宝,最近都不想打开淘宝页面了,,,文章是用来记录的,如果侵权了可以告诉我删除哈,溜了溜了~