# from fake_useragent import UserAgent
from ua import UA
import gevent
from gevent import monkey
monkey.patch_all()
import requests
from lxml import etree
from urllib.parse import urlparse,parse_qs
from time import time
import re
from uuid import uuid4
from os import path,mkdir
import json
from random import choice
class Req:
    """Lightweight request descriptor consumed by Taobao.download/get_req.

    save_id routes where get_req stores the downloaded payload:
    0 = download only (not saved), 1 = main gallery image, 2 = colour-variant
    image (labelled with `name`), 3 = detail image, 4 = product video.
    """
    def __init__(self, method, url, headers=None, params=None, data=None,
                 encoding='utf-8', save_id=0, name=None):
        # Bulk-assign the constructor arguments straight onto the instance.
        self.method, self.url = method, url
        self.headers, self.params, self.data = headers, params, data
        self.encoding = encoding          # None => let requests pick apparent_encoding
        self.save_id, self.name = save_id, name
class Taobao:
    """Singleton scraper: downloads Taobao item pages and saves their title,
    gallery images, video, colour-variant images, text description and
    detail images under ./<title>_<timestamp>/ using gevent workers."""

    __instance = None

    def __new__(cls, *args, **kwargs):
        # Lazily create the single shared instance.  NOTE: __init__ still
        # re-runs on every Taobao(...) call, resetting the URL lists — the
        # interactive main loop relies on that.
        if not cls.__instance:
            cls.__instance = super().__new__(cls)
        return cls.__instance

    def __init__(self, urls: list):
        """Split *urls* into shop pages and item pages.

        :param urls: mixed Taobao URLs; anything containing 'shop' goes to
            self.shop_url, anything containing 'item' to self.item_url,
            everything else is dropped.
        """
        self.ua = UA()          # project-local pool of User-Agent strings
        self.reqs = []
        self.shop_url = []
        self.item_url = []
        for u in urls:
            if 'shop' in u:
                self.shop_url.append(u)
            elif 'item' in u:
                self.item_url.append(u)

    def download(self, request: Req):
        """Perform the HTTP request described by *request*.

        A random User-Agent is merged into a copy of request.headers
        (the original dict is not mutated).  Returns the Response on
        success, or None on any network/HTTP error.
        """
        # BUG FIX: the original passed `request.headers.update({...})` to
        # requests — dict.update returns None, so no headers (not even the
        # referer) were ever sent.  Build the merged dict explicitly.
        headers = dict(request.headers or {})
        headers["User-Agent"] = choice(self.ua)
        try:
            r = requests.request(method=request.method,
                                 url=request.url,
                                 headers=headers,
                                 params=request.params,
                                 data=request.data)
            r.raise_for_status()
            # encoding=None in the Req means "trust requests' detection".
            r.encoding = request.encoding or r.apparent_encoding
        except requests.RequestException:
            return None
        return r

    def parse_item(self):
        """Scrape every URL in self.item_url, spawning gevent workers for
        each image/video and joining them per page."""
        for num, u in enumerate(filter(None, self.item_url), 1):
            r = self.download(Req('GET', u, headers={'referer': 'https://www.taobao.com/?spm=a230r.7195193.1581860521.1.348a5f4eDYoRiN'}, encoding=None))
            if not r:
                print(f'id_{num} 页面获取失败')
                continue
            tree = etree.HTML(r.text)
            # Product title — guard the [0] lookup so one odd page layout
            # doesn't abort the whole loop with an IndexError.
            titles = tree.xpath('//h3[@class="tb-main-title"]/@data-title')
            if not titles:
                print(f'id_{num} 页面解析失败')
                continue
            title = titles[0].strip()
            root = f'./{title}_{int(time()%1000)}'
            if not path.exists(root):
                mkdir(root)
            # Main gallery images: strip the _NNxNN thumbnail suffix to get
            # the full-size URL.
            try:
                pic1 = tree.xpath('//ul[@id="J_UlThumb"]//img/@data-src')
                pic1 = map(lambda p: Req("GET", 'https:' + re.sub(r'_\d{2,3}x\d{2,3}\..*', '', p), headers={'referer': u}, save_id=1), pic1)
                pic1 = [gevent.spawn(self.get_req, r0, root) for r0 in pic1]
                print(f'id_{num} 获取主图')
            except Exception as e:
                print(f'id_{num} 主图获取失败\n{e}')
                pic1 = []
            # Product video, if the page embeds one; missing regex/JSON/key
            # simply means "no video on this page".
            try:
                d = json.loads(re.search(r"Hub.config.set\('video', (\{[^\}]*\})\)", r.text).group(1))
                videolink = 'https://cloud.video.taobao.com/play/u/%s/p/1/e/6/t/1/%s.mp4' % (d['videoOwnerId'], d['videoId'])
                gevent.spawn(self.get_req, Req("GET", videolink, headers={'referer': u}, save_id=4), root).join()
            except (AttributeError, KeyError, ValueError):
                pass
            # Colour-variant images: label each file with its option text.
            try:
                pic2 = []
                for p in tree.xpath('//ul[@class="J_TSaleProp tb-img tb-clearfix"]/li/a'):
                    name = p.xpath('./span/text()')[0].strip()
                    url = 'https:' + re.search(r'(//.*jpg)_\d{2}x\d{2}', p.xpath('./@style')[0]).group(1)
                    r0 = Req("GET", url, headers={'referer': u}, save_id=2, name=name)
                    pic2.append(gevent.spawn(self.get_req, r0, root))
                print(f'id_{num} 获取颜色图')
            except Exception as e:
                print(f'id_{num} 颜色图片获取失败\n{e}')
                pic2 = []
            # Text description (price is extracted but currently unused).
            try:
                price = ''.join(tree.xpath('//*[@id="J_StrPrice"]//text()'))
                text = ''.join(tree.xpath('//ul[@class="attributes-list"]//li/text()'))
                with open(f'{root}/描述.txt', 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f'id_{num} 获取文本')
            except Exception as e:
                print(f'id_{num} 文本获取失败\n{e}')
            # Detail images: fetch the lazy-loaded description document
            # referenced by descUrl, then collect every .jpg in it.
            try:
                l = re.search(r'descUrl[ ]*:[^:]*:[^:]*:[ ]*\'([^\']*)\',', r.text, re.S).group(1).replace('\n', '')
                r2 = self.download(Req("GET", 'https:' + l, headers={'referer': u}, encoding=None))
                pic3 = map(lambda p: Req("GET", p, headers={'referer': u}, save_id=3), re.findall(r'src=\"(.*?\.jpg)\"', r2.text))
                pic3 = [gevent.spawn(self.get_req, r0, root) for r0 in pic3]
                print(f'id_{num} 获取详情图')
            except Exception as e:
                print(f'id_{num} 详情图获取失败 \n{e}')
                pic3 = []
            gevent.joinall(pic1 + pic2 + pic3)

    def get_req(self, r: Req, root):
        """Download *r* and persist the payload under *root*; the target
        directory is chosen by r.save_id (1/4 -> 主图, 2 -> 颜色分类,
        3 -> 详情图)."""
        print(r.url)
        res = self.download(r)
        if res is None:
            # BUG FIX: the original dereferenced res.content unconditionally,
            # raising AttributeError inside the greenlet on a failed download.
            print(f'{r.url} 下载失败')
            return
        if r.save_id == 1 or r.save_id == 4:
            if not path.exists(f'{root}/主图'):
                mkdir(f'{root}/主图')
            # save_id 1 -> random name per image; save_id 4 -> the video file.
            with open(f'{root}/主图/{uuid4() if r.save_id==1 else "视频"}{r.url[-4:]}', 'wb') as f:
                f.write(res.content)
        elif r.save_id == 2:
            if not path.exists(f'{root}/颜色分类'):
                mkdir(f'{root}/颜色分类')
            with open(f'{root}/颜色分类/{r.name or uuid4()}{r.url[-4:]}', 'wb') as f:
                f.write(res.content)
        elif r.save_id == 3:
            if not path.exists(f'{root}/详情图'):
                mkdir(f'{root}/详情图')
            with open(f'{root}/详情图/{uuid4()}{r.url[-4:]}', 'wb') as f:
                f.write(res.content)
        del r, res
if __name__ == "__main__":
    # Interactive driver: read URLs separated by spaces, ASCII commas or
    # full-width commas, then scrape any item pages among them.
    while True:
        raw = input('请输入链接:').strip()
        normalized = raw.replace(' ', ',').replace('，', ',')
        tb = Taobao(normalized.split(','))
        if tb.item_url:
            started = time()
            tb.parse_item()
            print(f'程序耗时{time()-started}s')
        print('=' * 30)
# gevent 爬虫案例 (gevent crawler example)
# Source blog post last updated 2024-05-25 14:50:16.