gevent爬虫案例

# from fake_useragent import UserAgent
from ua import UA
import gevent
from gevent import monkey
monkey.patch_all()
import requests
from lxml import etree
from urllib.parse import urlparse,parse_qs
from time import time
import re
from uuid import uuid4
from os import path,mkdir
import json
from random import choice


class Req:
    """Description of one HTTP request plus hints on how to store its response.

    Attributes:
        method: HTTP verb, e.g. ``'GET'``.
        url: Target URL.
        headers: Optional dict of extra request headers.
        params: Optional query-string parameters.
        data: Optional request body.
        encoding: Text encoding to force on the response; a falsy value
            leaves the choice to the downloader.
        save_id: Integer tag the caller uses to pick the output location.
        name: Optional file name (without extension) for the saved response.
    """

    def __init__(self,method,url,headers=None,params=None,data=None,encoding='utf-8',save_id=0,name=None):
        # What to request.
        self.method, self.url = method, url
        # Optional pieces of the request itself.
        self.headers, self.params, self.data = headers, params, data
        # How the response should be decoded and where it should be stored.
        self.encoding = encoding
        self.save_id, self.name = save_id, name
        


class Taobao:
    """Scraper that saves images, video and attribute text of Taobao item pages.

    ``__new__`` caches a single instance, so every ``Taobao(...)`` call returns
    the same object; ``__init__`` still runs on each call and resets the URL
    lists from the new argument.
    """
    __instance=None

    # Seconds passed to requests as the connect/read timeout, so a stalled
    # connection cannot block a greenlet forever.
    timeout = 30

    # save_id -> sub-directory (below the item directory) that receives the file.
    _SAVE_DIRS = {1: '主图', 4: '主图', 2: '颜色分类', 3: '详情图'}

    def __new__(cls,*args,**kwargs):
        if not cls.__instance:
            cls.__instance = object.__new__(Taobao)
        return cls.__instance

    def __init__(self,urls:list):
        """Sort ``urls`` into shop links and item links.

        A URL containing ``'shop'`` goes to ``shop_url``; otherwise one
        containing ``'item'`` goes to ``item_url``; anything else is ignored.
        """
        # self.ua = UserAgent()
        self.ua = UA()
        self.reqs=[]
        self.shop_url=[]
        self.item_url=[]
        for u in urls:
            if 'shop' in u:
                self.shop_url.append(u)
            elif 'item' in u:
                self.item_url.append(u)

    @staticmethod
    def _safe_name(text):
        """Return ``text`` stripped, with path-hostile characters replaced by ``_``."""
        return re.sub(r'[\\/:*?"<>|\r\n\t]', '_', text).strip()

    def download(self,request:Req):
        """Perform ``request`` and return the response, or ``None`` on failure.

        A random User-Agent is added to the request's own headers. Failure
        means any ``requests`` error, including a 4xx/5xx status.
        """
        # dict.update() returns None, so the merged headers have to be built
        # explicitly; copying also keeps the caller's Req untouched.
        headers = dict(request.headers or {})
        # NOTE(review): presumably UA() is a sequence of User-Agent strings,
        # as random.choice requires -- confirm against the ua module.
        headers["User-Agent"] = choice(self.ua)
        try:
            r = requests.request(method=request.method,
                                 url=request.url,
                                 headers=headers,
                                 params=request.params,
                                 data=request.data,
                                 timeout=self.timeout)
            r.raise_for_status()
        except requests.RequestException:
            # Callers report the failure; only network/HTTP errors are absorbed.
            return None
        r.encoding = request.encoding or r.apparent_encoding
        return r

    def parse_item(self):
        """Download every item page in ``item_url`` and save its assets.

        For each item a directory ``./<title>_<n>`` is created, then thumbnail
        images, the video, colour images, the attribute text and the detail
        images are fetched. Each section is best-effort: a failure is printed
        and the remaining sections still run.
        """
        for num,u in enumerate(filter(None,self.item_url),1):
            r = self.download(Req('GET',u,headers={'referer':'https://www.taobao.com/?spm=a230r.7195193.1581860521.1.348a5f4eDYoRiN'},encoding=None))
            if r is None:
                print(f'id_{num} 页面获取失败')
                continue
            tree = etree.HTML(r.text)

            # Item title; without it there is no directory to save into.
            titles = tree.xpath('//h3[@class="tb-main-title"]/@data-title') if tree is not None else []
            if not titles:
                print(f'id_{num} 商品标题获取失败')
                continue
            title = self._safe_name(titles[0])
            root = f'./{title}_{int(time()%1000)}'
            if not path.exists(root):
                mkdir(root)

            # Every spawned greenlet is recorded here immediately, so jobs
            # started before a section fails are still joined at the end.
            jobs = []

            # Thumbnail images: drop the "_NNxNN.ext" size suffix to get the full-size URL.
            try:
                for p in tree.xpath('//ul[@id="J_UlThumb"]//img/@data-src'):
                    full = 'https:'+re.sub(r'_\d{2,3}x\d{2,3}\..*','',p)
                    jobs.append(gevent.spawn(self.get_req,Req("GET",full,headers={'referer':u},save_id=1),root))
                print(f'id_{num} 获取主图')
            except Exception as e:
                print(f'id_{num} 主图获取失败\n{e}')

            # Video: optional, so a missing or malformed config is skipped quietly.
            try:
                d = json.loads(re.search(r"Hub.config.set\('video', (\{[^\}]*\})\)",r.text).group(1))
                videolink = 'https://cloud.video.taobao.com/play/u/%s/p/1/e/6/t/1/%s.mp4'%(d['videoOwnerId'],d['videoId'])
                jobs.append(gevent.spawn(self.get_req,Req("GET",videolink,headers={'referer':u},save_id=4),root))
            except (AttributeError, ValueError, KeyError, TypeError):
                # AttributeError: no video config in the page (re.search gave None).
                pass

            # Colour images: the image URL is embedded in each option's style attribute.
            try:
                for p in tree.xpath('//ul[@class="J_TSaleProp tb-img tb-clearfix"]/li/a'):
                    name = p.xpath('./span/text()')[0].strip()
                    url = 'https:'+re.search(r'(//.*jpg)_\d{2}x\d{2}',p.xpath('./@style')[0]).group(1)
                    r0 = Req("GET",url,headers={'referer':u},save_id=2,name=name)
                    jobs.append(gevent.spawn(self.get_req,r0,root))
                print(f'id_{num} 获取颜色图')
            except Exception as e:
                print(f'id_{num} 颜色图片获取失败\n{e}')

            # Attribute text.
            try:
                text = ''.join(tree.xpath('//ul[@class="attributes-list"]//li/text()'))
                with open(f'{root}/描述.txt','w',encoding='utf-8') as f:
                    f.write(text)
                print(f'id_{num} 获取文本')
            except Exception as e:
                print(f'id_{num} 文本获取失败\n{e}')

            # Detail images: listed in a separate description document.
            try:
                desc_url = re.search(r'descUrl[ ]*:[^:]*:[^:]*:[ ]*\'([^\']*)\',',r.text,re.S).group(1).replace('\n','')
                r2 = self.download(Req("GET",'https:'+desc_url,headers={'referer':u},encoding=None))
                if r2 is None:
                    raise ValueError('详情页下载失败')
                for p in re.findall(r'src=\"(.*?\.jpg)\"',r2.text):
                    if p.startswith('//'):
                        # Protocol-relative link; requests needs an explicit scheme.
                        p = 'https:'+p
                    jobs.append(gevent.spawn(self.get_req,Req("GET",p,headers={'referer':u},save_id=3),root))
                print(f'id_{num} 获取详情图')
            except Exception as e:
                print(f'id_{num} 详情图获取失败 \n{e}')

            gevent.joinall(jobs)

    def get_req(self,r:Req,root):
        """Download ``r`` and write the body below ``root`` according to ``r.save_id``.

        save_id 1 (thumbnail) and 4 (video) go to ``主图``, 2 (colour image) to
        ``颜色分类`` and 3 (detail image) to ``详情图``; any other value is
        downloaded but not saved. The file extension is the last four
        characters of the URL.
        """
        print(r.url)
        res = self.download(r)
        if res is None:
            print(f'{r.url} 下载失败')
            return

        folder = self._SAVE_DIRS.get(r.save_id)
        if folder is None:
            return
        target = f'{root}/{folder}'
        if not path.exists(target):
            mkdir(target)

        if r.save_id == 4:
            stem = '视频'
        elif r.save_id == 2 and r.name:
            # Colour names come from the page; make them safe as file names.
            stem = self._safe_name(r.name) or uuid4()
        else:
            stem = uuid4()

        with open(f'{target}/{stem}{r.url[-4:]}','wb') as f:
            f.write(res.content)
            








        
            

def split_links(raw):
    """Split one line of user input into a list of links.

    Links may be separated by whitespace, ASCII commas or full-width commas
    (U+FF0C), in any mix; empty pieces are dropped.
    """
    return [part for part in re.split(r'[\s,\uff0c]+', raw) if part]


if __name__=="__main__":
    # Prompt repeatedly; each line is a batch of links to scrape.
    while True:
        try:
            raw = input('请输入链接:')
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the prompt loop without a traceback.
            break
        tb = Taobao(split_links(raw))

        if tb.item_url:
            st = time()
            tb.parse_item()
            print(f'程序耗时{time()-st}s')
            print('='*30)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值