An Upgraded Crawler

I've been learning web scraping for a while now. This post updates the crawler I wrote a month ago: it now downloads with multiple threads, and the more convoluted parts have been simplified so the overall logic is easier to follow.
First, import the libraries we need:

import requests
import re
from os import path,mkdir
from random import choice
from time import time,sleep
from bs4 import BeautifulSoup
from uuid import uuid4
from threading import Thread
from queue import Queue,Empty
from lxml.html import fromstring

Next, read the page URLs from user input, do some simple validation, and then hand the resulting URL list to the main function. (A summary of the accepted input forms follows the code below.)

if __name__=='__main__':
    dreferer='https://www.taobao.com/?spm=a230r.7195193.1581860521.1.566dfeb6APAvF9'
    
    thread_max=10
    q = Queue()
    
    while True:
        urls = input('请输入待爬取网址<url之间用逗号隔开>:')

        if urls =='0':
            break
        elif 'set' in urls:
            # e.g. "set20": update the thread count, then return to the prompt without calling main
            try:
                thread_max = int(urls[urls.find('t')+1:])
            except:
                print('输入错误请重试')
            continue
        elif 'all' in urls:
            urls = urls.replace(':',':').replace(',',',')
            urls = getallgoods(*urls[urls.find(':')+1:].strip().split(','))
        else:
            try:
                urls = urls.replace(',',',').split(',')
                aurls=[]
                for u in urls:
                    aurls.append(PageUrl(u,dreferer))
                urls=aurls
            except:
                print('输入有误请重试')
                continue
        
        main(urls)
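
To make the branches above concrete, these are the input forms the prompt accepts (the URLs and item ids here are placeholders, not real listings):

set15  ->  change the number of download threads to 15 and return to the prompt
all:<shop listing url>,<shop referer>  ->  collect every item in the shop via getallgoods, then crawl them all
https://item.taobao.com/item.htm?id=111,https://item.taobao.com/item.htm?id=222  ->  crawl the listed product pages one by one
0  ->  exit the program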

Inside main, the key information is first scraped from each product page and every image URL is pushed into a queue. Because of Taobao's anti-scraping checks, each request is wrapped in a small URL object carrying both the url and its referer (the ref attribute). Finally, a pool of threads works through the tasks in the queue; a stripped-down sketch of this queue-and-worker pattern is shown right after the function.

def main(urls):
    global q

    for url in urls:
        sleep(1)
        try:
            html = gethtml(url.url,ref=url.ref).text
        except:
            continue

        
        soup = BeautifulSoup(html,'html.parser')
        #商品标题
        title = soup.find('h3' ,class_="tb-main-title").attrs['data-title']

        root = './%s'%title+str(time())[-4:]

        # create the output directory tree for this item
        for d in (root, root+'/图片', root+'/图片/首图', root+'/图片/颜色',
                  root+'/图片/详情图', root+'/详情'):
            if not path.exists(d):
                mkdir(d)

        #获取详情
        try:
            detail = soup.find('ul',class_='attributes-list').text
            with open(root+'/详情/详情.txt','w',encoding='utf-8') as f:
                f.write(detail)
                f.close()
                
            print('详情解析成功!')
        except:
            print('详情解析失败!')
        #获取首图
        try:
            jpgs = soup.find('ul', id="J_UlThumb", class_="tb-thumb tb-clearfix")

            for each in jpgs('img'):
                if 'http' in each.attrs['data-src']:
                    url_jpg = each.attrs['data-src'].replace('50x50','400x400')
                else:
                    url_jpg = 'http:'+each.attrs['data-src'].replace('50x50','400x400')
                u = Picture(url_jpg,'首图',root+'/图片/首图/'+str(uuid4())+url_jpg[-4:])
                q.put(u)

            print('主图解析成功!')
        except:
            print('主图解析失败')
        #获取颜色
        try:
            color_list=[]
            colors = soup.find('ul',class_="J_TSaleProp tb-img tb-clearfix")
            
            for each in colors('a'):
                try:
                    if 'http' in each.attrs['style']:
                        color_list.append(each.attrs['style'].replace('jpg_30x30','jpg_400x400').replace('background:url(','').replace(') center no-repeat;',''))
                    else:
                        color_list.append('http:'+each.attrs['style'].replace('jpg_30x30','jpg_400x400').replace('background:url(','').replace(') center no-repeat;',''))
                except: pass
            print('颜色展示解析成功!')
        except:
            print('颜色展示解析失败!')
        
        for u in color_list:
            u = Picture(u,'颜色',root+'/图片/颜色/'+str(uuid4())+u[-4:])
            q.put(u)
            
        #获取描述图片
        try:
            find_desc = re.compile(r'descUrl[ ]*:[^:]*:[^:]*:[ ]*\'([^\']*)\',')
            desc_js = 'https:'+find_desc.search(html).group(1)
            desc_html = gethtml(desc_js,ref=url.url)
            find_dp = re.compile('src=\"(.*?\.jpg)\"')
            
            for u in find_dp.findall(desc_html.text):
                u = Picture(u,'详情图',root+'/图片/详情图/'+str(uuid4())+u[-4:])
                q.put(u)
            
            print('描述图获取成功')
        except:
            print('描述图获取失败')
        
       
        #获取视频
        try:
            find_videoid=re.compile(r'"videoId":"\d{5,20}"')
            videoid = find_videoid.search(html).group(0)[11:-1]
            find_ownerid=re.compile(r'"videoOwnerId":"\d{5,20}"')
            ownerid = find_ownerid.search(html).group(0)[16:-1]
            videolink = 'https://cloud.video.taobao.com/play/u/%s/p/1/e/6/t/1/%s.mp4'%(ownerid,videoid)
            u = Picture(videolink,'视频',root+'/图片/首图/主图视频'+videolink[-4:])
            q.put(u)
            print('视频解析成功!')
        except:
            print('视频解析失败!')
            

    print('资源队列填充完毕!')
    threads=[]
    # spin up the worker threads that will drain the download queue
    for i in range(thread_max):
        t = MyThread('{}号线程'.format(i+1),threadfunc)
        threads.append(t)
        t.start()
    print('线程数{}'.format(thread_max))
        
    for t in threads:
        t.join()
        print('{}退出!'.format(t.name))
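
The structure of main is essentially a producer/consumer setup: the main thread fills a Queue with download tasks, and a fixed number of worker threads drain it. The following is a stripped-down, runnable sketch of just that pattern, independent of the Taobao-specific parsing (the task names and thread counts are made up for illustration):

from queue import Queue, Empty
from threading import Thread

q = Queue()

for i in range(5):                      # producer: fill the queue up front, as main() does
    q.put('task-{}'.format(i))

def worker():
    # consumer: keep pulling tasks until the queue is empty, then exit
    while True:
        try:
            task = q.get_nowait()
        except Empty:
            break
        print('processing', task)

threads = [Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()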

The helper functions referenced in main are as follows.

def gethtml(url,ref=None):
    print('Download',url)

    
    user_agents=['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
         'Mozilla/5.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
              ]
    

    kv={
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'pragma': 'no-cache',
    'cookie':'lLtC1_=1; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zMG1aAN%2F0SGVHunXe1zncjzZgd4rdD%2FjCPEtps5h1DdNN9ZtGY%2FSXjB6pLKY7JP0duRYcn4QfhdfkaO73LC%2FZ%2BSm52%2BHihM%2BJ4oPPcIVRYd0R1ub7uC%2BaXeI34SoaPZqOZva2hBGHPQL%2BqYrEPq8PfThv54jYWXg%2BrMf165zLHvHzJbKtyKB8W%2BWMcSM8T2W4amIwMx7aaqh6zn90EuKolMosljXNUYIzp9k489WSmONfQRl2c4pI2hCLL5x87g1Sb2V75hIfp%2FyCFn1yQSLjFKglhwAJsc7ZDJO9mGKLf685wwmJtZqnd4kkSRMCaDmauW6eGMo3B2mKZ; _samesite_flag_=true; cookie2=12e97f3971fbc06584c7bdd035a19bc2; t=6b54384541f545975c14314b6ce2c499; _tb_token_=ee71e6f7e1eb5; enc=ICQVf4fgltTYaOxxdyGVpl2Brpo%2BqgAIdqDRdCux01c2Nrqk3%2FhIOl6aUfkKKWz%2FY%2Fiu%2FP9B%2F9l7xX9MK82npg%3D%3D; _m_h5_tk=503a8421e104d76703098a189a305df6_1591066051640; _m_h5_tk_enc=b45c31ef6c8fe607dc0e9462967140bd; tfstk=cQklBAGNZbP5XWPm1LwSxDLp3YAOZEszP5Fq3YgW79a-aSMViZ1V_lLVPruop61..; sgcookie=E6PHmdt%2FPsAEYwnepVjhZ; mt=ci=0_0; tracknick=; cna=2G1PFxSAs0ACAXufbrvks+6p; v=0; isg=BJCQS95b-sb2Y6bCmFcDX5hVYd7iWXSjZsQe0Iph0-uhxTBvMmjWM7n8nY0lFSx7; l=eBN5X7qRQ0UfCVe0BO5ZPurza77OiIRb4sPzaNbMiInca1rV1UjRnNQDFuODRdtjgtCe5etPOzL1BRLHR3Ap9xDDBYFinQFE3xvO.',
    'referer': ref or dreferer,
    'sec-fetch-dest': 'script',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-site': 'same-site',
    'upgrade-insecure-requests': '1',
    'User-Agent':choice(user_agents)
        }
    
    
    try:
        r = requests.get(url,headers=kv,timeout=4)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
    except:
        print('Download error')
        r = None

    return r
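
As a quick illustration, a call to gethtml might look like this (assuming dreferer has been set as in the __main__ block; the item id is made up):

# hypothetical usage: fetch one product page with the default referer
r = gethtml('https://item.taobao.com/item.htm?id=123456789', ref=dreferer)
if r is not None:                       # gethtml returns None whenever the download fails
    print(r.status_code, len(r.text))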

class MyThread(Thread):
    def __init__(self,name,func):
        super().__init__()
        self.name = name
        self.func = func
    def run(self):
        self.func()

        
        
class Picture():
    def __init__(self,url,attr,pos):
        self.attr = attr
        self.url = url
        self.pos = pos
        
class PageUrl():
    def __init__(self,url,ref):
        self.url = url
        self.ref = ref

        
def threadfunc():
    global q
    while True:
        # non-blocking get: if the queue is already empty the worker exits,
        # instead of blocking between empty() and get()
        try:
            url = q.get_nowait()
        except Empty:
            break
        try:
            with open(url.pos,'wb') as f:
                r = gethtml(url.url)
                f.write(r.content)

            print('一个{}下载完毕'.format(url.attr))

        except Exception as e:
            print(e)



def getallgoods(url,refer):
    global dreferer
    dreferer = refer
    aurls=[]
    page = 1
    mpage = 1
    while page<=mpage:
        # request the shop's listing page by page
        if 'pageNo' in url:
            res = gethtml(re.sub(r'pageNo=\d+','pageNo={}'.format(page),url))
        else:
            res = gethtml('{}&pageNo={}'.format(url,page))
        ref = refer+'&pageNo={}'.format(page)

        # total page count, taken from a "current/total" fragment in the response
        find_page = re.compile(r'(\d+)/(\d+)')
        mpage = int(find_page.search(res.text).group(2))

        # item ids listed on this page
        find_id = re.compile(r'sellerId=(\d+)&itemIds=([^&]+)&')
        itemIds = find_id.search(res.text).group(2).split(',')

        for i in itemIds:
            u = 'https://item.taobao.com/item.htm?id={}'.format(i)
            aurls.append(PageUrl(u,ref))

        page+=1
    return aurls
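
This is also what the all: input branch boils down to. A usage sketch, with placeholder URLs rather than real shop endpoints:

# hypothetical whole-shop call; both URLs below are placeholders
shop_list_url = 'https://example-shop.taobao.com/goods-list.htm?pageNo=1'
shop_referer = 'https://example-shop.taobao.com/'
urls = getallgoods(shop_list_url, shop_referer)   # returns a list of PageUrl objects
main(urls)                                        # same entry point as hand-typed item URLs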

