python爬虫学习(二十)异步爬虫、线程池

# import time
# #使用单线程串行的方式执行
# def get_page(str):
#     print("正在下载:",str)
#     time.sleep(2)
#     print('下载成功:',str)
# name_list = ['xiaozi','aa','bb','cc']
# start_time = time.time()
#
# for i in range(len(name_list)):
#     get_page(name_list[i])
#
# end_time =time.time()
# print('%d second'% (end_time-start_time))


# import time
# #导入线程池模块对应的类
# from multiprocessing.dummy import Pool
# #使用线程池的方式执行
# start_time = time.time()
# def get_page(str):
#     print("正在下载:",str)
#     time.sleep(2)
#     print('下载成功:',str)
#
# name_list = ['xiaozi','aa','bb','cc']
#
# #实例化一个线程池对象
# pool = Pool(4)
# #将列表中每一个列表元素传递给get_page进行处理
# pool.map(get_page,name_list)
#
# end_time = time.time()
# print(end_time-start_time)

import requests
from lxml import etree
import re
from multiprocessing.dummy import Pool
# Browser-like User-Agent so the site serves the normal HTML pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
}

# Principle: the thread pool should handle the blocking, time-consuming
# operations (here: the network download of each video).
# Request the category page below and parse out each video's detail-page
# URL and file name.
url = 'https://www.pearvideo.com/category_5'
# timeout prevents the script from hanging forever on a dead connection
page_text = requests.get(url=url, headers=headers, timeout=10).text

tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
# Collected work items for the pool: {'name': <file name>, 'url': <mp4 URL>}
urls = []
for li in li_list:
    # href is relative (e.g. "video_1659808"); prefix the site root.
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    # Request the detail page; the direct video URL is embedded in a JS
    # variable assignment of the form:
    #   ...,srcUrl="https://video.pearvideo.com/mp4/...mp4",vdoUrl=srcUrl,...
    detail_page_text = requests.get(url=detail_url, headers=headers, timeout=10).text
    ex = 'srcUrl="(.*?)",vdoUrl'
    matches = re.findall(ex, detail_page_text)
    if not matches:
        # Page layout changed or the video is served dynamically — skip this
        # entry instead of crashing with IndexError on matches[0].
        print('srcUrl not found for', name)
        continue
    dic = {
        'name': name,
        'url': matches[0]
    }
    urls.append(dic)

def get_video_date(dic):
    """Download one video and persist it to disk.

    Args:
        dic: dict with keys 'name' (output file name) and 'url'
             (direct video URL).

    Runs inside the thread pool; the blocking network download is the
    time-consuming operation being parallelised.
    """
    url = dic['url']
    print(dic['name'], '正在下载')
    # timeout keeps one stalled download from tying up a pool worker forever
    data = requests.get(url=url, headers=headers, timeout=30).content
    # Persist the raw bytes to a file named after the video.
    with open(dic['name'], 'wb') as fp:
        fp.write(data)
        print(dic['name'], '下载成功')
# Use a thread pool to request the video data concurrently (4 workers).
pool = Pool(4)
# map() blocks until every element of urls has been processed by get_video_date.
pool.map(get_video_date,urls)

# No more tasks will be submitted; wait for all workers to finish.
pool.close()
pool.join()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值