多线程执行爬虫

1、多线程爬虫

改写猫眼电影爬虫:

import json
import re
import time

import requests
from colorama import Fore
from fake_useragent import UserAgent
from requests import HTTPError
import lxml
from lxml import etree

def download_page(url, params=None):
    """Fetch *url* and return the response body as text.

    :param url: the page URL to download.
    :param params: optional query parameters passed to ``requests.get``.
    :return: the decoded response text, or ``None`` on failure.
    """
    try:
        # Randomize the User-Agent on every request to reduce the chance
        # of being blocked by the target site.
        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        response = requests.get(url, params=params, headers=headers)
        # requests does NOT raise on 4xx/5xx status codes by itself;
        # without raise_for_status() the except branch below is dead code.
        response.raise_for_status()
    except HTTPError as e:
        print(Fore.RED + '[-]爬取网站%s失败:%s' % (url, str(e)))
        return None
    else:
        return response.text

def parse_html(html):
    """Parse one Maoyan board page and yield a dict per movie.

    :param html: raw HTML text of the board page.
    :return: generator of dicts with keys
             ``index``/``image``/``title``/``actors``/``add_time``.
    """
    # Build an lxml element tree from the raw HTML text.
    doc = etree.HTML(html)
    # Each movie sits in a <dd> under <dl class="board-wrapper">.
    movies = doc.xpath('//dl[@class="board-wrapper"]/dd')
    for movie in movies:
        # Ranking: text of the <i class="board-index-*"> child.
        index = movie.xpath('./i/text()')[0]
        # Poster URL is lazy-loaded via the data-src attribute.
        image = movie.xpath('.//img[@class="board-img"]/@data-src')[0]
        title = movie.xpath('.//img[@class="board-img"]/@alt')[0]
        # e.g. "主演:葛优,巩俐,牛犇"
        actors = movie.xpath('.//p[@class="star"]/text()')[0].strip()
        # e.g. "上映时间:1994-05-17(法国)"
        add_time = movie.xpath('.//p[@class="releasetime"]/text()')[0]
        # NOTE: str.lstrip() removes a *character set*, not a prefix, so
        # lstrip('主演:') could also eat leading characters of a name that
        # happen to be in that set. Strip the label with an explicit
        # prefix check instead.
        if actors.startswith('主演:'):
            actors = actors[len('主演:'):]
        if add_time.startswith('上映时间:'):
            add_time = add_time[len('上映时间:'):]
        yield {
            'index': index,
            'image': image,
            'title': title,
            'actors': actors,
            'add_time': add_time,
        }

def save_to_json(data,filename):
    """将爬取的数据信息写入json文件中"""
    import codecs # 可以直接指定文件的编码格式为UTF-8
    # with open(filename,'a') as f:
    #     f.write(json.dumps(data,ensure_ascii=False,indent=4))
    #     print(Fore.GREEN + '[+] 保存电影%s成功' %(data['title']))
    with codecs.open(filename,'a','utf-8') as f:
        f.write(json.dumps(data,ensure_ascii=False,indent=4) + '\n')

def get_one_page(page=1):
    """Crawl one page of the Maoyan Top-100 board and persist it.

    :param page: 1-based page number; each page holds 10 movies, so the
                 ``offset`` query parameter is ``(page - 1) * 10``.
    """
    url = 'https://maoyan.com/board/4?offset=%s' % ((page - 1) * 10)
    html = download_page(url)
    # download_page returns None on failure; bail out early instead of
    # letting parse_html crash on a None document.
    if html is None:
        return
    print(Fore.GREEN + '[+] 采集[%s]页数据' % (page))
    for item in parse_html(html):
        print(item)
        save_to_json(item, 'maoyan.json')

def no_use_thread():
    """Crawl pages 1-10 sequentially, pausing one second between pages."""
    page = 1
    while page <= 10:
        get_one_page(page)
        print(Fore.GREEN + '[+] 采集[%s]页数据' % (page))
        time.sleep(1)
        page += 1
def use_multi_thread():
    """Crawl pages 1-10 concurrently, one thread per page.

    The crawl is I/O-bound, so threads overlap the network waits even
    under the GIL.
    """
    from threading import Thread
    threads = []
    for page in range(1, 11):
        thread = Thread(target=get_one_page, args=(page,))
        thread.start()
        print(Fore.GREEN + '[+] 采集第[%s]页数据' % (page))
        threads.append(thread)
    # Join with a plain loop: a list comprehension used only for its
    # side effect builds a throwaway list of Nones.
    for thread in threads:
        thread.join()
    print(Fore.GREEN + '采集数据完成')
if __name__ == '__main__':
    # Use a thread pool instead of managing Thread objects by hand.
    from concurrent.futures import ThreadPoolExecutor
    # The context manager calls shutdown(wait=True) on exit, so the
    # process waits for every submitted page before terminating —
    # without it the pool could be abandoned mid-crawl.
    # 10 pages need at most 10 workers; 100 threads would be wasted.
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(get_one_page, range(1, 11))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值