Python Concurrent Crawler (gevent)

This script crawls the mzitu.com image galleries in two phases: first it collects every gallery URL from the paginated list, then it downloads every image in every gallery. Despite the original "multithreaded" label, the concurrency comes from gevent greenlets rather than threads: monkey.patch_all() makes the blocking requests calls cooperative, and spawn/joinall schedule and wait for the coroutines.
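Before the full script, a minimal, self-contained sketch of that gevent pattern (the fetch helper and the example.com URLs are illustrative, not part of the crawler):

from gevent import monkey, joinall, spawn

monkey.patch_all()  # must run before importing requests
import requests

def fetch(url):
    # Runs in its own greenlet; while this request waits on the
    # network, the patched socket layer lets the others proceed.
    print(url, requests.get(url).status_code)

joinall([spawn(fetch, u) for u in ('https://example.com', 'https://example.org')])

The full script: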

from gevent import monkey, joinall, spawn

# Patch the standard library before importing requests, so that its
# blocking socket I/O yields to other greenlets instead of stalling them.
monkey.patch_all()
import requests
import re
import os

# Every gallery gets its own folder under ./meizitu2, next to this script.
BASE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'meizitu2')
girl_list = []  # filled with gallery URLs before the download phase starts


def save_imgs(name, url):
    # Strip characters that are illegal in Windows folder names.
    name = name.replace('?', '').replace(':', ' ')
    img_dir = os.path.join(BASE_DIR, name)
    os.makedirs(img_dir, exist_ok=True)  # also creates BASE_DIR on first run
    res = requests.get(url)
    match = re.search(r'img src="(.*?)"', res.text)
    if match is None:
        print('no image found on', url)
        return
    source = match.group(1)
    # The image host rejects hotlinking, so send a browser-like
    # User-Agent and a Referer from the same site.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'upgrade-insecure-requests': '1',
        'Referer': 'https://www.mzitu.com/1',
    }
    with open(os.path.join(img_dir, source.split('/')[-1]), 'wb') as f:
        f.write(requests.get(source, headers=headers, timeout=3).content)
    print('download successful:', source)


def get_girl_pics(url):
    # A gallery's last <span>N</span> holds its page count; page 1 is the
    # gallery URL itself, pages 2..N live at <gallery URL>/<n>.
    res = requests.get(url)
    last_page = re.findall(r'<span>(\d+)</span>', res.text)[-1]
    title = re.search(r'<h2 class="main-title">(.*?)</h2>', res.text).group(1)
    save_imgs(title, url)
    for i in range(2, int(last_page) + 1):
        save_imgs(title, ''.join([url, '/', str(i)]))

def get_all_girls(url):
    # Collect every gallery link on one list page. extend() mutates the
    # module-level list in place, so no global declaration is needed.
    res = requests.get(url)
    girl_list.extend(re.findall(r'<li><a href="(.*?)" target="_blank">', res.text))


def get_url_lists():
    url = 'https://www.mzitu.com/mm'
    url_pages = [url]
    res = requests.get(url)
    # The number just before the "next page" arrow is the last list page.
    girl_pages = re.findall(r'</span>(.*?)<span class="meta-nav', res.text)
    # List pages after the first live at /mm/page/<n>.
    for i in range(2, int(girl_pages[-1]) + 1):
        url_pages.append(''.join([url, '/page/', str(i)]))
    print(url_pages)
    # Crawl all list pages concurrently; joinall blocks until each finishes.
    joinall([spawn(get_all_girls, i) for i in url_pages])


if __name__ == '__main__':
    get_url_lists()  # phase 1: gather every gallery URL
    joinall([spawn(get_girl_pics, i) for i in girl_list])  # phase 2: download
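
Spawning one greenlet per gallery fires an unbounded number of requests at once, which is an easy way to get rate-limited or banned. A minimal sketch of capping concurrency with gevent's Pool, reusing the functions above (the pool size of 10 is an arbitrary choice, not something the original tunes):

from gevent.pool import Pool

if __name__ == '__main__':
    get_url_lists()
    pool = Pool(10)  # at most 10 galleries are crawled at any moment
    for link in girl_list:
        pool.spawn(get_girl_pics, link)
    pool.join()  # block until every greenlet has finished

Pool.spawn takes the same arguments as gevent.spawn but waits for a free slot before scheduling, so this is a drop-in replacement for the joinall line.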

 
