Python Concurrent Crawlers

The scrape

We're going to scrape a certain girl-picture site, grabbing only everything on the first page of each of its four girl categories. Let's go!
I won't bother walking through the page analysis; that's the most basic part of any crawler. This site has essentially no anti-scraping measures beyond being quick to ban your IP, so take a little care and it's no big deal.

The only anti-scraping measure

Most of the time, pulling data from this site is simple. The only step where you might trip up is downloading the images: the site runs an anti-hotlink check on the Referer, but adding it to the request headers takes care of it.

headers = {
        "User-Agent": ua.firefox,
        'Referer': 'http://i.meizitu.net'  # anti-hotlink Referer
    }
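
As a quick sanity check, here is a minimal standalone sketch of fetching a single image with that header set (the image URL below is just a placeholder, not a real link from the site):

import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
    "User-Agent": ua.firefox,
    "Referer": "http://i.meizitu.net",  # without this the site rejects the hotlinked request
}
# placeholder image URL, only to show the shape of the request
img_url = "https://i3.mmzztt.com/2019/01/example.jpg"
resp = requests.get(img_url, headers=headers)
if resp.ok:
    with open("test.jpg", "wb") as f:
        f.write(resp.content)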

Straight to the code:

import concurrent
import os
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
from fake_useragent import UserAgent
import time
import random

ua = UserAgent()
page = 1
picname = 1


def mk_fenleidir_url():
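    # yield (category title, category URL) pairs taken from the site's nav menu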
    headers = {
        "User-Agent": ua.firefox,
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"
    }
    res = requests.get("https://www.mzitu.com/", headers=headers).text
    html = etree.HTML(res)
    title = html.xpath('//*[@id="menu-nav"]/li/a/text()')[1:5]
    urls = html.xpath('//*[@id="menu-nav"]/li/a/@href')[1:5]
    for i in range(len(title)):
        yield title[i], urls[i]


def intotime(url):
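    # yield (image-set title, image-set URL) pairs from one category page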
    global page
    headers = {
        "User-Agent": ua.firefox,
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"
    }
    res = requests.get(url, headers=headers).text
    html = etree.HTML(res)
    # get the image-set titles
    pictitle = html.xpath('//*[@id="pins"]/li/span/a/text()')
    # get the second-level link for each image set
    pichref = html.xpath('//*[@id="pins"]/li/span/a/@href')
    for i in range(len(pictitle)):
        yield pictitle[i], pichref[i]


def download_pic(dir, picdir, url):
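    # save the image on this page, then follow the "下一页" link through the rest of the set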
    global picname
    a = random.randint(1, 3)
    headers = {
        "User-Agent": ua.firefox,
        'Referer': 'http://i.meizitu.net'
    }

    res = requests.get(url, headers=headers).text
    html = etree.HTML(res)
    nextpichref = html.xpath('/html/body/div[2]/div[1]/div[4]/a/@href')
    nextpictext = html.xpath('/html/body/div[2]/div[1]/div[4]/a/span/text()')
    nexttext = nextpictext[len(nextpictext) - 1]
    picdownload = html.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src')
    pic = requests.get(picdownload[0], headers=headers).content
    if not os.path.exists('./meizitu/%s/%s/%s.jpg' % (dir, picdir, picname)):
        with open('./meizitu/%s/%s/%s.jpg' % (dir, picdir, picname), 'wb')as f:
            f.write(pic)
    # use the text of the "next" button to decide whether to keep crawling:
    # "下一页»" means the next page still belongs to the current image set
    if nexttext == "下一页»":
        picname += 1
        nexturl = nextpichref[len(nextpichref) - 1]
        # create a thread with the concurrent.futures module
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as exector:
            exector.submit(download_pic, dir, picdir, nexturl)
            time.sleep(a)


def download():
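    # build the folder tree and download every image set in each category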
    for i in mk_fenleidir_url():
        os.makedirs('./meizitu/' + i[0])
        for page in intotime(i[1]):
            os.makedirs('./meizitu/%s/' % (i[0]) + page[0])
            download_pic(i[0], page[0], page[1])


if __name__ == '__main__':
    download()

And here's a multi-coroutine (asyncio) version of the same crawler:

# multi-coroutine crawler
from bs4 import BeautifulSoup
import concurrent
from concurrent.futures import ThreadPoolExecutor
import requests, re, aiohttp, asyncio, time, aiofiles, os
from fake_useragent import UserAgent

"""
Parse the meizi site with bs4 and download the images, https://www.mzitu.com/japan/
"""
url = "https://www.mzitu.com/japan/"

list1 = list()
a = 1


def geturls(url):
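    # collect (image-set URL, title) tuples into list1, following the "next page" link a few times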
    global a
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random
    }
    html = requests.get(url=url, headers=headers).text
    # print(html)
    soup = BeautifulSoup(html, "lxml")
    pageurls = soup.select('ul[id="pins"]> li > span>a')

    nextpage = soup.select('a[class="next page-numbers"]')

    # print(nextpage)
    for i in nextpage:
        if a <= 3:
            a += 1
            geturls(i.get('href'))

    for url in pageurls:
        # print(url.get('href'))
        # print(url.get_text())
        list1.append((url.get('href'), url.get_text()))


def pagedetail(url):
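    # read how many pages the image set has and yield the URL of each page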
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random
    }
    html = requests.get(url=url, headers=headers).text
    # print(html)
    soup = BeautifulSoup(html, "lxml")
    nexturl = soup.select('div[class="pagenavi"]>a>span')[-2]
    print(int(nexturl.string) + 1)
    for i in range(1, int(nexturl.string) + 1):
        yield "".join((url, "/%s" % (i)))


def picurl(pageurl):
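    # yield the direct image URL found on one page of the set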
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random,
        'Referer': 'http://i.meizitu.net'
    }
    html = requests.get(url=pageurl, headers=headers).text
    # print(html)
    soup = BeautifulSoup(html, "lxml")
    imageurl = soup.select('div[class="main-image"]>p>a>img')[0]
    url = imageurl.get('src')
    yield url


async def download_pic(picdir, url):
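    # fetch one image with aiohttp and write it to disk with aiofiles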
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random,
        'Referer': 'http://i.meizitu.net'
    }
    name = re.findall(r'https://i3.mmzztt.com/\d+/\d+/(\w+)\.jpg', url)
    sem = asyncio.Semaphore(100)
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as resp:
                ym = await resp.read()
                async with aiofiles.open('./meizitu/%s/%s.jpg' % (picdir, name[0]), 'wb') as f:
                    await f.write(ym)


def download():
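    # make a folder per image set and download all of its pages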
    geturls(url)
    for dirs in list1:
        os.makedirs('./meizitu/' + dirs[1])
        for pageurl in pagedetail(dirs[0]):
            tasks = [asyncio.ensure_future(download_pic(dirs[1],urls)) for urls in picurl(pageurl)]
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(tasks))


if __name__ == '__main__':
    download()
    # with concurrent.futures.ProcessPoolExecutor(max_workers=5) as exector:
    #     exector.submit(download)
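
A note on the coroutine version: the Semaphore is created inside download_pic, so every task gets its own semaphore and it never actually limits concurrency. A minimal sketch of the intended pattern, with one semaphore shared by all tasks (the limit of 10 is arbitrary):

import asyncio

async def limited_task(sem, i):
    async with sem:                # at most 10 tasks are inside this block at once
        await asyncio.sleep(0.1)   # stand-in for the real aiohttp download
        return i

async def main():
    sem = asyncio.Semaphore(10)    # one shared semaphore for every task
    results = await asyncio.gather(*(limited_task(sem, i) for i in range(50)))
    print(len(results))

asyncio.run(main())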

What the result looks like

I'm not posting the result images. I spent ages uploading them only to be told the review rejected them as pornographic??? All I posted were a few screenshots of the folder structure and that counted as porn; I had to laugh, and I hadn't even posted screenshots of the actual pictures. Ha!

Problems

As I was writing this I realized the coupling is far too high, and I didn't feel like dealing with it. There's also a problem with the image names: they start from 1, and they should restart from 1 inside every set, but I later noticed I'd overlooked that in the loop and was too lazy to fix it.
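
If you did want to fix the numbering, one option (just a sketch of the pattern, not wired into the code above) is to drop the global counter and number the images locally, so the count restarts for every set:

# sketch: number images per set instead of with a global counter
def save_gallery(gallery, image_urls):
    for picname, img_url in enumerate(image_urls, start=1):  # restarts at 1 on each call
        print('./meizitu/%s/%s.jpg' % (gallery, picname))

save_gallery('gallery-a', ['u1', 'u2', 'u3'])
save_gallery('gallery-b', ['u4'])   # numbering starts over at 1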

Why I wrote this crawler

I wrote this crawler demo mainly to try out creating threads with concurrent.futures; nothing else was really the point. I'm also not going to get into the module's internals, because I haven't studied the source code myself! Hahaha.
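
For reference, the basic concurrent.futures pattern this demo was meant to exercise, reduced to a standalone sketch (fetch here is just a stand-in for real work):

from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # stand-in for a real request; returns something derived from the url
    return len(url)

urls = ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']

with ThreadPoolExecutor(max_workers=5) as executor:
    # submit() returns a Future; result() blocks until that task has finished
    futures = [executor.submit(fetch, u) for u in urls]
    for fut in futures:
        print(fut.result())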
