Python博客爬虫，新浪博客图片异步爬虫

二爷记

于 2021-07-21 19:07:31 发布

阅读量724

点赞数

文章标签：百度 archlinux oa办公 math.h mooc

本文链接：https://blog.csdn.net/minge89/article/details/118981678

版权

身为一个有觉悟的渣渣，永远不会停止爬虫的瞎写（内卷）之路，很久没有coding了，so就有了下面这篇分享，一个博客爬虫，图片爬虫，我们都非常熟悉的新浪博客的图片爬虫，为了体现本渣渣的渣渣（弱智）水平，带来了一个异步版本，供大家参考学习，如果异步玩得6，请带带本渣渣！

异步代码是本渣渣抄袭的，不懂不要问本渣渣，因为本渣渣也不会。。。

目标网址：

http://blog.sina.com.cn/s/articlelist_1462278767_0_1.html

其实前面本渣渣也研究过异步爬虫，可移步查看：

搜狗微信搜索下拉框词采集多线程/异步采集源码公布

Python美女图异步爬虫案例小姐姐我全都要！

百度搜索关键词竞争度大小抓取异步爬虫demo

几个关键点

1.图片Referer反爬

请求图片时如果没有设置Referer请求头（值为文章页的url），会下载不到想要的图片内容！

    headers={
        "Referer":url,
        "User-Agent":UserAgent().random,
    }

2.图片中高清大图的地址获取

高清大图的链接是需要替换的，这里本渣渣直接用replace替换！

img=img.replace("mw690","orignal").replace("bmiddle","orignal").replace("middle","orignal")

附完整源码参考：

#http://blog.sina.com.cn/s/articlelist_1462278767_0_1.html
#新浪博客文章采集
#20210705 by 微信：huguo00289
# -*- coding: UTF-8 -*-
import requests,time
from fake_useragent import UserAgent
from lxml import etree
import os,re


def ua():
    """Return request headers containing a newly randomised User-Agent.

    A fresh ``UserAgent`` object is created on every call, so each call
    produces its own random User-Agent string.
    """
    return {"User-Agent": UserAgent().random}



def get_pagenum(num=20):
    """Crawl the article list pages and download the images of every article.

    List pages 1..num are fetched one after another. Every article link
    found on a page is handed to ``get_imgs``; when that call fails, the
    link is appended to ``fail_list.txt`` so it can be retried later and
    the crawl continues with the next article.

    Args:
        num: Number of list pages to crawl, starting from page 1.
            Defaults to 20, the value that used to be hard-coded.
            A value below 1 crawls nothing.
    """
    for i in range(1, num + 1):
        print(f">>正在爬取第{i}页数据..")
        url = f"http://blog.sina.com.cn/s/articlelist_1462278767_0_{i}.html"
        html = requests.get(url=url, headers=ua(), timeout=6).content.decode('utf-8')
        # Pause between list pages to keep the request rate low.
        time.sleep(8)
        tree = etree.HTML(html)
        hrefs = tree.xpath('//span[@class="atc_title"]/a/@href')
        print(hrefs)
        for href in hrefs:
            try:
                get_imgs(href)
                time.sleep(2)
            except Exception as e:
                # Deliberate best-effort boundary: one broken article must
                # not abort the whole crawl, so record it and move on.
                print(f"访问出错，错误代码{e}")
                with open("fail_list.txt", 'a+', encoding='utf-8') as f:
                    f.write(f'{href}\n')
                print('保存访问失败的图片数据列表链接成功！')


def get_imgs(url):
    """Download all images of one article into a folder named after its title.

    The article page is fetched and the text of its first ``<h2>`` is used
    as the folder name, after characters that are illegal in file names
    have been replaced by ``_``. Each image is saved as ``<n>.jpg`` with
    ``n`` counting from 1.

    Args:
        url: Address of the article page. It is also sent as the Referer
            header of every image request.

    Raises:
        IndexError: If the page contains no ``<h2>`` text.
    """
    page = requests.get(url=url, headers=ua(), timeout=6)
    html = page.content.decode('utf-8')
    time.sleep(6)
    tree = etree.HTML(html)

    title = tree.xpath('//h2/text()')[0]
    print(title)
    # Replace characters that are not allowed in file names.
    title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title)
    path = f'{title}/'
    os.makedirs(path, exist_ok=True)

    imgs = tree.xpath('//div[@id="sina_keyword_ad_area2"]//img/@real_src')
    print(imgs)
    headers = {
        "Referer": url,
        "User-Agent": UserAgent().random,
    }
    for index, img in enumerate(imgs, start=1):
        # Swap each size marker for "orignal" to get the full-size image.
        # "bmiddle" has to be handled before "middle", which it contains.
        for marker in ("mw690", "bmiddle", "middle"):
            img = img.replace(marker, "orignal")
        r = requests.get(url=img, headers=headers, timeout=6)
        with open(f'{path}{index}.jpg', 'wb') as f:
            f.write(r.content)
        print("下载图片成功！")
        time.sleep(1)


# Script entry point: crawl the article list pages when run directly.
if __name__=="__main__":
    get_pagenum()

附异步版本源码参考：

#20210721 by 微信：huguo00289
# -*- coding: UTF-8 -*-

#https://www.52pojie.cn/forum.php?mod=viewthread&tid=1469537&extra=page%3D1%26filter%3Dtypeid%26typeid%3D29
#[Python] 爬取小姐姐写真照的全站异步爬虫，即使设置了反爬我也要爬给你看

import asyncio
import time
import aiohttp
import aiofiles
from lxml import etree
import os
import re
from fake_useragent import UserAgent
from functools import wraps
from asyncio.proactor_events import _ProactorBasePipeTransport


def silence_event_loop_closed(func):
    """Wrap a method so that the 'Event loop is closed' error is ignored.

    The returned wrapper calls ``func`` with the same arguments and returns
    its result. A ``RuntimeError`` whose message is exactly
    ``'Event loop is closed'`` is swallowed and the wrapper returns
    ``None``; any other ``RuntimeError`` is re-raised unchanged.
    """
    @wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(self, *args, **kwargs):
        try:
            result = func(self, *args, **kwargs)
        except RuntimeError as exc:
            if str(exc) == 'Event loop is closed':
                return None
            raise
        return result

    return wrapper


# Wrap the proactor transport's __del__ so that a RuntimeError with the
# message 'Event loop is closed' raised there is swallowed.
_ProactorBasePipeTransport.__del__ = silence_event_loop_closed(_ProactorBasePipeTransport.__del__)
# The random User-Agent is chosen once, here at module load; this single
# headers dict is then passed to every request made by Slblog.
ua = UserAgent()
headers = {'User-Agent': ua.random,'Referer': 'http://blog.sina.com.cn'}


class Slblog:
    """Asynchronous crawler that downloads the images of Sina blog articles.

    ``main`` asks for a start page and a page count, fetches each article
    list page, and for every article on it downloads all images into a
    folder named after the article title. ``write_num`` counts the image
    files written.
    """

    def __init__(self):
        # Number of image files written so far; reported at the end of main().
        self.write_num = 0

    @staticmethod
    def _page_range(start_num, page_count):
        """Return the list-page numbers to crawl: page_count pages from start_num."""
        return range(start_num, start_num + page_count)

    @staticmethod
    async def _gather_logged(coros, labels):
        """Run coros concurrently and print, rather than lose, their failures.

        Unlike ``asyncio.wait`` this accepts an empty list, and a failing
        coroutine does not stop the others.
        """
        results = await asyncio.gather(*coros, return_exceptions=True)
        for label, result in zip(labels, results):
            if isinstance(result, Exception):
                print(f'处理出错，已跳过：{label}（{result!r}）')

    async def get_url(self, url):
        """Fetch url and return the response text, or None if the status is not 200."""
        async with aiohttp.ClientSession() as client:
            async with client.get(url, headers=headers) as resp:
                if resp.status == 200:
                    return await resp.text()
        return None

    async def html_parse(self, html):
        """Extract the article links from a list page and process each article.

        Args:
            html: Text of an article list page, or None/empty when the page
                could not be fetched; in that case nothing is processed.
        """
        if not html:
            print('列表页请求出错，已跳过！')
            return
        # At most five articles are processed at the same time.
        semaphore = asyncio.Semaphore(5)
        tree = etree.HTML(html)
        url_list = tree.xpath('//span[@class="atc_title"]/a/@href')
        await self._gather_logged(
            [self.img_parse(url, semaphore) for url in url_list], url_list)

    async def img_parse(self, h_url, sem):
        """Fetch one article page and download all of its images.

        Args:
            h_url: Address of the article page.
            sem: Semaphore limiting how many articles run concurrently; it
                is held until all images of this article are finished.
        """
        async with sem:
            h_html = await self.get_url(h_url)
            if not h_html:
                print(f'请求出错，已跳过：{h_url}')
                return
            tree = etree.HTML(h_html)
            titles = tree.xpath('//h2/text()')
            if not titles:
                # Without a title there is no folder name to save into.
                print(f'未找到标题，已跳过：{h_url}')
                return
            # Replace characters that are not allowed in file names.
            title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", titles[0])
            raw_urls = tree.xpath(
                '//div[@id="sina_keyword_ad_area2"]//img/@real_src')
            # Swap the size markers for "orignal" to get the full-size image;
            # "bmiddle" has to be replaced before "middle", which it contains.
            img_urls = [
                raw.replace("mw690", "orignal")
                   .replace("bmiddle", "orignal")
                   .replace("middle", "orignal")
                for raw in raw_urls
            ]
            # At most five images of this article are downloaded at once.
            semaphore = asyncio.Semaphore(5)
            # enumerate keeps one file number per image even when the same
            # address occurs twice on the page.
            await self._gather_logged(
                [self.img_con(i_url, i_num, title, semaphore)
                 for i_num, i_url in enumerate(img_urls, start=1)],
                img_urls)

    async def img_con(self, url, num, title, semaphore):
        """Download the image at url and save it as ``<title>/<num>.jpg``."""
        async with semaphore:
            async with aiohttp.ClientSession() as client:
                async with client.get(url, headers=headers) as resp:
                    if resp.status == 200:
                        img_con = await resp.read()
                        await self.write_img(img_con, num, title)
                    else:
                        print('请求出错，请尝试调低并发数重新下载！！')

    async def write_img(self, img_con, num, title):
        """Write the image bytes to ``<title>/<num>.jpg`` and count the file.

        The folder is created when it does not exist yet.
        """
        os.makedirs(title, exist_ok=True)
        async with aiofiles.open(title + '/' + f'{num}.jpg', 'wb') as f:
            print(f'正在下载{title}/{num}.jpg')
            await f.write(img_con)
        self.write_num += 1

    async def main(self):
        """Ask for the page range, crawl those list pages and print a summary."""
        q_start_num = input('输入要从第几页开始下载（按Entry默认为1）:') or '1'
        start_num = int(q_start_num)
        page_count = int(input('请输入要下载的页数：'))
        print('*' * 74)
        start_time = time.time()
        # Exactly page_count pages are crawled, beginning with start_num.
        for num in self._page_range(start_num, page_count):
            url = f'http://blog.sina.com.cn/s/articlelist_1462278767_0_{num}.html'
            html = await self.get_url(url)
            print('开始解析下载>>>')
            await self.html_parse(html)
        end_time = time.time()
        print(f'本次共下载图片{self.write_num}张，共耗时{end_time - start_time}秒。')



# Script entry point: create the crawler and run its main() coroutine.
if __name__=="__main__":
    a = Slblog()
    asyncio.run(a.main())

注：该异步版本可以参考学习，适合跟本渣渣一样的弱鸡学习使用！

全部源码打包获取

长按二维码关注公众号