# Import the required modules
import os
import requests
import re
import threading
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# Build request headers with a random User-Agent
headers = {
    "User-Agent": UserAgent().random
}
# Crawl the target listing pages
def crawl(n):  # n is the number of pages to crawl
    v = 2  # start from page 2 (see the note after this function)
    while v <= n:
        url = "http://pic.netbian.com/4kfengjing/index_" + str(v) + ".html"
        res = requests.get(url=url, headers=headers)
        html = res.content.decode('gbk')  # the site serves GBK-encoded pages
        bs = BeautifulSoup(html, "html.parser")
        con = bs.select(".clearfix li")
        for i in con:
            # each <li> holds one <a> linking to the image's detail page
            img_url = "http://pic.netbian.com" + i.a.get('href')
            th = threading.Thread(target=crawlImg, args=(img_url,))  # one thread per image
            th.start()
        v = v + 1
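# Note (an assumption about the site's URL scheme): the first listing page is
# served as plain index.html with no numeric suffix, and index_1.html does not
# exist, which is presumably why crawl() starts at page 2. A hypothetical
# helper that also covers page 1 could look like this:
def page_url(v):
    base = "http://pic.netbian.com/4kfengjing/"
    return base if v == 1 else base + "index_" + str(v) + ".html"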
# Parse a detail page and download its image
def crawlImg(url):
    res = requests.get(url=url, headers=headers)
    html = res.content.decode('gbk')
    bs = BeautifulSoup(html, "html.parser")
    img_name = bs.select(".view .photo-hd h1")[0].text
    img_name = re.sub(r'[\\/:*?"<>|]', '_', img_name)  # strip characters invalid in filenames
    img_url = "http://pic.netbian.com" + bs.select(".view .photo-pic #img img")[0].get('src')
    print(img_url, img_name)
    con = requests.get(img_url, headers=headers)
    # Save the image into the img folder, creating the folder if it is missing
    os.makedirs('img', exist_ok=True)
    with open('img/' + img_name + '.jpg', 'wb') as f:
        f.write(con.content)
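# Alternative sketch: crawl() starts one unbounded thread per image, which can
# open many connections at once. A bounded pool from the standard library caps
# the concurrency; crawl_pooled() below is a hypothetical drop-in for crawl()
# that reuses crawlImg() above (max_workers=8 is an arbitrary choice, not from
# the original code).
from concurrent.futures import ThreadPoolExecutor

def crawl_pooled(n):
    with ThreadPoolExecutor(max_workers=8) as pool:
        for v in range(2, n + 1):
            url = "http://pic.netbian.com/4kfengjing/index_" + str(v) + ".html"
            res = requests.get(url=url, headers=headers)
            html = res.content.decode('gbk')
            bs = BeautifulSoup(html, "html.parser")
            for i in bs.select(".clearfix li"):
                pool.submit(crawlImg, "http://pic.netbian.com" + i.a.get('href'))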
# Run the crawler
if __name__ == "__main__":
    crawl(10)  # argument is the number of pages to crawl