肥宅快乐虫--爬虫爬取某些网站图片

最新推荐文章于 2024-11-05 23:45:00 发布

acgl

最新推荐文章于 2024-11-05 23:45:00 发布

阅读量417

点赞数 2

分类专栏： python #request #爬虫 文章标签： request 爬虫 美女图片 图片爬虫 下载图片

本文链接：https://blog.csdn.net/C_acgl/article/details/101020057

版权

python 同时被 3 个专栏收录

6 篇文章 0 订阅

订阅专栏

#request

1 篇文章 0 订阅

订阅专栏

#爬虫

1 篇文章 0 订阅

订阅专栏

欢迎访问我的个人博客
代码中如有任何问题，可以访问我的博客并通过邮件联系我

源码如下


import os
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# HTTP headers sent with every request made by this script.
# The user-agent string identifies the client as desktop Chrome 69 on Windows 10.
# NOTE(review): the cookie pairs look like values captured from one browser
# session (analytics ids and a session-verify token) - confirm they are still
# accepted by the site before relying on them.
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
    # Individual cookie pairs, joined with "; " into a single Cookie header value.
    'cookie': '; '.join((
        'UM_distinctid=1663384985d6fd-03db9719bc459f-8383268-1fa400-1663384985f5c5',
        'CNZZDATA1256110375=1639678596-1538460026-%7C1538460026',
        'pgv_pvi=6300302336',
        'pgv_si=s7801597952',
        'yunsuo_session_verify=61e64cd8bf75fa31a03c96c3195f46f5',
    )),
}

# Site root; links found in the pages are resolved against it.
BASE_URL = "http://www.cosplaymore.com"
# Folder that receives one sub-folder per topic (Windows path used by this script).
SAVE_ROOT = 'C:\\Users\\ish\\Desktop\\qshell\\22222'
# Characters Windows rejects in file/folder names, plus ASCII control characters,
# each mapped to "_" for use with str.translate.
UNSAFE_NAME_CHARS = {ord(ch): '_' for ch in '\\/:*?"<>|'}
UNSAFE_NAME_CHARS.update({code: '_' for code in range(32)})


def fetch_soup(url):
    """Download *url* with the shared ``header`` and return the parsed page.

    Raises ``requests.HTTPError`` on a 4xx/5xx response so an error page is
    never parsed as if it were real content.
    """
    response = requests.get(url=url, headers=header, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def safe_dirname(title):
    """Return *title* turned into a name that is usable as a Windows folder.

    Surrounding whitespace is removed, forbidden characters become ``_`` and
    trailing dots/spaces are dropped. An empty result falls back to
    ``'untitled'``.
    """
    cleaned = title.strip().translate(UNSAFE_NAME_CHARS).rstrip('. ')
    return cleaned or 'untitled'


topic_count = 0
img_count = 0
err_count = 0

# Listing pages follow the pattern list-30-<n>.html for n = 1..98.
for i in range(1, 99):
    url = BASE_URL + '/list-30-' + str(i) + '.html'
    try:
        # The topic links live in the div with class "cos_list_con clearfix";
        # each topic is an <a class="elli">.
        html_board = fetch_soup(url).find('div', class_='cos_list_con clearfix')
        topic_links = html_board.find_all('a', class_='elli')
    except Exception as exc:
        # `Exception` rather than a bare `except` so Ctrl+C still stops the crawl.
        err_count += 1
        print('出错，跳过'+str(err_count))
        print(repr(exc))
        continue

    for topic_link in topic_links:
        # Throttle: wait between topics to avoid hammering the site.
        sleep(5)
        topic_count += 1
        print('访问第'+str(topic_count)+'个帖子')
        # Errors are isolated per topic so one broken post does not discard
        # the remaining posts of the same listing page.
        try:
            topic_html = fetch_soup(urljoin(BASE_URL, topic_link.get('href')))
            # The topic title becomes the name of the folder for its images.
            topic_title = topic_html.find('div', class_='title').find('h1').text
            topic_dir = os.path.join(SAVE_ROOT, safe_dirname(topic_title))
            # The images of a topic are inside the div with class "con".
            topic_board = topic_html.find('div', class_='con')

            title_count = 0
            for img_link in topic_board.find_all('img'):
                title_count += 1
                img_count += 1
                src = img_link.get('src')
                if not src:
                    # <img> without a src attribute: nothing to download.
                    continue

                # Files are numbered by the image's position within the topic.
                img_path = os.path.join(topic_dir, str(title_count) + '.jpg')
                if os.path.exists(img_path):
                    # Check before downloading so re-runs do not refetch.
                    print("已有此图片")
                    continue

                img_file = requests.get(url=urljoin(BASE_URL, src), headers=header, timeout=30)
                # Do not store an HTML error page under a .jpg name.
                img_file.raise_for_status()

                # Also creates missing parent folders; no error if it exists.
                os.makedirs(topic_dir, exist_ok=True)
                with open(img_path, 'wb') as f:
                    f.write(img_file.content)
                print('图片已保存=====>'+img_path)
        except Exception as exc:
            err_count += 1
            print('出错，跳过'+str(err_count))
            print(repr(exc))

print('共爬取到'+str(topic_count)+'个帖子链接')
print('共爬取到'+str(img_count)+'张图片地址')
print('出错的帖子：'+str(err_count)+'个')