Scraping Girl Photos

One thing to watch when scraping images: the jpg link itself has to be requested one more time, and the raw response bytes written to a file with .write.
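
A minimal sketch of that two-step fetch; the URL and filename below are placeholders:

import requests

img_url = 'https://example.com/photo.jpg'  # hypothetical jpg link scraped from a page
resp = requests.get(img_url)  # request the jpg link a second time to get the bytes
with open('photo.jpg', 'wb') as fp:
    fp.write(resp.content)  # .content is raw bytes; .text would corrupt the image

The full script: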

import os
import re

import requests

'''You have to dig into the page source to find the way in~'''

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
           'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}

path = 'D:/mmphoto/'
os.makedirs(path, exist_ok=True)  # open() fails if the target directory is missing

urls = ['https://www.51xw.net/meizi/hot/page/{}'.format(i) for i in range(1, 3)]
for url in urls:
    r = requests.get(url, headers=headers)
    # entry links to the individual galleries on this list page
    big = re.findall('<span><a href="(.*?)" target="_blank">', r.text)
    for small in big:
        for j in range(1, 7):  # each gallery spans several numbered sub-pages
            link = small + '/' + str(j)
            photo_html = requests.get(link, headers=headers)
            specific_photo = re.findall('<img src="(.*?)" alt', photo_html.text)
            k = 1
            for x in specific_photo:
                data = requests.get(x, headers=headers)  # request the jpg link itself
                # note: the sliced part of the filename must not contain a '/'
                with open(path + str(k) + x[-10:], 'wb') as fp:
                    fp.write(data.content)
                k += 1
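
About the filename comment above: slicing the last 10 characters of the URL only works when that slice contains no '/'. Taking the URL's basename sidesteps the issue; a sketch using the standard library, with a hypothetical URL:

import os.path
from urllib.parse import urlparse

x = 'https://example.com/img/2019/abc123.jpg'  # hypothetical image URL
name = os.path.basename(urlparse(x).path)  # 'abc123.jpg', never contains a '/'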


The scrape is slow... roughly ten to twenty minutes.
...I won't show the results, too embarrassing...
Four nested loops in total: the outermost walks the list-page URLs; the second enters each gallery from a list page; the third flips through pages 1, 2, 3, 4... within a gallery; the fourth writes the images to disk. Why does the site bury its images so deep~

Update, February 15: new code below.

import os
import re

import requests
from lxml import etree

headers = {
    # the site checks the Referer header (hotlink protection), so send one
    'referer': 'https://www.mzitu.com/168151',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}


def get_html(url):
    try:
        content = requests.get(url, headers=headers)
        if content.status_code == 200:
            # decode the raw bytes ourselves; .text guesses the charset
            # and can garble gbk-encoded pages
            return content.content.decode()
        else:
            return None
    except Exception:  # network or decode error
        return None
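
An alternative to decoding the bytes yourself is to let requests re-detect the charset from the response body; a sketch reusing the headers above:

import requests

r = requests.get('https://www.mzitu.com', headers=headers)
r.encoding = r.apparent_encoding  # re-detect the charset (e.g. gbk) from the content
html = r.text  # now decodes with the detected encoding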


def get_entrance_url(html):
    selector = etree.HTML(html)
    url_list = selector.xpath('//ul[@id="pins"]/li/a/@href')
    return url_list


def get_next_page_first(html):
    # grabs the list page's own "next page" (下一页) link;
    # defined here but never called in __main__ below
    next_page_url = re.findall(
        '<a class="next page-numbers" href="(.*?)">下一页', html, re.S)
    return next_page_url


def get_info(html):
    selector = etree.HTML(html)
    photo_link = selector.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
    file_name = selector.xpath('//title/text()')[0]
    path = 'D:/shepi/'
    os.makedirs(path, exist_ok=True)  # make sure the save directory exists
    file_name = file_name[0:10]  # first 10 chars of the page title as a filename prefix
    real_path = path + file_name
    response = requests.get(photo_link, headers=headers)
    name = photo_link[-7:]  # last 7 chars of the image URL used as the filename
    with open(real_path + name, 'wb') as fp:
        fp.write(response.content)

    # the sixth <a> in the pager is the "next page" arrow; on the last
    # page it is missing, so the empty list ends the recursion
    next_page = selector.xpath("//div[@class='pagenavi']/a[6]/@href")
    if next_page:
        data = get_html(next_page[0])
        if data:  # get_html returns None on failure
            get_info(data)
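
Selecting the sixth <a> by position is brittle if the pager markup shifts. Matching on the link text is sturdier; this sketch assumes the next-page anchor's text contains 下一页, which I have not checked against the live page:

next_page = selector.xpath('//div[@class="pagenavi"]/a[contains(., "下一页")]/@href')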


if __name__ == '__main__':
    url = 'https://www.mzitu.com'
    html = get_html(url)
    url_list = get_entrance_url(html)
    for link in url_list:
        html = get_html(link)
        if html:  # skip galleries whose page failed to load
            get_info(html)
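
The site is slow anyway, so going easy on it costs little. A random pause between galleries is one option; this is a sketch, and the 1-3 second bounds are arbitrary:

import time
from random import randint

for link in url_list:
    time.sleep(randint(1, 3))  # wait 1-3 seconds before each gallery
    html = get_html(link)
    if html:
        get_info(html)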
        