Scraping girl photos with Python 2

A very simple little program for scraping pictures of pretty girls. Straight to the code:

# -*- coding:utf-8 -*-
# runs under Python 2

import requests
import os
from bs4 import BeautifulSoup
import time

request = requests.Session()
# Shared request headers; the Referer is what gets us past the site's
# anti-hotlinking check (see the note at the end of the post)
headers = {
    'Referer': 'http://www.mmjpg.com',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5383.400 QQBrowser/10.0.1313.400"
}

# Return the HTML obtained by requesting the given url
def getHtml(url):
    rs = request.get(url, headers=headers)
    return rs.content

# Download the image at the given url and save it under path as <index>.jpg
def savePic(url, path, index):
    rs = request.get(url, headers=headers)
    with open(os.path.join(path, str(index) + ".jpg"), 'wb') as f:
        f.write(rs.content)

url = "http://www.mmjpg.com"
# Alternative: collect the series links straight from the index page
# (left here commented out)
# index_soup = BeautifulSoup(getHtml(url), 'html.parser')
# pages_list = index_soup.find_all('li')
# for curr in pages_list:
#     page_url_list.append(curr.a['href'])

page_url_list = []
# Enumerate candidate series pages: http://www.mmjpg.com/mm/<id>
for count in range(0, 2001):
    page_url_list.append(url + '/mm/' + str(count))
# print page_url_list
# Request each series link in turn
dir_index = 1
for curr_url in page_url_list:
    # Fetch the series page, retrying once per second (up to 100 times)
    # if the request fails for network reasons
    soup = None
    time_sleep_count = 0
    while soup is None:
        try:
            soup = BeautifulSoup(getHtml(curr_url), 'html.parser')
        except Exception:
            time_sleep_count = time_sleep_count + 1
            if time_sleep_count == 100:
                print "Your network may be having problems, please check it"
                break
            time.sleep(1)
    if soup is None:
        continue
    # ids that do not correspond to a real series have no .article block
    if len(soup.select('.article')) == 0:
        continue
    title = soup.select('.article')[0].h2.get_text().encode('utf-8')
    print "准备爬取:"+title
    # Work out how many pages this series has: the .page pagination block
    # holds numeric page links (1-3 characters each) plus a "next page" link,
    # so keep only the short numeric texts and remember the largest one
    total = 1
    page = soup.select('.page')[0]
    for children in page.children:
        if len(children.contents) > 0 and len(children.contents[0].encode('utf-8')) > 0 and len(children.contents[0].encode('utf-8')) < 4:
            curr = int(children.contents[0])
            if curr > total:
                total = curr
    # print total
    # Build the list of pages that hold this series' images: page 1 is the series url itself, page i is <series_url>/<i>
    pages= []
    pages.append(curr_url)
    for i in range(2,total+1):
        pages.append(curr_url+"/"+str(i))
    # Create a numbered output directory for this series
    index = 1
    print "当前目录为:",os.getcwd()
    path = os.path.join(os.getcwd(),str(dir_index))
    print path
    if not os.path.exists(path):
        os.makedirs(path)
        print path + " directory created"

    # Visit each image page in the series and save the actual photo
    for pic_page_url in pages:
        soup = BeautifulSoup(getHtml(pic_page_url), 'html.parser')
        pic_url = soup.select('.content')[0].img['src']
        savePic(pic_url, path, index)
        index = index + 1

    dir_index = dir_index + 1

Approach:
The key is to go and find the patterns yourself; once you have found the pattern it really is simple. Pillars of the motherland, take care of your health, sigh.

What I do here is go through the pages one by one to find out how many images each series contains, then open the page that belongs to each individual image, find that image's address on it, request that address, and save the response as an image file.
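For reference, the per-series part of that workflow can be packaged into one small helper. This is only a minimal sketch that reuses getHtml from the script above and assumes the same page layout (numeric page links inside .page, the photo inside .content); get_image_urls is a name made up just for this illustration:

# Return the direct image URLs of every page in one series
def get_image_urls(series_url):
    soup = BeautifulSoup(getHtml(series_url), 'html.parser')
    # The pagination links whose text is a number tell us how many pages exist;
    # the "next page" link is not numeric, so isdigit() filters it out
    total = 1
    for link in soup.select('.page')[0].find_all('a'):
        text = link.get_text().strip()
        if text.isdigit():
            total = max(total, int(text))
    image_urls = []
    for i in range(1, total + 1):
        # Page 1 is the series url itself, page i is <series_url>/<i>
        page_url = series_url if i == 1 else series_url + '/' + str(i)
        page_soup = BeautifulSoup(getHtml(page_url), 'html.parser')
        image_urls.append(page_soup.select('.content')[0].img['src'])
    return image_urls

# Usage sketch, with a hypothetical series id:
# print get_image_urls("http://www.mmjpg.com/mm/1000")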

Remember to always add the Referer header; without it I kept downloading one and the same very pretty landscape photo instead of the real pictures, haha.
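In other words, as far as this script is concerned the Referer appears to be the only thing standing between you and the placeholder image. A minimal sketch of the difference, with pic_url standing in for any image address pulled out of a .content block:

# With the Referer header (the headers dict defined above) the site serves the real photo ...
good = requests.get(pic_url, headers=headers)
# ... with no Referer you are likely to get the landscape placeholder instead
bad = requests.get(pic_url)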

Feel free to leave a comment about anything that is unclear. Enjoy!

