Scraping girl pictures with Python

1. Forgive me, I'm a simple man...

2. There are already plenty of "scrape girl pictures with Python" posts online, so I'm just joining the fun...

3. If anything here is wrong, corrections are welcome... thanks in advance.

4. My code may differ a bit from some of the versions you'll find online, which is normal... And if it stops working after a while, don't blame it on badly written code.

5. The path where the script stores images is an absolute path on Ubuntu... As for the Python version, it's 2.7.12.

6. Alright, enough chatter, straight to the code:

7. Oh, and please read through the code yourself...

#!/usr/bin/env python
#coding=utf-8
import requests
import os
import re
from bs4 import BeautifulSoup

import argparse
import hashlib
import base64
import gzip
import time
import io
class jiandanSpider(object):
    headers = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}

    def md5(self,src):
        m = hashlib.md5()
        m.update(src.encode('utf-8'))
        return m.hexdigest()

    def decode_base64(self,url):
        return base64.b64decode(url).decode('utf-8')

    def get_raw_html(self,url):
        html = requests.get(url,headers = self.headers)    
        return html.content

    def get_soup(self,html):
        soup = BeautifulSoup(html, 'lxml')
        return soup


    def get_hashes(self,soup, html):
        # jandan.net stores each image URL base64-encoded inside a span.img-hash element
        hashes = []
        for each in soup.find_all(class_='img-hash'):
            url = self.decode_base64(each.text)
            hashes.append(url)
        return hashes
    def get_timestr(self):
        time_now = int(time.time())
        time_local = time.localtime(time_now)
        dt = time.strftime("%Y-%m-%d_%H:%M:%S",time_local)
        return dt

    def download_images(self,urls,dirpath):
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        for url in urls:
            url = "http:" + url  # the decoded URLs are protocol-relative, so add the scheme
            response = requests.get(url, headers=self.headers)
            img = response.content

            suffix = url.split('.')[-1]
            print(suffix)  # image file extension

            filename = str(self.get_timestr())
            time.sleep(1)  # sleep 1s between downloads to avoid getting blocked

            with open(dirpath + '/' + filename + '.' + suffix, 'wb') as f:
                f.write(img)
    def get_maxpages(self,soup):
        page = soup.find_all('span',class_='current-comment-page')
        pattern = re.compile(r'\d+')
        m = pattern.search(str(page))
        if m:
            print(m.group(0))
        else:
            print('< --- not found ---->')
            return 0
        max_page = int(m.group(0))
        return max_page

    def sublist_format(self,max_page):
        sub_list = []
        for i in range(1, max_page + 1):  # pages are numbered starting from 1
            s = 'http://jandan.net/ooxx/page-{}#comments'.format(i)
            sub_list.append(s)
        return sub_list

    def spider(self,url,dirpath):
        html = self.get_raw_html(url)
        soup = self.get_soup(html)

        max_pages = self.get_maxpages(soup)  # the start page shows the current (i.e. maximum) page number

        url_list = self.sublist_format(max_pages)  # build the URL of every page

        for page_url in url_list:  # fetch, parse and save each page in turn
            html = self.get_raw_html(page_url)
            soup = self.get_soup(html)
            urlspath = self.get_hashes(soup, html)
            self.download_images(urlspath, dirpath)

if __name__ == '__main__':
    #start crawling
    url = 'http://jandan.net/ooxx/'
    dirpath = '/home/menethis/work/Test/pythonTest/meizhi'  # this is where I save the images

    jiandan = jiandanSpider()
    jiandan.spider(url,dirpath)
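
By the way, the script imports argparse but never actually uses it. Here is a minimal sketch, my own addition rather than part of the original script, of how the hard-coded URL and save directory in the __main__ block could be made configurable; the flag names --url and --dirpath are assumptions of mine:

# Minimal sketch (not from the original post; the flag names are my own choice),
# replacing the hard-coded __main__ block above with command-line options.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='jandan.net OOXX image crawler')
    parser.add_argument('--url', default='http://jandan.net/ooxx/',
                        help='start page of the gallery')
    parser.add_argument('--dirpath', default='./meizhi',
                        help='directory where the images are saved')
    return parser.parse_args()

if __name__ == '__main__':
    args = parse_args()
    jiandan = jiandanSpider()
    jiandan.spider(args.url, args.dirpath)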

Yep, that's right, these are the girls from jandan.net, and they're pretty good-looking...

If they're not your thing (really, not your thing?),

no problem, I also have Python code here that scrapes doutu (meme) stickers...

As before, judge for yourself whether it works...

Posting it right here; corrections welcome:

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
class doutuSpider(object):
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}
    def get_url(self,url):
        data = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(data.content, 'lxml')
        totals = soup.find_all("a", {"class": "list-group-item"})
        for one in totals:
            sub_url = one.get('href')
            global path
            path = '/home/menethis/work/Test/pythonTest' + '/image/' + sub_url.split('/')[-1]  # save image path
            if not os.path.exists(path):
                os.makedirs(path)  # makedirs also creates missing parent directories
            try:
                self.get_img_url(sub_url)
            except Exception:
                pass

    def get_img_url(self,url):
        data = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(data.content, 'lxml')
        totals = soup.find_all('div', {'class': 'artile_des'})
        for one in totals:
            img = one.find('img')
            if img is None:
                continue
            sub_url = img.get('src')
            if not sub_url:
                continue
            try:
                self.get_img(sub_url)
            except Exception:
                pass
    def get_img(self,url):
        filename = url.split('/')[-1]
        global path
        img_path = path + '/' + filename
        print(url)
        img = requests.get(url, headers=self.headers)
        try:
            with open(img_path, 'wb') as f:
                f.write(img.content)
        except Exception:
            pass
    def create(self):
        for count in range(1, 31):
            url = 'https://www.doutula.com/article/list/?page={}'.format(count)
            print('Downloading page {}'.format(count))
            self.get_url(url)
if __name__ == '__main__':
    doutu = doutuSpider()
    doutu.create()
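
A usage note: create() walks pages 1 through 30 in one go and never pauses between requests. If you just want to try it out, something like this minimal sketch, my own addition rather than part of the original script, keeps things polite; the three-page range and the 1-second pause are assumptions of mine, mirroring the sleep in the jandan spider above:

# Minimal sketch (not from the original post): download only a few pages
# and pause between them, instead of calling create() for all 30 pages.
import time

if __name__ == '__main__':
    doutu = doutuSpider()
    for count in range(1, 4):  # assumption: only the first three pages
        url = 'https://www.doutula.com/article/list/?page={}'.format(count)
        print('Downloading page {}'.format(count))
        doutu.get_url(url)
        time.sleep(1)  # wait 1s between pages to avoid getting blocked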

 

 

All of the code above has its sources; here are the links:

1.煎蛋网 OOXX 妹子图爬虫(2)——多线程+多进程下载图片

2.Python爬虫之——爬取妹子图片

3.妹几图,煎蛋认证---->

4.python爬虫——斗图网(跟我斗图,就问你怕不怕)

 

If any of this infringes, the original authors are welcome to contact me and I'll take it down...
