g.e-hentai gallery crawler

# encoding: utf-8
#
# pip install requests
import requests
# pip install beautifulsoup4
import bs4

import os


class Spider(object):
    '''
    Crawler / downloader class.

    Attributes:
        proxies = proxy dict passed straight to requests
        # e.g. when going through a local Shadowsocks (ss) proxy:
        # proxies = {'http': 'http://127.0.0.1:1080'}
    '''
    def __init__(self, proxies=None):
        self.proxies = proxies
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


    def __get_all_cataurl(self, catalogue_url):
        '''
        Collect the URLs of every catalogue (thumbnail) page,
        starting from the first catalogue page of the gallery.
        '''
        __all = []
        respond = requests.get(catalogue_url, proxies=self.proxies, headers=self.headers)
        soup = bs4.BeautifulSoup(respond.text, 'html.parser')
        # the pagination bar of the gallery uses the CSS class 'ptt'
        dls = soup.find_all(class_='ptt')

        # find_all() returns an empty list (never None) when there is no
        # pagination bar, so fall back to the start page itself
        if not dls:
            __all.append(catalogue_url)

        for dl in dls:
            links = dl.find_all('a')
            for l in links:
                # numbered links are the individual catalogue pages;
                # the '<' / '>' navigation arrows are skipped
                if l.string and l.string.isdigit():
                    __all.append(l['href'])
        return __all


    def __get_all_pic(self, cataurl):
        '''
        Download every picture listed on one catalogue (thumbnail) page.
        '''
        print('catalogue: ' + cataurl)

        respond = requests.get(cataurl, proxies=self.proxies, headers=self.headers)
        soup = bs4.BeautifulSoup(respond.text, 'html.parser')

        # create a download directory named after the gallery title
        dirname = os.path.join(os.path.dirname(os.path.abspath(__file__)), soup.title.text)
        dirname = dirname.replace('|', '')
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        # record the original catalogue URL in link.txt
        with open(os.path.join(dirname, 'link.txt'), 'wt') as f:
            f.write(self.__cata_url)

        # save every picture on this catalogue page
        for dls in soup.find_all(class_='gdtm'):
            # each thumbnail cell (class 'gdtm') links to a picture page
            picpage_url = dls.a['href']
            print('picpage-url: ' + picpage_url)
            # the number after the last '-' in the URL is the picture index
            num = picpage_url[picpage_url.rfind('-') + 1:]
            pic_path = os.path.join(dirname, num + '.jpg')
            if not os.path.isfile(pic_path):
                # open the picture page and download the full-size image
                r = requests.get(picpage_url, proxies=self.proxies, headers=self.headers)
                s = bs4.BeautifulSoup(r.text, 'html.parser')
                for dl in s.find_all('img'):
                    # the full-size image is the <img> carrying an inline
                    # style attribute on the picture page
                    if dl.get('style') is not None:
                        print('=> ' + num + '.jpg')
                        r = requests.get(dl.get('src'), proxies=self.proxies, headers=self.headers)
                        with open(pic_path, 'wb') as f:
                            f.write(r.content)
                        print('download success!')
            else:
                print('the file already exists')

    def get_pictures(self, catalogue_url):
        '''
        Given the catalogue URL of the first page of a gallery,
        download every picture in the gallery.
        '''
        # walk every catalogue page and download its pictures
        self.__cata_url = catalogue_url
        for cataurl in self.__get_all_cataurl(catalogue_url):
            self.__get_all_pic(cataurl)



if __name__ == '__main__':
    proxies = {'http': 'http://127.0.0.1:1080'}
    # CATAURL = "http://g.e-hentai.org/g/994160/xxxxxx/"
    CATAURL = input("Enter the catalogue URL of the gallery: ")
    s = Spider(proxies)
    s.get_pictures(CATAURL)
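
The requests calls above carry no timeout and no retry, so one stalled connection can hang a long download run. Below is a minimal hardening sketch, not part of the original post: the make_session helper name and the retry numbers are my own assumptions, while Session, HTTPAdapter, and urllib3's Retry are standard requests machinery.

# Hypothetical helper (not in the original script): a requests session
# that retries transient failures and carries the same proxy settings.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(proxies=None):
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    if proxies:
        session.proxies.update(proxies)
    return session

# Usage sketch: build the session once in Spider.__init__
#     self.session = make_session(proxies)
# and replace each bare requests.get(...) with
#     self.session.get(url, headers=self.headers, timeout=10)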


Reposted from: https://my.oschina.net/tasker/blog/812528
