g.e-hentai gallery crawler

# encoding: utf-8
#
# pip install requests
import requests
# pip install beautifulsoup4
import bs4

import os


class Spider(object):
    '''
    Crawler / downloader class.

    Attributes:
        proxies = proxy dict passed straight to requests
        # e.g. when going through a local Shadowsocks (ss) proxy:
        # proxies = {'http': 'http://127.0.0.1:1080'}
    '''
    def __init__(self, proxies=None):
        self.proxies = proxies
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


    def __get_all_cataurl(self, catalogue_url):
        '''
        Collect the URLs of every catalogue (thumbnail) page,
        starting from the first catalogue page of the gallery.
        '''
        __all = []
        respond = requests.get(catalogue_url, proxies=self.proxies, headers=self.headers)
        soup = bs4.BeautifulSoup(respond.text, 'html.parser')
        # the pagination bar of the gallery uses the CSS class 'ptt'
        dls = soup.find_all(class_='ptt')

        # find_all() returns an empty list (never None) when there is no
        # pagination bar, so fall back to the start page itself
        if not dls:
            __all.append(catalogue_url)

        for dl in dls:
            links = dl.find_all('a')
            for l in links:
                # numbered links are the individual catalogue pages;
                # the '<' / '>' navigation arrows are skipped
                if l.string and l.string.isdigit():
                    __all.append(l['href'])
        return __all


    def __get_all_pic(self, cataurl):
        '''
        Download every picture listed on one catalogue (thumbnail) page.
        '''
        print('catalogue: ' + cataurl)

        respond = requests.get(cataurl, proxies=self.proxies, headers=self.headers)
        soup = bs4.BeautifulSoup(respond.text, 'html.parser')

        # create a download directory named after the gallery title
        dirname = os.path.join(os.path.dirname(os.path.abspath(__file__)), soup.title.text)
        dirname = dirname.replace('|', '')
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        # record the original catalogue URL in link.txt
        with open(os.path.join(dirname, 'link.txt'), 'wt') as f:
            f.write(self.__cata_url)

        # save every picture on this catalogue page
        for dls in soup.find_all(class_='gdtm'):
            # each thumbnail cell (class 'gdtm') links to a picture page
            picpage_url = dls.a['href']
            print('picpage-url: ' + picpage_url)
            # the number after the last '-' in the URL is the picture index
            num = picpage_url[picpage_url.rfind('-') + 1:]
            pic_path = os.path.join(dirname, num + '.jpg')
            if not os.path.isfile(pic_path):
                # open the picture page and download the full-size image
                r = requests.get(picpage_url, proxies=self.proxies, headers=self.headers)
                s = bs4.BeautifulSoup(r.text, 'html.parser')
                for dl in s.find_all('img'):
                    # the full-size image is the <img> carrying an inline
                    # style attribute on the picture page
                    if dl.get('style') is not None:
                        print('=> ' + num + '.jpg')
                        r = requests.get(dl.get('src'), proxies=self.proxies, headers=self.headers)
                        with open(pic_path, 'wb') as f:
                            f.write(r.content)
                        print('download success!')
            else:
                print('the file already exists')

    def get_pictures(self, catalogue_url):
        '''
        Given the catalogue URL of the first page of a gallery,
        download every picture in the gallery.
        '''
        # walk every catalogue page and download its pictures
        self.__cata_url = catalogue_url
        for cataurl in self.__get_all_cataurl(catalogue_url):
            self.__get_all_pic(cataurl)



if __name__ == '__main__':
    proxies = {'http': 'http://127.0.0.1:1080'}
    # CATAURL = "http://g.e-hentai.org/g/994160/xxxxxx/"
    CATAURL = input("Enter the catalogue URL of the gallery: ")
    s = Spider(proxies)
    s.get_pictures(CATAURL)
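
The requests calls above carry no timeout and no retry, so one stalled connection can hang a long download run. Below is a minimal hardening sketch, not part of the original post: the make_session helper name and the retry numbers are my own assumptions, while Session, HTTPAdapter, and urllib3's Retry are standard requests machinery.

# Hypothetical helper (not in the original script): a requests session
# that retries transient failures and carries the same proxy settings.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(proxies=None):
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    if proxies:
        session.proxies.update(proxies)
    return session

# Usage sketch: build the session once in Spider.__init__
#     self.session = make_session(proxies)
# and replace each bare requests.get(...) with
#     self.session.get(url, headers=self.headers, timeout=10)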


Reposted from: https://my.oschina.net/tasker/blog/812528
