python爬虫练手之斗图啦

互联网时代,难免会和别人在线上聊天,而现在的年轻人呐!一言不合就开始斗图!我难道就默默看着别人装逼吗?NO!拒绝! 所以呢借此机会我们找个表情网站,爬一波图片啦

2

斗图网链接: https://www.doutula.com

由于网站结构比较简单,没有异步加载,直接从html就能查找信息啦,所以就不做详细分析~

#coding:utf-8
import requests
import os
from lxml import html
from multiprocessing import Pool
class doutula():
    """Crawler for emoticon galleries on https://www.doutula.com.

    Flow: fetch a list page, collect every gallery's entry URL, then for
    each gallery create a folder named after its title and download all of
    its images into that folder.
    """

    # Trailing slash is intentional; URL templates below must not add another.
    base_url = 'https://www.doutula.com/'
    # Browser-like headers so the site serves regular HTML.
    headers={
            'accept-encoding':'gzip, deflate, sdch, br',
            'accept-language':'zh-CN,zh;q=0.8',
            'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
    }

    def get_selector(self,url):
        """GET *url* and return an lxml element tree ready for XPath queries."""
        return html.fromstring(requests.get(url,headers=self.headers).text)

    # Collect the entry link of every gallery shown on list page *num*.
    def get_page_link(self,num):
        """Return the list of gallery entry URLs found on list page *num*."""
        # base_url already ends with '/'; the original template inserted a
        # second slash ('...doutula.com//article/...').
        now_url = "{}article/list/?page={}".format(self.base_url, num)
        selector = self.get_selector(now_url)
        # xpath() already returns a plain list of href strings.
        return list(selector.xpath('//ul[@class="list-group"]/a/@href'))

    # Scrape one gallery detail page: title plus all image source URLs.
    def get_page_detail(self,url):
        """Return a tuple (title, pic_links) for the gallery at *url*.

        Raises IndexError if the page contains no title node (layout change
        or blocked request).
        """
        selector = self.get_selector(url)
        title = selector.xpath('//li[@class="list-group-item"]/h3/blockquote/a/text()')[0]
        # The original copied this result into an unused `pic_list` and then
        # returned the raw xpath list anyway; return it directly.
        pic_link = selector.xpath('//div[@class="artile_des"]/table/tbody/tr/td/a/img/@src')
        return title, pic_link

    def Make_dir(self,title):
        """Create ./<title>; return True if created, False if it already exists."""
        future_dir = "{}/{}".format(os.path.abspath('.'),title)
        if os.path.exists(future_dir):
            print(u'文件夹已存在,跳过')
            return False
        os.mkdir(future_dir)
        print(title,u'文件夹创建完成')
        return True

    def down_load(self,page_info):
        """Download every image of one gallery.

        page_info -- tuple of (title, iterable of image src values); the src
        values appear to be protocol-relative ('//host/path') since 'https:'
        is prepended below -- TODO confirm against live pages.
        Skips the whole gallery if its folder already exists.
        """
        title, pic_link = page_info[0], page_info[1]
        if not self.Make_dir(title):
            return
        for count, src in enumerate(pic_link, start=1):
            now_path = "{}/{}/{}.jpg".format(os.path.abspath('.'), title, count)
            print(now_path)
            page_link = "https:{}".format(src)
            with open(now_path,'wb') as f:
                f.write(requests.get(page_link,headers=self.headers).content)

    def run(self,num=1):
        """Crawl list page *num* and download every gallery on it.

        The original `run` was accidentally defined at module level (it takes
        `self` and `__main__` calls `dt.run(1)`, which raised AttributeError);
        it is defined as a method here, which is what the call site expects.
        """
        for entry_url in self.get_page_link(num):
            self.down_load(self.get_page_detail(entry_url))

# NOTE(review): this function takes `self` yet sits at module level -- it
# looks like it was meant to be indented inside the doutula class. As written
# it is only callable as run(instance, num), and `doutula().run(1)` fails.
def run(self,num=1):
    """Crawl list page *num* of the target site: fetch each gallery's entry
    URL, scrape its detail page, and download its images."""
    entry_urls = self.get_page_link(num)
    for entry_url in entry_urls:
        detail = self.get_page_detail(entry_url)
        self.down_load(detail)


if __name__ == '__main__':
    # Entry point: crawl the first list page.
    # NOTE(review): `run` is defined at module level rather than on the class,
    # so this call raises AttributeError at runtime -- confirm intended usage.
    crawler = doutula()
    crawler.run(1)

1

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值