Python 分页爬取

 二十、Python 分页爬取(百思不得姐信息爬取)

import requests

from lxml import etree

import datetime

 

# Fetch the text of every joke.

def getJokeList(basurl='http://www.budejie.com/text/{0}'):
    """Yield the text of each joke, walking the listing page by page.

    ``basurl`` is a format template; its ``{0}`` slot receives the page
    number, which starts at 1. Every page is downloaded with ``requests``
    and parsed with ``etree.HTML``. Paging continues for as long as the
    page just parsed contains an ``<a class="pagenxt">`` element.
    """
    page = 1
    while True:
        html = requests.get(basurl.format(page)).text
        tree = etree.HTML(html)

        # Emit every joke body found on this page before looking further.
        for text in tree.xpath('//*/div[@class="j-r-list-c-desc"]/a/text()'):
            yield text

        # No "next page" anchor means this was the last page.
        if not tree.xpath('//a[@class="pagenxt"]'):
            return
        page += 1

       # print pageNum

 

# Fetch each joke's text together with its like, dislike, share and comment counts.

def getJokeOfAllList(basurl='http://www.budejie.com/text/{0}'):
    """Yield one record per joke, walking the listing page by page.

    Args:
        basurl: URL template; ``{0}`` is replaced by the page number,
            starting at 1.

    Yields:
        ``(joke, like, down, share, comment)`` tuples of text extracted
        from each list item. ``share`` has the fixed label literal removed
        with ``replace``. A counter whose node is absent is yielded as an
        empty string; list items without any joke text are skipped.

    Paging continues while the parsed page contains an
    ``<a class="pagenxt">`` element. The page counter is printed after
    each page has been processed.
    """
    def first_text(node, path):
        # xpath() returns a list; indexing [0] on an empty result would
        # raise IndexError and abort the whole crawl, so fall back to u''.
        found = node.xpath(path)
        return found[0] if found else u''

    pageNum = 1
    while True:
        response = requests.get(basurl.format(pageNum))
        selector = etree.HTML(response.text)

        for item in selector.xpath('//*/div[@class="j-r-list"]/ul/li'):
            joke = first_text(item, 'div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]/a/text()')
            if not joke:
                # NOTE(review): presumably a non-joke entry (e.g. an ad slot);
                # nothing useful to emit for it.
                continue
            like = first_text(item, 'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-up"]/span/text()')
            down = first_text(item, 'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-down "]/span/text()')
            share = first_text(item, 'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-ct"]/div[@class="j-r-list-tool-ct-share-c"]/span/text()')
            comment = first_text(item, 'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-r j-r-list-tool-cc"]/ul/li[@class=" f-tac j-comment j-comment-width  j-comment-down-width"]/a/span[@class="comment-counts"]/text()')
            yield joke, like, down, share.replace(u"分享??", ""), comment

        hasNext = selector.xpath('//a[@class="pagenxt"]')
        if hasNext:
            pageNum += 1
        # Function-call form parses on both Python 2 and Python 3.
        print(pageNum)
        if not hasNext:
            break

 

if __name__ == "__main__":
    # Crawl every joke with its counters, echo each row to stdout and save
    # the rows as tab-separated UTF-8 text in basejie.txt.
    # To dump only the joke text, iterate getJokeList() instead.
    import io  # local import: io.open accepts an explicit encoding on Python 2 and 3

    # `with` closes the file even if a request or parse step raises mid-crawl.
    with io.open('basejie.txt', 'w', encoding='utf-8') as f:
        for joke, like, down, share, comment in getJokeOfAllList():
            # Assemble the row as text and encode once at the I/O boundary;
            # the previous code concatenated encoded bytes with text.
            line = u'\t'.join((joke, like, down, share.replace(u'??', u''), comment))

            # On Python 2 (where str is bytes) print UTF-8 bytes as before, so
            # output still works when stdout has no encoding (e.g. piped).
            print(line if str is not bytes else line.encode('utf-8'))

            f.write(line + u'\n')

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

大数据东哥(Aidon)

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值