python 分页爬取

最新推荐文章于 2024-04-18 13:50:51 发布

大数据东哥(Aidon)

最新推荐文章于 2024-04-18 13:50:51 发布

阅读量2.4k

点赞数

分类专栏： python 文章标签： python 分页爬取

本文链接：https://blog.csdn.net/u010839779/article/details/77346254

版权

python 专栏收录该内容

42 篇文章 1 订阅

订阅专栏

二十、python 分页爬取(百思不得姐信息爬取)

import requests

from lxml import etree

import datetime

# Fetch the text of every joke, page by page.
def getJokeList(basurl='http://www.budejie.com/text/{0}', timeout=10):
    """Yield the text of each joke found on successive listing pages.

    Pages are requested starting at page 1 by formatting the page number
    into ``basurl``. Paging stops at the first page that contains no
    ``<a class="pagenxt">`` element.

    Args:
        basurl: URL template with a ``{0}`` placeholder for the page number.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the generator forever.

    Yields:
        Each text node matched by ``div[@class="j-r-list-c-desc"]/a``,
        in document order, page after page.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (connection failure, timeout, ...).
    """
    pageNum = 1
    while True:
        response = requests.get(basurl.format(pageNum), timeout=timeout)
        selector = etree.HTML(response.text)
        if selector is None:
            # Nothing parseable came back, so there is nothing to yield
            # and no "next page" link to follow.
            break

        for joke in selector.xpath('//*/div[@class="j-r-list-c-desc"]/a/text()'):
            yield joke

        # The "next page" anchor is absent on the last page.
        if not selector.xpath('//a[@class="pagenxt"]'):
            break
        pageNum += 1

# Fetch each joke's text together with its like / dislike / share / comment counts.
def _first_text(node, path, default=u''):
    """Return the first result of ``node.xpath(path)``, or ``default`` if there is none."""
    found = node.xpath(path)
    return found[0] if found else default


def getJokeOfAllList(basurl='http://www.budejie.com/text/{0}', timeout=10):
    """Yield ``(joke, like, down, share, comment)`` for every listed joke.

    Pages are requested starting at page 1 by formatting the page number
    into ``basurl``. Paging stops at the first page that contains no
    ``<a class="pagenxt">`` element. The current page counter is printed
    once per page.

    Args:
        basurl: URL template with a ``{0}`` placeholder for the page number.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the generator forever.

    Yields:
        A 5-tuple of strings: joke text, up-vote text, down-vote text,
        share text with the ``u"分享??"`` label removed, and comment-count
        text. A counter whose node is missing is yielded as an empty
        string; list items without any joke text are skipped.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (connection failure, timeout, ...).
    """
    joke_path = 'div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]/a/text()'
    like_path = 'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-up"]/span/text()'
    down_path = 'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-down "]/span/text()'
    share_path = 'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-ct"]/div[@class="j-r-list-tool-ct-share-c"]/span/text()'
    comment_path = 'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-r j-r-list-tool-cc"]/ul/li[@class=" f-tac j-comment j-comment-width j-comment-down-width"]/a/span[@class="comment-counts"]/text()'

    nextPage = True
    pageNum = 1
    while nextPage:
        response = requests.get(basurl.format(pageNum), timeout=timeout)
        selector = etree.HTML(response.text)
        if selector is None:
            # Nothing parseable came back, so there is nothing to yield
            # and no "next page" link to follow.
            break

        for item in selector.xpath('//*/div[@class="j-r-list"]/ul/li'):
            joke = _first_text(item, joke_path, None)
            if joke is None:
                # A list item without joke text would previously raise
                # IndexError and abort the whole crawl; skip it instead.
                continue
            like = _first_text(item, like_path)
            down = _first_text(item, down_path)
            share = _first_text(item, share_path)
            comment = _first_text(item, comment_path)
            yield joke, like, down, share.replace(u"分享??", ""), comment

        # The "next page" anchor is absent on the last page.
        if selector.xpath('//a[@class="pagenxt"]'):
            pageNum += 1
        else:
            nextPage = False
        # NOTE(review): the pasted source lost its indentation; this trace
        # is assumed to run once per page inside the loop - confirm.
        print(pageNum)

if __name__ == "__main__":
    # Local import keeps this block self-contained; io.open accepts an
    # ``encoding`` argument on both Python 2 and Python 3.
    import io

    # Dump every joke with its counters to basejie.txt, one tab-separated
    # record per line, echoing each record to stdout as it is written.
    #
    # To dump only the joke text instead, iterate getJokeList() and write
    # each joke followed by a separator line of 100 '~' characters.
    #
    # ``with`` guarantees the file is closed even if scraping raises midway.
    with io.open('basejie.txt', 'w', encoding='utf-8') as f:
        for joke, like, down, share, comment in getJokeOfAllList():
            # Build the record once as text and encode only at the output
            # boundary; mixing encoded bytes with unicode forces an implicit
            # ASCII decode on Python 2 and is a TypeError on Python 3.
            record = u'\t'.join(
                [joke, like, down, share.replace(u'??', u''), comment]
            )
            # Echo as UTF-8 bytes, as the original script did, so the
            # output does not depend on the console's default codec.
            print(record.encode('utf-8'))
            f.write(record + u'\n')

大数据东哥(Aidon)

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
打赏
0
评论
python 分页爬取

python 分页爬取
复制链接

扫一扫

专栏目录

python 分页爬取

“相关推荐”对你有帮助么？