# Scrape the images inside a Baidu Tieba forum:
# fetch the thread list page by page,
# then, for each thread, fetch all of its images page by page.
# The program is not very strict and may crash while running.
# Adjust the links, regular expressions, etc. yourself.
# -*- coding:utf-8 -*-
import re
import os
import time
import urllib
import urllib.request
# 爬取 唯美图片 贴吧内,前10页的图片 by qzher.com
'''
先获取贴吧的前 nPages 页中所有的贴子,
然后根据每个帖子进行爬取
爬取帖子的时候,又根据该贴有多少页,分页进行抓取
最终实现了nPages页中,所有的图片抓取
'''
nPages = 10
urlPage = "http://tieba.baidu.com/f?kw=%E5%94%AF%E7%BE%8E%E5%9B%BE%E7%89%87&ie=utf-8&pn="
#################################################################
# 爬前 nPages 页的所有标题
def getAllItems():
allTitles = []
for i in range(nPages):
strPageIndex = urlPage + "%d" % (i*50)
print("开始抓取网页:", strPageIndex)
listCurrent = getPageTitleList(strPageIndex)
allTitles += listCurrent
print("\t运行完毕,该网页大约有 %d 个贴 "% len(listCurrent))
return allTitles
def getPageContent(url):
urlOpen = urllib.request.urlopen(url)
urlContent = urlOpen.read()
urlRes = urlContent.decode('utf-8', 'ignore')
return urlRes
# 得到当前页面所有标题栏
def getPageTitleList(url):
urlContent = getPageContent(url)
reComp = re.compile('<a href="(/p/\d.*)\" title=\"')
titleList = reComp.findall(urlContent)
return titleList
#################################################################
# 获取此贴中所有图片
def getPagePicsWithSave(url):
urlContent = getPageContent(url)
lstPic = []
pageCount = []
rePage = re.compile('<span class="red">(\d)</span>')
pageCount += rePage.findall(urlContent)
if len(pageCount) > 0:
print("\t此贴共有 %s 页" % (pageCount[0]))
else:
return
# 分页获取所有图片
i = 1
while i <= int(pageCount[0]):
subPage = url + "?pn=%d" % i
print("\t\t抓取帖子第%d页:%s" % (i,subPage))
i += 1
lstPic += getsubPagePics(subPage)
savePic(lstPic)
def getsubPagePics(url):
pageContext = ""
pageContext += getPageContent(url)
reComp = re.compile('<img class="BDE_Image" src="(.*?)"')
lstPic = reComp.findall(pageContext)
print("\t\t此页共有 %d 张图片" % len(lstPic))
return lstPic
def savePic(picUrlList):
cur = os.getcwd()
saveDir = cur + '/pictures'
if not os.path.exists(saveDir):
os.mkdir(saveDir)
os.chdir(os.path.join(os.getcwd(), 'pictures'))
for i in range(len(picUrlList)):
s = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
picture_name = 'qzher_%s.jpg' % s
try:
urllib.request.urlretrieve(picUrlList[i], picture_name)
print("\t\t下载成功 " + picUrlList[i])
except:
print("\t\t下载失败 " + picUrlList[i])
os.chdir(cur)
# 分页获取所有图片
def threadMain(pageList):
urlPrefix = "http://tieba.baidu.com"
for i in range(len(pageList)):
pageIndex = urlPrefix + pageList[i]
print("\n开始抓取帖子:%s 中的图片" % pageIndex)
getPagePicsWithSave(pageIndex)
#################################################################
def main():
# 获取贴吧所有页数的帖子列表
print("获取所有的页数")
allList = getAllItems()
print("\t找到 %d 个贴\n"% len(allList))
threadMain(allList)
print("下载完毕")
time.sleep(100)
#################################################################
if __name__ == "__main__":
print("---------------------------------------------")
print("开始...")
start = time.time()
main()
end = time.time()
print('\n获取数据完毕 %s ' % (end-start))
print("---------------------------------------------")