版本:python 2.7
功能:抓取某贴吧里所有帖子中的jpg图片(仅抓取“只看楼主”模式下楼主发布的图片)。
原理:简单的遍历
注意:1. 图片会下载到指定的文件夹下(该文件夹需事先手动创建)。
2. 需拷贝目标贴吧的url地址,并去掉其尾部变量pn的值。例如美女吧的url为:http://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=50 ,
去掉最后面的50,再将其粘贴到代码中的指定位置即可。
代码如下:
#! /usr/bin/env python
#coding=utf-8
import os
import re
import sys
import time
import urllib
import urllib2
# Encoding used to re-encode the console messages printed by this script.
# NOTE: the name shadows the builtin ``type``; it is kept because the
# functions in this file reference it by this name.
# On Python 2, sys.getfilesystemencoding() may return None when the system
# default encoding is in use; passing None to str.encode() raises TypeError,
# so fall back to the interpreter's default encoding.
type = sys.getfilesystemencoding() or sys.getdefaultencoding()
# Folder the images are written to; change it to the folder you want.
# The script does not create it, so it must already exist. Keep the trailing
# separator: file names are appended to this string directly.
root = "D:\\meinv\\"
def tiebaImgiDownloader(url):
pattern = r'img class="BDE_Image" .*?src="(.*?jpg)"'
fstr = urllib2.urlopen(url+'?see_lz=1').read()
urllist = re.findall(pattern,fstr)
urllist = list( set(urllist) )
print '总共爬取%d个图片链接'.decode('UTF-8').encode(type) %len(urllist)
i = 1
for furl in urllist:
timestr = time.strftime('%Y%m%d%H%M%S')
'''urllib.urlretrieve(furl,timestr+'0%d.jpg'%i) 'C:\img\%s.jpg'%i
picName = timestr+'0%d.jpg'%i'''
urllib.urlretrieve(furl,root + timestr + '0%d.jpg'%i)
print '已保存图片'.decode('UTF-8').encode(type) ,timestr+'0%d.jpg'.decode('UTF-8').encode(type) %i
i+=1
#time.sleep(0.1)
print '图片下载完毕!'.decode('UTF-8').encode(type)
return True
def __main__():
print '欢迎使用贴吧jpg格式图片下载器!'.decode('UTF-8').encode(type)
for totalIndex in range(0,1000):# 暂时写的1000
print '现在开始第%d版'.decode('UTF-8').encode(type) % totalIndex
html = urllib.urlopen('http://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn='+str(totalIndex*50)).read() # 目标贴吧url地址粘贴在这里
pattern = r'a href="(.p.[0-9]*)"'
urllist = re.findall(pattern, html)
urllist = list(set(urllist))
preurl = r'http://tieba.baidu.com'
print '抓取第%d个二级网页'.decode('UTF-8').encode(type) %len( urllist )
i = 1
for urlOne in urllist:
print '正在抓取第%d个二级网页,总共%d个二级页面,当前第%d版'.decode('UTF-8').encode(type) % ( i , len( urllist ),totalIndex )
tiebaImgiDownloader(preurl + urlOne)
time.sleep(0.5)
i+=1
return 0
# Start the crawl only when this file is run as a script, not on import.
if __name__ == "__main__":
    __main__()