Python,爬取整个风景吧每一页的每个帖子里的图片
#!/usr/bin/env python
# coding:utf-8
import re
import urllib
__author__ = 'WL'
x = 1  # image counter: downloadImg uses it to number the saved file and increments it after each download
urllist = []  # rebound by geturl to the list of matches it extracted from the last page it parsed
# Build the address of every listing page
def pageurl():
    """Return the list of listing-page URLs to crawl.

    Each URL is the fixed forum address followed by a ``pn`` value taken
    from 100, 150, ..., 26200 (step 50).
    """
    base = 'http://tieba.baidu.com/f?kw=%E9%A3%8E%E6%99%AF&ie=utf-8&pn='
    return [base + str(offset) for offset in range(100, 26250, 50)]
# Fetch the HTML of one page
def getPage(url):
    """Download ``url`` and return the response body as a string.

    The response object is always closed, even if reading fails, so that
    a long crawl does not accumulate open connections.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()
# Extract the address (the part after /p/) of every post on a listing page
def geturl(url):
    """Return the post ids linked from a listing page.

    url: despite the name, this is the HTML text of the listing page
         (the caller passes in the result of getPage).

    Returns a list with one string per ``<a href="/p/...">`` link that is
    followed by a ``title`` attribute. The list is also stored in the
    module-level ``urllist`` and printed.
    """
    global urllist
    # Non-greedy capture: with a greedy ``.+`` several links on the same
    # line would be merged into a single bogus match.
    urlreg = re.compile(r'<a href="/p/(.+?)".title')
    # Search the argument itself rather than a global that happens to be
    # named ``html``.
    urllist = urlreg.findall(url)
    print(urllist)
    return urllist
# Extract the image addresses from one post
def getImg(html):
    """Return the .jpg URLs of the ``BDE_Image`` images found in ``html``.

    html: HTML text of a post page.

    The list of matches is printed before it is returned.
    """
    pattern = re.compile(r'img class="BDE_Image".+?src="(http://imgs.+?\.jpg)".size')
    found = pattern.findall(html)
    print(found)
    return found
# Download images
def downloadImg(imglist, path_template=r"C:\Users\wl\Desktop\ooo\%d.jpg"):
    """Download every URL in ``imglist`` to a numbered file.

    imglist:       iterable of image URLs.
    path_template: destination path containing one ``%d`` placeholder that
                   is filled with the running image number. The default is
                   the original hard-coded location, written as a raw
                   string so the backslashes can never be read as escape
                   sequences.

    The module-level counter ``x`` supplies the number and is incremented
    after each download, so numbering continues across calls.
    """
    global x
    for img_url in imglist:
        urllib.urlretrieve(img_url, path_template % x)
        print("下载第%d张图片!" % x)
        x += 1
# Crawl: for every listing page, visit each linked post and download its images.
for listing_address in pageurl():
    # NOTE: the page text stays bound to the module-level name ``html``
    # because other code in this file may read that global.
    html = getPage(listing_address)
    for post_id in geturl(html):
        post_address = "http://tieba.baidu.com/p/" + post_id
        print(post_address)
        downloadImg(getImg(getPage(post_address)))