import http.client
import os
import re
import urllib.request
# Fetch the HTML of a URL.
# NOTE: the body is always decoded as UTF-8; pages in another encoding fail
# to decode and are reported as None.
def getpage(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The decoded page text, or None when the request or the decoding
        fails (best-effort behaviour: callers must check for None).
    """
    try:
        req = urllib.request.Request(url)
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req) as res:
            return res.read().decode('utf-8')
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError and socket failures; ValueError
        # covers malformed URLs and UnicodeDecodeError; HTTPException covers
        # protocol-level failures such as an incomplete read.  Anything else
        # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return None
def getpagenum(page):
    """Extract the number attached to the "last" pagination link of a page.

    Looks for ``pn=<digits>" class="last pagination-item`` in ``page`` and
    returns the digits of the first occurrence as an int.  The value is also
    printed.  (Presumably this is the ``pn`` offset of the final listing
    page -- confirm against the site markup.)

    Args:
        page: HTML text of a listing page; None or an empty string is allowed.

    Returns:
        The captured number as an int, or None when ``page`` is empty/None
        or contains no such link.
    """
    if not page:
        return None
    # Require at least one digit: with ``[0-9]*`` an empty capture was
    # possible and int('') would raise ValueError.
    match = re.search(r'pn=([0-9]+)" class="last pagination-item', page)
    if match is None:
        return None
    item = match.group(1)
    print('该贴吧总页数为' + item)
    return int(item)
# Get each thread's URL path and the thread author.
def getthreadlist(page):
    """Return ``(thread_path, author)`` pairs found in a listing page.

    For every ``<div class="threadlist_lz clearfix">`` section the first
    ``/p/<digits>`` path after it is paired with the text of the next
    ``<a data-field=...>`` element.

    Args:
        page: HTML text of a listing page; None or an empty string is allowed.

    Returns:
        A list of ``(path, author)`` string tuples in page order; an empty
        list when ``page`` is empty/None or nothing matches.
    """
    # A failed fetch yields None; report "no threads" instead of letting
    # re.findall raise TypeError.
    if not page:
        return []
    pat = re.compile(
        r'<div class="threadlist_lz clearfix">.+?(/p/[0-9]+).+?<a data-field=.*?>(.*?)</a>',
        re.S,  # sections span several lines, so '.' must match newlines
    )
    return pat.findall(page)
# Open a thread, fetch the images posted in it, tally posters and image
# counts, and save the images together with the ids.
# Earlier debugging calls, left disabled:
##print(getpage('http://tieba.baidu.com/f?kw=%E8%85%BF&ie=utf-8&pn=100'))
##getpagenum(getpage('http://tieba.baidu.com/f?kw=%E8%85%BF&ie=utf-8&pn=100'))
# Fetch one listing page and parse its thread list; the return value is
# discarded (looks like a leftover debug call -- confirm before removing).
getthreadlist(
    getpage('http://tieba.baidu.com/f?kw=%E8%85%BF&ie=utf-8&pn=100')
)
# Get the list of image URLs in a thread page.
def getimglist(page):
    """Return the forum image URLs (``.jpg``) found in a thread page.

    Matches URLs that start with
    ``http://imgsrc.baidu.com/forum/w%3D580/sign=`` and end in ``.jpg``.

    Args:
        page: HTML text of a thread page; None or an empty string is allowed.

    Returns:
        A list of URL strings in page order (duplicates kept); an empty list
        when ``page`` is empty/None or nothing matches.
    """
    if not page:
        return []
    # Dots in the host are escaped so they match literally, and the URL body
    # may not contain quotes, whitespace or angle brackets.  The previous
    # ``.*?jpg`` could start at a non-jpg image URL and swallow the markup
    # up to the next "jpg" on the same line.
    pat = re.compile(
        r'''http://imgsrc\.baidu\.com/forum/w%3D580/sign=[^"'\s<>]*?\.jpg'''
    )
    return pat.findall(page)
def saveimg(imgURL, fileName):
    """Download ``imgURL`` and write its raw bytes to ``fileName``.

    Args:
        imgURL: Address of the image to download.
        fileName: Path of the file to create or overwrite (binary mode).

    Returns:
        None.  A progress message naming the file is printed.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``open``/``write`` raise;
        errors are not suppressed here.
    """
    # Context managers guarantee that both the connection and the file are
    # closed even when reading or writing fails part-way.
    with urllib.request.urlopen(imgURL) as response:
        data = response.read()
    with open(fileName, 'wb') as out:
        out.write(data)
        print('正在保存一张图片为' + fileName)
# Fetch one fixed thread and print the image URLs found in it
# (looks like a leftover debug call -- confirm before removing).
print(
    getimglist(
        getpage('http://tieba.baidu.com/p/4608598265')
    )
)
# Create a new directory.
def mkdir(self, path):
    """Create directory ``path`` (including parents) unless it already exists.

    Args:
        self: Not used by the body; kept so the signature stays unchanged.
        path: Directory path; leading/trailing whitespace is stripped first.

    Returns:
        True if the directory was created, False if the path already existed.

    Raises:
        OSError: If creation fails for any reason other than the path
            already existing (e.g. an empty path or missing permissions).
    """
    path = path.strip()
    try:
        # Try to create first rather than exists()-then-create: a directory
        # that appears between the check and the call would otherwise make
        # makedirs() raise.
        os.makedirs(path)
    except OSError:
        # Path already present: report "not created", as before.
        if os.path.exists(path):
            return False
        raise
    return True
# Crawl the forum listing page by page and download every image found in
# each listed thread.
base_url = 'http://tieba.baidu.com/f?kw=%E8%85%BF&ie=utf-8&pn='

# getpagenum() parses HTML, so the first listing page is fetched before
# calling it; the bare URL passed previously could never contain the
# pagination link, which left pagenum permanently unset.
first_listing = getpage(base_url + '0')
pagenum = getpagenum(first_listing) if first_listing is not None else None
# Without a usable pagination value only the first listing page is crawled.
last_offset = pagenum if pagenum is not None else 0

nownum = 0
imgnum = 1
while nownum <= last_offset:
    # Reuse the page already downloaded for offset 0.
    listing = first_listing if nownum == 0 else getpage(base_url + str(nownum))
    # getpage() yields None on failure; skip such pages instead of crashing
    # inside the regex helpers.
    threads = getthreadlist(listing) if listing is not None else []
    for thread in threads:
        thread_page = getpage('http://tieba.baidu.com' + thread[0])
        if thread_page is None:
            continue
        for img in getimglist(thread_page):
            # File name: author id + '__' + last 8 characters of the image URL.
            filename = thread[1] + '__' + img[-8:]
            saveimg(img, filename)
            print(img, filename)
            print('pictures are ' + str(imgnum))
            imgnum += 1
    # Listing pages are addressed through pn= in steps of 50.
    nownum = nownum + 50