# Without further ado, here is the code.
import os
import re
import urllib.request

from lxml import etree
def gethtml(start):
    """Fetch one thread-listing page from Tieba and process each linked thread.

    The listing page is downloaded, the href of every thread-title link is
    extracted with XPath, and each href is passed to ``get_son_html``.

    Args:
        start: Value of the ``pn`` query parameter. 0 is the first page,
            50 the second page, and so on.
    """
    url = 'http://tieba.baidu.com/f?kw=%E9%AB%98%E8%80%83&ie=utf-8&pn=' + str(start)
    # Use a context manager so the HTTP response is closed once the body
    # has been read, instead of leaking the connection.
    with urllib.request.urlopen(url) as response:
        page = response.read()
    html = etree.HTML(page)
    # The class string (including its trailing space) must match the
    # attribute value exactly, so it is kept verbatim.
    thread_links = html.xpath('//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
    for link in thread_links:
        get_son_html(page, link)
def get_son_html(page, i):
    """Download a single thread page and pass its HTML text to ``getimg``.

    Args:
        page: Raw listing-page content supplied by the caller. It is not
            used here and is kept only so existing call sites keep working.
        i: Site-relative thread URL; it is appended to
            ``http://tieba.baidu.com`` to build the request URL.
    """
    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen('http://tieba.baidu.com' + i) as response:
        son_page = response.read().decode('utf-8')
    getimg(son_page, i)
def getimg(son_page, i, path='F:\\baidutieba\\'):
    """Download every ``.jpg`` image referenced in a thread page.

    Image URLs are found with a regular expression that matches
    ``src="http://imgsrc....jpg"`` attributes. Each image is saved under
    ``path`` using the last ``/``-separated segment of its URL as the file
    name; an existing file with the same name is overwritten.

    Args:
        son_page: HTML text of the thread page.
        i: Site-relative URL of the thread. Not used by this function; kept
            for compatibility with existing callers.
        path: Directory the images are written to. It is created if it
            does not exist yet.
    """
    # Make sure the target directory exists so open() below cannot fail
    # merely because the folder is missing.
    os.makedirs(path, exist_ok=True)

    # Every match contains at least "http://imgsrc" and ".jpg", so no
    # filtering of empty strings is required.
    img_urls = re.findall(r'src="(http://imgsrc.*?\.jpg)"', son_page)

    for img_url in img_urls:
        # Context managers close both the HTTP response and the output
        # file even if reading or writing raises.
        with urllib.request.urlopen(img_url) as response:
            data = response.read()
        target = os.path.join(path, img_url.split('/')[-1])
        with open(target, 'wb') as pic:
            pic.write(data)
        print('正在保存……')
if __name__ == '__main__':
    # Crawl the first five listing pages; page offsets advance in steps
    # of 50 (0, 50, 100, 150, 200).
    for offset in range(0, 250, 50):
        gethtml(start=offset)