# 使用BeautifulSoup分析网页数据,获取贴吧10页的美图
#-*-coding:utf-8-*-
import re,os,random
from urllib import request
from lxml import etree
from bs4 import BeautifulSoup as btf
import chardet
from time import sleep
def fun(a, b, c):
    """Report-hook for urlretrieve: print download progress.

    a -- number of blocks transferred so far
    b -- size of one block in bytes
    c -- total file size in bytes (may be -1/0 when the server
         sends no Content-Length header)
    """
    global size  # stash the total size so the caller can inspect it afterwards
    size = c
    if c <= 0:
        # Total size unknown: progress can't be computed; avoid ZeroDivisionError.
        return
    ratio = min(1.0 * a * b / c, 1)  # clamp to 100% on the final block
    print("%.1f%%" % (ratio * 100))
# HTTP request headers: spoof a desktop Firefox user agent and a tieba
# Referer so Baidu serves the real page instead of blocking the script.
header={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0",
"Referer":"http://tieba.baidu.com"
}
# Download every image linked from one page.
def get_pic(url):
    """Fetch one tieba page and download each .jpg/.png/.gif <img> found.

    url -- full URL of the tieba page to scrape.

    Images are saved into the current working directory; downloads smaller
    than 20 KB are assumed to be avatars/icons and deleted afterwards.
    """
    req = request.Request(url, data=None, headers=header)
    res = request.urlopen(req).read()
    html = etree.HTML(res)
    for img in html.xpath("//img"):
        src = img.get('src')  # .get() returns None for tags without src
        if not src:
            continue
        name = src.rsplit("/", 1)[-1]  # file name = last path component
        # Strip any query string before looking at the extension, so that
        # "pic.jpg?x=1" is still recognized as a .jpg.
        extension = os.path.splitext(name.split("?", 1)[0])[-1].lower()
        if extension not in ('.jpg', '.png', '.gif'):
            continue
        if "?" in name or "&" in name:
            # Irregular names can't be used as file names; invent one,
            # keeping the real extension instead of forcing ".jpg".
            name = str(int(random.random() * 1000)) + extension
        print('%s downloading starts' % src)
        try:
            # The reporthook `fun` prints progress and records the total
            # size into the module-global `size`.
            request.urlretrieve(src, name, fun)
        except Exception as e:
            print(e)
        else:
            if size < 20000:  # delete images smaller than ~20 KB
                os.remove(name)
if __name__ == "__main__":
    dirname = "tieba"
    path = os.path.join(os.getcwd(), dirname)  # local save directory
    if not os.path.isdir(path):
        os.mkdir(path)
    # urlretrieve saves files by bare name, so chdir into the target dir.
    os.chdir(path)
    # Fetch pages 1..10 of the thread (range end is exclusive, hence 11;
    # the original range(1, 10) only covered 9 pages).
    for i in range(1, 11):
        url = 'http://tieba.baidu.com/p/4831235586/' + '?pn=' + str(i)
        print('第 %s 页 starts' % i)
        get_pic(url)
        print("------------------------------------------------------")
        sleep(1)  # be polite: pause between page requests
# 效果如下