# Crawl Baidu Tieba images (爬取百度贴吧图片)
import requests
from bs4 import BeautifulSoup
import urllib.request
def getHtml(url):
    """Download a web page and return its HTML text.

    Args:
        url: the page URL to fetch.

    Returns:
        The response body decoded with the detected (apparent) encoding.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    headers = {
        # A browser-like User-Agent avoids the degraded page some sites
        # serve to obvious scripts.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    }
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = response.apparent_encoding  # decode with detected charset
    return response.text
def getData(html):
    """Extract image source URLs from an HTML document.

    Args:
        html: HTML text to parse.

    Returns:
        A list of the ``src`` attribute values of every ``<img>`` tag
        that actually carries one.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Filtering on 'src' in attrs skips lazy-loaded / malformed <img> tags,
    # which would otherwise raise KeyError.
    return [img.attrs['src'] for img in soup.find_all('img') if 'src' in img.attrs]
if __name__ == '__main__':
    import os

    url = "https://tieba.baidu.com/p/4803144798"
    html = getHtml(url)
    img = getData(html)
    save_dir = "G:\\images"
    # Create the target directory up front so open() below cannot fail
    # with FileNotFoundError on a fresh machine.
    os.makedirs(save_dir, exist_ok=True)
    imgName = 0  # count of images downloaded so far; also used as the file name
    for href in img:
        # Skip protocol-relative, data:, and otherwise malformed src links.
        if href[0:4] == 'http':
            name = save_dir + "\\" + str(imgName) + ".jpg"
            # Context managers guarantee both the HTTP connection and the
            # output file are closed even if the download fails mid-write.
            with urllib.request.urlopen(href) as conn, open(name, 'wb') as f:
                f.write(conn.read())  # write the image bytes to disk
            imgName += 1
            print('正在下载第%s张图片' % imgName)