给大佬们观赏观赏,爬取美女图片,各位大佬们小心身体哈。
# Goal: crawl every listing page under a tag on umei.cc and save all
# gallery thumbnails to a local folder.
# Entry listing URL: http://www.umei.cc/tags/meishaonv_1.htm
import os
import time

import requests                 # HTTP client: fetches pages and image bytes
from bs4 import BeautifulSoup   # HTML parser: extracts links and <img> srcs

BASE_URL = 'http://www.umei.cc'
START_URL = BASE_URL + '/tags/meishaonv_1.htm'
SAVE_DIR = 'D:/IT/图片/'        # download target; created automatically if missing
# One shared header dict instead of three identical copies.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/69.0.3497.100 Safari/537.36',
}


def fetch_soup(session, url):
    """Download *url* and return its parsed BeautifulSoup document."""
    resp = session.get(url, headers=HEADERS)
    # The site declares the wrong charset; trust the detected one instead.
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, 'html.parser')


def collect_page_urls(soup):
    """Return the absolute URL of every pager entry (<li class="hide">)."""
    pages = []
    for li in soup.find_all('li', class_='hide'):
        href = li.find('a').get('href')
        if href:
            # BUG FIX: the original also appended START_URL on every
            # iteration, crawling the first page once per pager entry.
            pages.append(BASE_URL + href)
    return pages


def collect_image_urls(soup):
    """Return the src of every <img> inside the page's TypeList section."""
    type_list = soup.find('div', class_='TypeList')
    if type_list is None:
        # Some pages have no gallery section; skip instead of crashing.
        return []
    return [img.get('src') for img in type_list.select('ul li a img')]


def download_images(session, image_urls):
    """Fetch each image URL and write it into SAVE_DIR."""
    os.makedirs(SAVE_DIR, exist_ok=True)  # original crashed if the dir was absent
    for src in image_urls:
        resp = session.get(src, headers=HEADERS)
        # Use the real filename from the URL, not a fixed 7-char slice
        # (the slice could collide or even contain a '/').
        name = src.rsplit('/', 1)[-1]
        with open(os.path.join(SAVE_DIR, name), 'wb') as f:
            f.write(resp.content)  # binary payload -> .content, never .text
        print('成功!', name)


def main():
    """Crawl the pager, gather all image URLs, then download them."""
    with requests.Session() as session:  # reuse one TCP connection pool
        first = fetch_soup(session, START_URL)
        # Include the entry page itself exactly once.
        page_urls = [START_URL] + collect_page_urls(first)
        image_urls = []
        for page in page_urls:
            time.sleep(2)  # be polite to the server between page fetches
            image_urls.extend(collect_image_urls(fetch_soup(session, page)))
        download_images(session, image_urls)


if __name__ == '__main__':
    main()
运行结果:(截图略)