Images
Scraping a single page of images
# Site: https://pic.netbian.com/  We scrape: https://pic.netbian.com/4kmeinv/
from lxml import etree
import requests
import os

if __name__ == "__main__":
    if not os.path.exists('zhaopian'):
        os.mkdir('zhaopian')
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    url = 'https://pic.netbian.com/4kmeinv/'
    response = requests.get(url=url, headers=headers)
    # response.encoding = 'utf-8'
    resp = response.text
    # Parse the page
    tree = etree.HTML(resp)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        tupian_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        # print(tupian_src)
        name = li.xpath('./a/b/text()')[0] + '.jpg'
        # The site is GBK-encoded; re-encode the mis-decoded name to repair it
        name = name.encode('iso-8859-1').decode('gbk')
        # print(name)
        data = requests.get(url=tupian_src, headers=headers).content
        # Output path
        path = 'zhaopian/' + name
        with open(path, 'wb') as fp:
            fp.write(data)
        print("Downloaded successfully!!!")
For comparison, here is someone else's version of the same single-page scraper:

import os
import requests
from lxml import etree

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.5211 SLBChan/25"
}
url = "https://pic.netbian.com/4kmeinv/"
girl_data = requests.get(url=url, headers=header).text
# Repair the mojibake: re-encode the mis-decoded text, then decode it as GBK
girl_data = girl_data.encode("iso-8859-1").decode('gbk')
# Build the element tree
girl_etree = etree.HTML(girl_data)
# The XPath expressions are explained after the code
picture_loc = girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@src")
picture_name_list = girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@alt")
# Create the output folder
if not os.path.exists("you_knew_about_picture"):
    os.mkdir("./you_knew_about_picture")
# enumerate() gives an index into picture_name_list alongside each URL
for i, each_loc in enumerate(picture_loc):
    # Build the full URL
    new_loc = "https://pic.netbian.com/" + each_loc
    # Fetch the image bytes
    each_picture_data = requests.get(new_loc, headers=header).content
    # each_picture_name combines the folder path and the file name
    each_picture_name = "you_knew_about_picture/" + picture_name_list[i] + ".jpg"
    # Open the file
    fp = open(each_picture_name, mode="wb")
    # Write the bytes
    fp.write(each_picture_data)
    fp.close()
    # Report progress
    print(each_picture_name.split("/")[-1] + " has been downloaded")
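Since the comments promise an explanation of the XPath expressions, here is a toy sketch of what they return: //ul[@class='clearfix']/li/a/img/@src walks from each <ul class="clearfix"> through its <li> and <a> children down to the <img>, and the trailing @src / @alt steps pull the attribute values out as plain strings.

from lxml import etree

# A hypothetical fragment shaped like the site's thumbnail list
fragment = """
<ul class="clearfix">
  <li><a href="/tupian/1.html"><img src="/uploads/a.jpg" alt="picture one"><b>picture one</b></a></li>
  <li><a href="/tupian/2.html"><img src="/uploads/b.jpg" alt="picture two"><b>picture two</b></a></li>
</ul>
"""
tree = etree.HTML(fragment)
print(tree.xpath("//ul[@class='clearfix']/li/a/img/@src"))  # ['/uploads/a.jpg', '/uploads/b.jpg']
print(tree.xpath("//ul[@class='clearfix']/li/a/img/@alt"))  # ['picture one', 'picture two']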
Scraping multiple pages of images
# Scraping multiple pages. Page 1: https://pic.netbian.com/4kmeinv/
# Page 2: https://pic.netbian.com/4kmeinv/index_2.html
# Page 3: https://pic.netbian.com/4kmeinv/index_3.html
import requests
from lxml import etree
import os

# Scrape pages 2-4
if __name__ == "__main__":
    if not os.path.exists('zhao'):
        os.mkdir('zhao')
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    for i in range(2, 5):
        url = f'https://pic.netbian.com/4kmeinv/index_{i}.html'
        # print(url)
        resp = requests.get(url=url, headers=headers).text
        tree = etree.HTML(resp)
        li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            src_url = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            # print(src_url)
            src_name = li.xpath('./a/b/text()')[0] + '.jpg'
            # Repair the GBK file name, as in the single-page version
            name = src_name.encode('iso-8859-1').decode('gbk')
            # print(name)
            data = requests.get(url=src_url, headers=headers).content
            path = 'zhao/' + name
            with open(path, 'wb') as fp:
                fp.write(data)
            print('Downloaded successfully!!!')
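One optional refinement, not used in the code above: every image is fetched with a fresh requests.get call, so a requests.Session, which reuses the underlying connection and carries the headers for you, can speed up the inner loop a little:

import requests

session = requests.Session()
session.headers.update({'user-agent': 'Mozilla/5.0'})  # shortened UA for the sketch
# Drop-in replacement for requests.get(url=src_url, headers=headers) above;
# the image URL here is hypothetical, for illustration only.
data = session.get('https://pic.netbian.com/uploads/example.jpg').content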
Someone else's code
import os
import requests
from lxml import etree

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.5211 SLBChan/25"
}
picture_loc = []        # collected image URLs
picture_name_list = []  # collected image names
# Pages 2 and 3; adjust the range as needed.
# Page 1 cannot be included as-is because its URL differs from the others;
# to include it, add an if branch (see the sketch after this code).
for i in range(2, 4):
    # This assignment must stay inside the loop: url.format(i) overwrites url,
    # so a template set once outside would stay
    # "https://pic.netbian.com/4kmeinv/index_2.html" on every later pass.
    # (Why the pattern is /4kmeinv/index_{0}.html is explained after the code.)
    url = "https://pic.netbian.com/4kmeinv/index_{0}.html"
    url = url.format(i)
    girl_data = requests.get(url=url, headers=header).text
    girl_data = girl_data.encode("iso-8859-1").decode('gbk')
    girl_etree = etree.HTML(girl_data)
    # Collect the image URLs
    picture_loc.extend(girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@src"))
    # Collect the image names
    picture_name_list.extend(girl_etree.xpath("//ul[@class='clearfix']/li/a/b/text()"))
if not os.path.exists("you_knew_about_picture"):
    os.mkdir("./you_knew_about_picture")
a = 0  # counts the downloaded images
for i, each_loc in enumerate(picture_loc):
    new_loc = "https://pic.netbian.com/" + each_loc
    each_picture_data = requests.get(new_loc, headers=header).content
    each_picture_name = "you_knew_about_picture/" + str(a) + " . " + picture_name_list[i] + ".jpg"
    fp = open(each_picture_name, mode="wb")
    fp.write(each_picture_data)
    fp.close()
    print(each_picture_name.split("/")[-1] + " has been downloaded")
    a = a + 1
print(a)
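The comments in this version point out that page 1 cannot join the loop directly because its URL lacks the index_N suffix. A minimal sketch of the if branch they suggest, assuming the URL patterns quoted at the top of this section:

# Cover pages 1-3 in one loop; page 1 is the only special case
for i in range(1, 4):
    if i == 1:
        url = "https://pic.netbian.com/4kmeinv/"
    else:
        url = "https://pic.netbian.com/4kmeinv/index_{0}.html".format(i)
    # ...then fetch and parse url exactly as in the loop above...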