# 有感而发几篇博客:
# "While we teach we learn"
# 欢迎提问!
# 会改文件目录的,能直接run----2021.12.09
'''
2021-12-09
1.框架结构
.0头信息使用了用户代理(未使用ip代理)
.1爬取主页获得图片大类类别名称,并写入5.天堂图片网url爬取(目录).txt
.2根据拿出的名称获得其下的图片小类类别名称及url,并追加写入6.天堂图片网url爬取(分目录).txt
.3
'''
import time
from urllib import request
import re
import random
# Pool of User-Agent strings to rotate through (desktop Chrome, Edge and IE 11).
# BUG FIX: in the original, the comma after the Edge entry was *inside* the
# string literal, so Python implicitly concatenated the Edge and IE strings
# into one malformed User-Agent and the pool only held 2 entries.
uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
]
# Output directory for all scraped txt files and images.
# NOTE(review): hard-coded Windows-specific absolute path; trailing backslash required.
path = 'C:\\Users\\Administrator\\Desktop\\Python基础与应用\\爬虫\\爬到的东西\\'
def ua(uapool):
    """Install a global urllib opener whose User-Agent is picked at random from *uapool*."""
    agent = random.choice(uapool)
    header = ('User-Agent', agent)
    opener = request.build_opener()
    opener.addheaders = [header]
    # Every subsequent request.urlopen()/urlretrieve() call uses this opener.
    request.install_opener(opener)
    print('全局用户代理创建成功当前User-Agent:', header)
#获得ivsky一级目录并写入txt
def ivskyoneurl():
    """Scrape the top-level (major) category list from ivsky.com and write it
    to '5.天堂图片网url爬取(目录).txt' under *path*.

    Each output line is "<relative-url> <category-name>".
    """
    url = 'https://www.ivsky.com/tupian/'
    # Captures (relative url, category name) pairs from the category index page.
    pet = '"><a href="/(.*?)" title=".*?">(.*?)</a></li><li '
    data1 = request.urlopen(url).read().decode('utf-8', 'ignore')
    imagurl = re.compile(pet).findall(data1)
    print('类别名称数量:', len(imagurl))
    with open(path + '5.天堂图片网url爬取(目录).txt', 'w+', encoding='utf-8') as f:
        # Iterate the pairs directly instead of indexing via range(len(...)).
        f.writelines(" ".join(pair) + '\n' for pair in imagurl)
    print('5.天堂图片网url爬取(目录).txt写入成功!!!')
#获取5.天堂图片网url爬取(目录).txt内任一关键词
def daleiurlget():
    """Pick and return one random "url name" line from the major-category txt file."""
    filename = path + '5.天堂图片网url爬取(目录).txt'
    with open(filename, 'r+', encoding='utf-8') as f:
        lines = f.readlines()
    print("从5.天堂图片网url爬取(目录).txt中读取成功!!!")
    chosen = random.choice(lines)
    print('将要爬取的类别名称(大类)以及url:', chosen)
    return chosen
#得到大类类别下的小类url并写入文本文档
def xiaoleiurlget(urlandname):
    """Given a "url name" line for a major category, scrape its sub-category
    (url, name) pairs and append them to '6.天堂图片网url爬取(分目录).txt'.

    Args:
        urlandname: One line from the major-category file, "relative-url name\n".
    """
    urle = re.search('(.*?) ', urlandname)  # url part before the first space
    # BUG FIX: group(0) is the whole match *including* the trailing space,
    # which produced a URL with an embedded space; group(1) is just the url.
    url = 'https://www.ivsky.com/' + urle.group(1)
    print('获得具体图片类别url:', url)
    # Captures (relative url, sub-category name) pairs from the category page.
    pet = '</a> <a href="(.*?)" ti.*?>(.*?)</a>'
    dataurl = request.urlopen(url).read().decode('utf-8', 'ignore')
    imagurl = re.compile(pet).findall(dataurl)
    print(imagurl)
    with open(path + '6.天堂图片网url爬取(分目录).txt', 'a', encoding='utf-8') as f:
        for pair in imagurl:
            f.write(" ".join(pair) + '\n')
    print('写入 6.天堂图片网url爬取(分目录).txt成功!!!')
#小类url随机拿出,返回一个url
def takeouturl():
    """Pick a random sub-category line from the txt file and return its page URL."""
    filename = path + '6.天堂图片网url爬取(分目录).txt'
    with open(filename, 'r+', encoding='utf-8') as f:
        lines = f.readlines()
    print("6.天堂图片网url爬取(分目录).txt中读取成功!!!")
    chosen = random.choice(lines)
    print('将要爬取的具体url及名称', chosen)
    # group(0) keeps the trailing space of the match; downloadimg() strips it.
    match = re.search('(.*?) ', chosen)
    urlpage = 'https://www.ivsky.com' + match.group(0)
    print('将要爬取的具体大类下的url:', urlpage)
    return urlpage
'''
pet2 = '</div><p><a href="(.*?)" '
urlbigimg = 'https://www.ivsky.com/' + pet2 #具体图片链接的上一层
pet1= '</script><img id="imgis".*?//(.*?).jpg.*? alt="(.*?)"></a>'
urlimg = 'https://'+pet1+'.jpg' #具体图片链接,进入此链接后才能下载图片
'''
#下载图片至本地
def downloadimg(urlpage, pa=None, pe=None):
    """Download every image from pages *pa*..*pe* of a sub-category listing to *path*.

    Args:
        urlpage: Base URL of the sub-category listing (page suffix is appended).
        pa: First page number (inclusive); prompted for interactively when None.
        pe: Last page number (inclusive); prompted for interactively when None.
    """
    # pet1 captures (image path, alt text) from a single-image page;
    # pet2 captures links to the single-image pages from a listing page.
    pet1 = '</script><img id="imgis".*?//(.*?).jpg.*? alt="(.*?)"></a>'
    pet2 = '</div><p><a href="(.*?)" '
    # Backward compatible: prompting behavior is kept when pages are not given.
    if pa is None:
        pa = int(input("请输入爬取开始页数:"))
    if pe is None:
        pe = int(input("请输入爬取结束页数:"))
    print(urlpage)
    urlpage = urlpage.strip()  # takeouturl() returns the URL with a trailing space
    # Compile the patterns once, outside the loops.
    img_page_re = re.compile(pet1)
    listing_re = re.compile(pet2)
    imgtxt = []  # downloaded file names, for the summary print at the end
    for page in range(pa, pe + 1):
        try:
            url = urlpage + 'index_' + str(page) + '.html'
            print(url)
            # Collect every single-image page link on this listing page.
            listing_html = request.urlopen(url).read().decode('utf-8', 'ignore')
            img_page_links = listing_re.findall(listing_html)
            print(img_page_links)
            for i in range(len(img_page_links)):
                try:
                    # Small random delay so we don't hammer the server.
                    time.sleep(random.uniform(0.1, 0.5))
                    urlbigimg = 'https://www.ivsky.com' + img_page_links[i]
                    # Page that contains the full-size image address.
                    img_page_html = request.urlopen(urlbigimg).read().decode('utf-8', 'ignore')
                    matches = img_page_re.findall(img_page_html)
                    joined = ' '.join(matches[0])  # "host/img-path image-name"
                    # Split back into the image path ("/...") and the name part.
                    wangzhi = re.search('/(.*?) ', joined).group(0).strip()
                    hanzi = re.search(' (.*)', joined).group(0).strip()
                    print(wangzhi)
                    print(hanzi)
                    # Local file name for the downloaded image.
                    downlownimg = path + str(page) + hanzi + str(i) + '.jpg'
                    imgtxt.append(str(page) + hanzi + str(i) + '.jpg')
                    # The actually downloadable image URL.
                    urlimg = 'https://img-pre.ivsky.com' + wangzhi + '.jpg'
                    print(urlimg)
                    request.urlretrieve(urlimg, downlownimg)
                except Exception as err:
                    print('---------2层错误:-----------', err)
        except Exception as err:
            print('---------1层错误:-----------', err)
    print('获得的图片名称:', imgtxt)


# NOTE(review): dead code kept from the original source (a commented-out
# sampling of every 5th link when page < 25):
#     if page < 25:
#         databigimaghtmlset1 = []
#         l = len(databigimaghtmlset)
#         for i in range(0, l, 5):
#             a = databigimaghtmlset[i]
#             databigimaghtmlset1.append(a)
#         databigimaghtmlset = databigimaghtmlset1
def main():
    """Run the full pipeline: install a User-Agent, scrape the category lists,
    then download the images of one randomly chosen sub-category."""
    ua(uapools)                       # install the global User-Agent opener
    ivskyoneurl()                     # write the major categories to txt
    xiaoleiurlget(daleiurlget())      # pick a major category, write its sub-categories
    downloadimg(takeouturl())         # pick a sub-category and download its images
# Script entry point: run the whole scraping pipeline once.
if __name__ == '__main__':
    main()