作业题目如下
- 对XKCD极客漫画网站上的图片进行爬取
- 根据Prev这个按钮自动进入上一个漫画的链接进行下载
- 重复第一步第二步
解决思路
慢慢一步步先下载一个网页上需要的png图像,之后递归即可
import requests,bs4
from urllib import request
import chardet
import os
import time
# Record the wall-clock start time so total runtime can be reported at the end.
timestart = time.time()
print('start%s' % timestart)
# #源码获得
# response = request.urlopen("http://xkcd.com")
# html = response.read()
# charset = chardet.detect(html)
# # 对该html进行编码的获取
# cha=charset['encoding']# 打印编码格式}'ascii'
# a=requests.get("http://xkcd.com").content
# print(a)
#下载图片的链接
# soup=bs4.BeautifulSoup(requests.get("http://xkcd.com").content,features='html.parser')
# c=soup.select('#comic img')
# for i in c:
# print(i['src'])
# for i in c:
# if i.getText()=='< Prev':
# print(i['href'])
# break
##地址为/html/body/div[2]/div[2]/img
# #下载手续
# b=requests.get("http://imgs.xkcd.com/comics/mbmbam.png").content
# with open('asd','wb') as f:
# f.write(b)
# #下一页的链接
# soup=bs4.BeautifulSoup(a,features='html.parser')
# c=soup.select('.comicNav a')
# print(c)
# for i in c:
# if i.getText()=='< Prev':
# print(i['href'])
# break
# #路径/html/body/div[2]/ul[1]/li[2]/a
# #循环
# Accumulators shared across calls: lisdata holds the relative URL of each
# "< Prev" link (e.g. '/2315/'); pic holds the protocol-relative image URLs
# (e.g. '//imgs.xkcd.com/comics/hair_growth_rate.png').
lisdata = []
pic = []


def xunhuan(src, n, j=0):
    """Crawl xkcd starting at *src*, following the '< Prev' link until
    n pages in total have been visited.

    Appends each page's Prev href to the module-level ``lisdata`` list and
    each comic image URL to the module-level ``pic`` list.

    Args:
        src: absolute URL of the page to start from.
        n: total number of pages to visit.
        j: number of pages already visited (kept for backward
           compatibility with the original recursive signature).

    Returns:
        The ``(lisdata, pic)`` pair of accumulator lists.
    """
    # Iterate instead of recursing: this avoids Python's recursion limit
    # for a large n, and fixes the original bug where the recursive branch
    # discarded the return value, so the top-level call returned None.
    while int(j) < int(n):
        # Fetch and parse each page exactly once. The original downloaded
        # every page twice (once decoded as ascii for the nav links — which
        # would crash on any non-ASCII byte — and once for the image).
        soup = bs4.BeautifulSoup(requests.get(src).content,
                                 features='html.parser')
        # Find the '< Prev' navigation link and record its relative href.
        for link in soup.select('.comicNav a'):
            if link.getText() == '< Prev':
                lisdata.append(link['href'])
                break
        # Record the comic image URL(s) shown on this page.
        for img in soup.select('#comic img'):
            pic.append(img['src'])
        j = j + 1
        # The Prev href is site-relative ('/2315/'), so prepend the root.
        src = 'https://xkcd.com' + lisdata[j - 1]
    return lisdata, pic


xunhuan('https://xkcd.com', 10)
# #下载
# #for i in data
'''得到的数据lisdata为图片编号,pic为下载图片的地址
lisdata
['/2315/']
pic
['//imgs.xkcd.com/comics/hair_growth_rate.png']'''
print(lisdata)
print(pic)
# Download every collected image. lisdata[idx] is the comic's relative URL
# (used only in the progress messages); pic[idx] is the protocol-relative
# image URL, so the 'http:' scheme is prefixed before fetching.
for idx, img_url in enumerate(pic):
    target = '/home/roy/PycharmProjects/untitled/venv' + str(idx) + '.png'
    with open(target, 'wb') as out:
        print('downloading' + lisdata[idx] + 'on' + img_url)
        out.write(requests.get('http:' + img_url).content)
    print('done' + lisdata[idx])
print('time cost' + str(time.time() - timestart))
#first time cost time cost149.1481626033783
介绍一款软件anaconda(高度集成,数据分析和爬虫很适合)
使用界面
有丰富的教程
和多样的应用
下载链接
https://www.anaconda.com/
爬坑注意事项:
UnboundLocalError: local variable 'DISTRO_NAME' referenced before assignment # 关于此问题的解决:更改(anaconda安装脚本)第159行中的缩进问题