import os
import webbrowser

import bs4
import requests
#webbrowser.open("https://Xkcd.com")
number = 1
PreveUrl = 'https://Xkcd.com'
def downLoadImage(url, number):
    """Download the comic image from the xkcd page at *url*.

    The image is saved as ``image/<number>.png``; the ``image`` directory is
    created on first use. Returns the absolute URL of the previous comic's
    page so the caller can walk backwards through the archive.
    """
    XkcdHtml = requests.get(url)  # fetch the page HTML
    XkcdHtml.raise_for_status()   # fail loudly on HTTP errors instead of parsing an error page
    XkcdBs = bs4.BeautifulSoup(XkcdHtml.text, "html.parser")
    # Select the comic image itself (#comic img) rather than grabbing the
    # second <img> on the page, which breaks whenever the page layout changes.
    imgUrlList = XkcdBs.select('#comic img')
    imageData = requests.get('http:' + imgUrlList[0].get('src'))  # src is protocol-relative (//imgs.xkcd.com/...)
    imageData.raise_for_status()
    os.makedirs('image', exist_ok=True)  # original assumed ./image already existed
    # 'with' guarantees the file handle is closed even if a write fails.
    with open('image/' + str(number) + '.png', 'wb') as imageFile:
        for chunk in imageData.iter_content(100000):
            imageFile.write(chunk)
    # Take the "prev" link explicitly instead of indexing into all a[rel] tags.
    preve = XkcdBs.select('a[rel="prev"]')
    preveUrl = "https://Xkcd.com" + preve[0].get('href')
    return preveUrl
# Walk backwards from the newest comic until we reach comic #1.
while True:
    PreveUrl = downLoadImage(PreveUrl, number)
    number += 1
    # Bug fix: the original tested PreveUrl.strip('/') == '1', but strip('/')
    # only removes slashes from both ENDS of the string (e.g.
    # 'https://Xkcd.com/1/' -> 'https:/Xkcd.com/1'), so the condition could
    # never be true and the loop never terminated. Stop once the "prev" link
    # points at comic #1, or at '#' on the very first comic.
    if PreveUrl.rstrip('/').endswith('/1') or PreveUrl.endswith('#'):
        break
# 书本上的参考答案:
# Book answer: download every xkcd comic into ./xkcd, walking the "Prev"
# links backwards until the link ends with '#' (the terminator on comic #1).
url = 'https://xkcd.com'
os.makedirs('xkcd', exist_ok=True)  # requires 'import os' (missing from the original file)
while not url.endswith('#'):
    # Download the page.
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Find the URL of the comic image.
    comicElem = soup.select('#comic img')
    if comicElem == []:
        print('Could not find comic image')
    else:
        comicUrl = 'http:' + comicElem[0].get('src')  # src is protocol-relative
        print('Downloading image %s...' % comicUrl)  # fixed typo: was 'Dowunloading'
        res = requests.get(comicUrl)
        res.raise_for_status()
        # Save the image to ./xkcd; 'with' closes the file even on error
        # (the original leaked the handle if a write raised).
        with open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb') as imageFile:
            for chunk in res.iter_content(100000):
                imageFile.write(chunk)
    # Get the Prev button's url.
    prevLink = soup.select('a[rel="prev"]')[0]
    url = 'http://xkcd.com' + prevLink.get('href')
print('Done.')
# 总结:爬取图片的步骤:
# 第一步:先获取网页的源码 res = requests.get(url),得到 Response 对象
# 第二步:将源码文本转换为 BeautifulSoup 对象,soup = bs4.BeautifulSoup(res.text, 'html.parser')
# 第三步:使用 CSS 选择器筛选内容。(上一级标签有 id 时,选择器写作 '#上一级标签的id 目标标签';没有 id 时,写作 '目标标签[属性="属性值"]')
# 第四步:再请求图片本身的 URL,res = requests.get(comicUrl)
# 第五步:分块下载并写入文件,for chunk in res.iter_content(100000)