一、爬取目标
爬虫网址: http://bbs.talkop.com/forum.php?mod=viewthread&tid=54143&page=1&authorid=2
爬虫任务: 点击只看作者 ,爬取62页的图片, 由于url中包含了日期信息,所以根据url中的日期在本地创建目录后在指定目录存档
二、分析
- 先获取62个网页,每个网页的地址
网址形如 http://bbs.talkop.com/forum.php?mod=viewthread&tid=54143&extra=&authorid=2&page=2 ,各个网页之间只有最后一个页码数字不同。
- 爬取单个页面中 img 标签的 src 属性,得到图片地址。
三、实战
- 实战过程中发现此网页需要登录才能查看,所以得先使用 Fiddler 4 抓包工具获取 cookies,再在 requests.get() 中添加参数:requests.get(url, cookies=cookies)。
- 爬下来的网页中 img 的 src 属性不会立即加载(即 src 属性中没有图片的 url),但 file 属性有,所以最后使用了 img 的 file 属性。
四、完整Python代码
import urllib.request
import requests
from bs4 import BeautifulSoup
import os
class TalkopDownload(object):
    """Downloader for the author-only images of a Talkop forum thread.

    Walks thread pages ``[startpage, endpage)``, collects image URLs from
    each post body, and saves them under ``path`` in subdirectories derived
    from segments of each image URL.
    """

    def __init__(self, url, startpage, endpage, path):
        self.head = url              # base page URL; page number is appended
        self.startpage = startpage   # first page (inclusive)
        self.endpage = endpage       # last page (exclusive)
        self.path = path             # local root directory for downloads
        # The forum requires a login, so session cookies must be captured
        # with a proxy tool (e.g. Fiddler) and filled in here as a dict.
        # BUG FIX: was a one-element *set* literal, which requests' cookies=
        # parameter cannot use; it must be a mapping of name -> value.
        self.cookie = {}             # e.g. {'auth': '...'} — fill in via packet capture
        self.directory = []          # per-page URLs to scrape
        self.url = []                # collected image URLs

    def get_download_url(self):
        """Build the list of page URLs, one per page in the configured range."""
        for page in range(self.startpage, self.endpage):
            self.directory.append(self.head + str(page))

    def get_content(self, target):
        """Collect image URLs from one page into ``self.url``.

        The ``src`` attribute is lazily loaded on this forum, so the real
        image URL lives in the ``file`` attribute instead; ``img.get('file')``
        returns None for images without it, which ``download`` skips.
        """
        req = requests.get(target, cookies=self.cookie).text
        bf = BeautifulSoup(req, 'html.parser')
        # 't_f' is the CSS class of the post-body container on this forum.
        contents = bf.find_all(attrs={'class': 't_f'})
        for content in contents:
            for img in content.find_all('img'):
                self.url.append(img.get('file'))

    def download(self, dl_url):
        """Save one image under a subdirectory of ``self.path``.

        Skips None entries (images with no 'file' attribute) and files that
        already exist on disk.
        """
        if dl_url is None:
            print("None")
            return
        parts = dl_url.split('/')
        filename = parts[-1]
        # BUG FIX: was the module-level global `path`; use self.path so the
        # class works outside this script.
        # Path segments [4] and [5] — presumably the date components of the
        # attachment URL (per the writeup above) — TODO confirm on a real URL.
        dirname = self.path + parts[4] + parts[5] + '/'
        # makedirs creates intermediate directories too and tolerates an
        # existing directory (os.mkdir failed on both).
        os.makedirs(dirname, exist_ok=True)
        if not os.path.exists(dirname + filename):
            urllib.request.urlretrieve(dl_url, dirname + filename)

    def download_manager(self):
        """Drive the full workflow: build page list, scrape, then download."""
        self.get_download_url()
        for page in self.directory:
            self.get_content(page)
        total = len(self.url)
        print("It will download %d picture" % total)
        # Preview at most the first 10 links. BUG FIX: was range(10), which
        # raised IndexError when fewer than 10 images were found.
        for link in self.url[:10]:
            print(link)
        for i, link in enumerate(self.url):
            print("downloading {:d} / {:d} have completed {:.2%}".format(
                i + 1, total, i / total))
            self.download(link)
if __name__ == '__main__':
    # Base thread URL (author-only view); the page index is appended by the
    # downloader for each page it fetches.
    url = ('http://bbs.talkop.com/forum.php?'
           'mod=viewthread&tid=54143&extra=&authorid=2&page=')
    # NOTE(review): TalkopDownload.download reads the global `path` —
    # keep this variable name.
    path = 'E:/Picture/'
    downloader = TalkopDownload(url, 1, 63, path)
    downloader.download_manager()