今天看了小甲鱼的视频,学着爬煎蛋网上的图片。由于是2014年的视频,有比较多的地方要改,还是花了一些功夫的。代码如下。
import urllib.request
import base64
import os
from datetime import datetime
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser User-Agent header is attached so the server does
    not reject the request as coming from a script.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605'
                   '.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15')  # spoof a browser UA
    # FIX: the original never closed the response object, leaking the
    # connection; the context manager closes it after reading.
    with urllib.request.urlopen(req) as response:
        return response.read()
def statard_form(page):
    """Build the jandan.net 'ooxx' URL for the given page number.

    The site encodes today's date plus the page number (e.g. '20210101-42')
    as Base64 and uses the result as the path component of the URL.
    """
    token = '{}-{}'.format(datetime.now().strftime('%Y%m%d'), page)
    encoded = base64.b64encode(token.encode('utf-8')).decode('utf-8')
    return 'http://jandan.net/ooxx/{}#comments'.format(encoded)
def get_pages(url):
    """Return the newest page number shown on *url*, as a string.

    Scrapes the '"current-comment-page">[NNN]' fragment out of the HTML.
    """
    marker = '"current-comment-page">['
    html = url_open(url).decode('utf-8')
    start = html.find(marker) + len(marker)
    end = html.find(']', start)
    return html[start:end]
def find_imgs(url):
    """Scrape *url* and return the full http URLs of its .jpg images.

    Walks every 'img src="' occurrence; addresses in the page are
    protocol-relative ('//...'), so 'http:' is prepended.
    """
    prefix = 'img src="'
    suffix = '.jpg'
    html = url_open(url).decode('utf-8')
    img_addrs = []
    pos = html.find(prefix)
    while pos != -1:
        end = html.find(suffix, pos, pos + 255)
        if end == -1:
            # No '.jpg' within range: resume scanning just past this tag.
            resume = pos + len(prefix)
        else:
            img_addrs.append('http:' + html[pos + len(prefix):end + len(suffix)])
            resume = end
        pos = html.find(prefix, resume)
    return img_addrs
def save_imgs(folder, img_addrs):
    """Download every image URL in *img_addrs* into the current directory.

    *folder* is accepted for the caller's convenience but unused here:
    the caller is expected to have chdir()'d into the target folder.
    """
    for addr in img_addrs:
        name = addr.split('/')[-1]  # last path component becomes the file name
        with open(name, 'wb') as out:
            out.write(url_open(addr))
def download_mm(folder='ooxx', pages=10):
    """Interactively download *pages* pages of images from jandan.net/ooxx.

    Prompts for a folder name and a page count; pressing Enter at either
    prompt now actually falls back to the advertised defaults (the
    original always overwrote them, and crashed on int('') for an empty
    page-count input).
    """
    folder = input('请输入你要创建的文件夹名(default=ooxx):') or folder
    pages_input = input('请输入你要下载的页数(default=10):')
    pages = int(pages_input) if pages_input else pages
    os.makedirs(folder, exist_ok=True)  # FIX: don't crash if the folder already exists
    os.chdir(folder)                    # save everything into the new folder
    url = 'http://jandan.net/ooxx'      # landing page; reveals the newest page number
    page_num = int(get_pages(url))
    print('下载中···')
    for i in range(pages):
        # BUG FIX: the original did `page_num -= i`, a cumulative decrement
        # that visits offsets 0, 1, 3, 6, ... and skips pages. Walk
        # consecutive pages from the newest one instead.
        page_url = statard_form(page_num - i)
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
    print('下载完毕!')
if __name__ == '__main__':
    # Script entry point: run the interactive downloader.
    download_mm()
在mac的pycharm上运行效果:
文件夹中的下载图片情况: