目录
2020.10.25日更新
需要改动有两个方面:
第一方面,网址编码有变化,如http://jandan.net/ooxx/MjAyMDEwMjUtODY=#comments,尾部的为base64编码,在网址构造时对应处使用base64编码即可。
第二方面,图片地址发生了小改变,只需要将图片地址加入列表时,增加 http: 即可。
代码:
import urllib.request
import os
import base64
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser-like User-Agent header is installed so the request looks
    like it comes from a normal desktop browser rather than a script.

    :param url: page or image URL to fetch
    :return: response body as ``bytes``
    """
    opener = urllib.request.build_opener()
    # Attach a desktop Chrome User-Agent to the opener.
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'),
    ]
    # Install globally so every subsequent urlopen() uses these headers.
    urllib.request.install_opener(opener)
    # Bug fix: the original built an unused Request object and never
    # closed the response; use a context manager instead.
    with urllib.request.urlopen(url) as response:
        return response.read()
def get_page(url):
    """Scrape the newest comment-page number from *url* and return it as a string."""
    content = url_open(url).decode('utf-8')
    # Skip past the 'current-comment-page' marker plus 23 chars of markup
    # to land on the digits; the number is terminated by ']'.
    begin = content.find('current-comment-page') + 23
    finish = content.find(']', begin)
    return content[begin:finish]
def find_imgs(url):
    """Return a list of absolute .jpg image URLs found in the page at *url*."""
    page = url_open(url).decode('utf-8')
    addresses = []
    start = page.find('img src=')
    while start != -1:
        # Look for a '.jpg' extension within 255 characters of the tag.
        end = page.find('.jpg', start, start + 255)
        if end == -1:
            # No jpg nearby: resume scanning just past this 'img src='.
            end = start + 9
        else:
            # Slice out the address (skip `img src="`, keep `.jpg`) and
            # prepend the scheme, which the site omits.
            addresses.append('http:' + page[start + 9:end + 4])
        start = page.find('img src=', end)
    return addresses
def save_imgs(folder, img_addrs):
    """Download each URL in *img_addrs* into the current working directory.

    The file name is the last path component of the URL.
    NOTE(review): *folder* is unused here — the caller chdir()s beforehand.
    """
    for addr in img_addrs:
        name = addr.split('/')[-1]
        with open(name, 'wb') as handle:
            handle.write(url_open(addr))
def download_mm(folder='ooxx', pages=10):
    """Download the newest *pages* pages of images into *folder*.

    :param folder: directory to create/use for the downloads
    :param pages: number of pages to walk backwards from the newest one
    """
    os.makedirs(folder, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(folder)
    url = 'https://jandan.net/ooxx/'
    page_num = int(get_page(url))
    for i in range(pages):
        # Bug fix: the original did `page_num -= i`, which repeats the
        # first page (i=0) and then skips pages (cumulative subtraction);
        # step back exactly one page per iteration instead.
        current = page_num - i
        # The site encodes 'YYYYMMDD-<page>' as base64 in the URL.
        # NOTE(review): the date is hard-coded per the 2020-10-25 post and
        # must match the site's current date to resolve.
        s_page = '20201025-' + str(current)
        str_page = base64.b64encode(s_page.encode('utf-8')).decode('utf-8')
        page_url = url + str_page + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
# Run the downloader only when executed as a script, not on import.
if __name__ == '__main__':
    download_mm()
值得注意的是,照骗的网址http://jandan.net/ooxx/MjAyMDEwMjUtODY=#comments后边的base64码解码后为 20201025-86,也就是今天的日期+‘-’+照骗页码。猜测网址会随着日期而变化,如果成立,使用时可以将代码段的图示位置按照图中提示改一下即可。也可以编写一个日期读取代码,动态调整该位置的代码。
测试图如下
2020.11.10日更新
上述假设成立,代码更新,自动读取当前日期,因为图片地址随着日期变化。将此处代码
升级:
def get_time():
now_time=datetime.datetime.now().strftime('%Y-%m-%d')
list_time = list(now_time)
list_time.pop(4)
list_time.pop(6)
time_now = ''.join(list_time)
return time_now
time_now = get_time()
注意导入datetime包。
代码
import urllib.request
import os
import base64
import datetime
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser-like User-Agent header is installed so the request looks
    like it comes from a normal desktop browser rather than a script.

    :param url: page or image URL to fetch
    :return: response body as ``bytes``
    """
    opener = urllib.request.build_opener()
    # Attach a desktop Chrome User-Agent to the opener.
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'),
    ]
    # Install globally so every subsequent urlopen() uses these headers.
    urllib.request.install_opener(opener)
    # Bug fix: the original built an unused Request object and never
    # closed the response; use a context manager instead.
    with urllib.request.urlopen(url) as response:
        return response.read()
def get_page(url):
    """Scrape the newest comment-page number from *url* and return it as a string."""
    content = url_open(url).decode('utf-8')
    # Skip past the 'current-comment-page' marker plus 23 chars of markup
    # to land on the digits; the number is terminated by ']'.
    begin = content.find('current-comment-page') + 23
    finish = content.find(']', begin)
    return content[begin:finish]
def get_time():
    """Return the current local date as 'YYYYMMDD', e.g. '20201110'.

    Simplification: the original formatted '%Y-%m-%d' and then popped the
    two dashes out of a character list; strftime can emit the compact
    format directly.

    :return: eight-digit date string
    """
    return datetime.datetime.now().strftime('%Y%m%d')

# Module-level snapshot of today's date (kept for backward compatibility
# with code that referenced this global).
time_now = get_time()
def find_imgs(url):
    """Return a list of absolute .jpg image URLs found in the page at *url*."""
    page = url_open(url).decode('utf-8')
    addresses = []
    start = page.find('img src=')
    while start != -1:
        # Look for a '.jpg' extension within 255 characters of the tag.
        end = page.find('.jpg', start, start + 255)
        if end == -1:
            # No jpg nearby: resume scanning just past this 'img src='.
            end = start + 9
        else:
            # Slice out the address (skip `img src="`, keep `.jpg`) and
            # prepend the scheme, which the site omits.
            addresses.append('http:' + page[start + 9:end + 4])
        start = page.find('img src=', end)
    return addresses
def save_imgs(folder, img_addrs):
    """Download each URL in *img_addrs* into the current working directory.

    The file name is the last path component of the URL.
    NOTE(review): *folder* is unused here — the caller chdir()s beforehand.
    """
    for addr in img_addrs:
        name = addr.split('/')[-1]
        with open(name, 'wb') as handle:
            handle.write(url_open(addr))
def download_mm(folder='ooxx', pages=10):
    """Download the newest *pages* pages of images into *folder*.

    :param folder: directory to create/use for the downloads
    :param pages: number of pages to walk backwards from the newest one
    """
    os.makedirs(folder, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(folder)
    url = 'https://jandan.net/ooxx/'
    page_num = int(get_page(url))
    time_now = get_time()
    for i in range(pages):
        # Bug fix 1: the original did `page_num -= i`, which repeats the
        # first page (i=0) and then skips pages; step back one per loop.
        current = page_num - i
        # Bug fix 2: the original built `time_now + str(page_num)` without
        # the '-' separator, producing e.g. '2020102586' instead of the
        # '20201025-86' token the site expects, so every page URL was wrong.
        s_page = time_now + '-' + str(current)
        str_page = base64.b64encode(s_page.encode('utf-8')).decode('utf-8')
        page_url = url + str_page + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
# Run the downloader only when executed as a script, not on import.
if __name__ == '__main__':
    download_mm()
2021.6.27日更新,更新后可用
近日看了些爬虫,更新一下。
添加注释,修改bug,网站网址子目录有所更改 ooxx=>girl,变为http://jandan.net/girl/。当前时间该目录下共计1855张妹子图片。enjoy yourself!
import urllib.request
import os
import base64
import datetime
def url_open(url):
    """Open *url* and return its content.

    :param url: page or image URL to fetch
    :return: response body (page source or image data) as ``bytes``
    """
    opener = urllib.request.build_opener()
    # Attach a desktop Chrome User-Agent to the opener.
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'),
    ]
    # Install globally so every subsequent urlopen() uses these headers.
    urllib.request.install_opener(opener)
    # Bug fix: the original built an unused Request object and never
    # closed the response; use a context manager instead.
    with urllib.request.urlopen(url) as response:
        return response.read()
def get_page(url):
    """Find the newest page number of the image board, for iteration.

    :param url: front page of the image board
    :return: the latest page number, as a string of digits
    """
    content = url_open(url).decode('utf-8')
    # Skip past the 'current-comment-page' marker plus 23 chars of markup
    # to land on the digits; the number is terminated by ']'.
    begin = content.find('current-comment-page') + 23
    finish = content.find(']', begin)
    return content[begin:finish]
def get_time():
    """Return the current date in the site's format, e.g. '20210627'.

    Simplification: the original formatted '%Y-%m-%d' and then popped the
    two dashes out of a character list; strftime can emit the compact
    format directly.

    :return: eight-digit date string
    """
    return datetime.datetime.now().strftime('%Y%m%d')
def find_imgs(url):
    """Find the image addresses inside the given page.

    :param url: page URL to search
    :return: list of absolute .jpg image URLs found in the page
    """
    page = url_open(url).decode('utf-8')
    addresses = []
    start = page.find('img src=')
    while start != -1:
        # Look for a '.jpg' extension within 255 characters of the tag.
        end = page.find('.jpg', start, start + 255)
        if end == -1:
            # No jpg nearby: resume scanning just past this 'img src='.
            end = start + 9
        else:
            # Slice out the address (skip `img src="`, keep `.jpg`) and
            # prepend the scheme, which the site omits.
            addresses.append('http:' + page[start + 9:end + 4])
        start = page.find('img src=', end)
    return addresses
def save_imgs(folder, img_addrs):
    """Download and save every image in *img_addrs*.

    :param folder: folder name (unused here; the caller has already chdir()ed)
    :param img_addrs: list of absolute image URLs
    :return: None
    """
    print("此页面共计" + str(len(img_addrs)) + "张妹子图片...\n")
    # Bug fix: the original kept a counter `i` that was initialized to
    # len(img_addrs), incremented, and never read — dead code, removed.
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
    print("此页面的" + str(len(img_addrs)) + "张妹子图片下载成功...\n")
def download_mm(folder='ooxx'):
    """Main driver: download every image page from the site into *folder*.

    :param folder: name of the folder the images are saved into
    :return: None
    """
    path = os.path.join(os.getcwd(), folder)
    if not os.path.exists(path):
        os.mkdir(path)  # create the target folder if it does not exist yet
    else:
        print("文件夹已经存在...\n")
    os.chdir(folder)
    url = 'https://jandan.net/girl/'  # 2021.6.27: sub-directory changed from ooxx to girl
    print("正在查找含有妹子的网页...\n")
    page_num = int(get_page(url))  # highest page number currently on the site
    print("查找成功,总共含有妹子的页面数目为: " + str(page_num))
    time_now = get_time()
    num_all = 0  # running total of downloaded images
    # Bug fix: range(1, page_num) stopped one short and skipped the newest
    # page; include page_num itself.
    for i in range(1, page_num + 1):
        print("=====================================================")
        print("正在对页面" + str(i) + "进行操作...\n")
        # Page tokens are base64('YYYYMMDD-<page>').
        s_page = time_now + '-' + str(i)
        base_page = base64.b64encode(s_page.encode('utf-8'))
        str_page = str(base_page, 'utf-8')
        page_url = url + str_page + '#comments'
        print("正在查找妹子图片地址...\n")
        img_addrs = find_imgs(page_url)
        print("正在下载页面" + str(i) + "内的妹子图片...\n")
        num_all = num_all + len(img_addrs)
        save_imgs(folder, img_addrs)
        print("已经下载" + str(num_all) + "张图片\n")
if __name__ == '__main__':
    # Change the folder name below to pick the save directory; defaults to 'ooxx'.
    folder = 'ooxx'
    download_mm(folder)
    print("下载结束\n")
下载完成共计1855张图片,如下图
下一步:
计划更改为使用scrapy框架。