import urllib.request import os def url_open(url): req = urllib.request.Request(url) req.add_headers ={"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.3.1.14600'} req.add_headers = ('Referer','http://jandan.net/ooxx') response = urllib.request.urlopen(req) html = response.read() return html def get_page(url): html = url_open(url).decode('utf-8') a = html.find('current-comment-page') + 23 b = html.find(']',a) return html[a:b] def find_imgs(url): html = url_open(url).decode('utf-8') img_addrs = [] a = html.find('img src=') while a != -1: b = html.find('.jpg',a,a+255) if b!= -1: img_addrs.append(html[a+9:b+4]) else: b = a + 9 a = html.find('img src=', b) return img_addrs def save_imgs(folder, img_addrs): for each in img_addrs: filename = each.split('/')[-1] with open(filename,'wb') as f: img = url_open('http:' + each) f.write(img) def download_mm(folder='ooxx1',pages=10): os.mkdir(folder) os.chdir(folder) url = 'http://jandan.net/ooxx/' page_num = int(get_page(url)) for i in range(pages): page_num -= i page_url = url + 'page-' + str(page_num) + '#comments' img_addrs = find_imgs(page_url) save_imgs(folder, img_addrs) if __name__ == '__main__': download_mm()
爬虫爬取下载图片
最新推荐文章于 2024-05-08 15:02:16 发布