from gevent import monkey, joinall, spawn
monkey.patch_all()
import requests
import re
import os
# Root directory all downloaded galleries are saved under (a 'meizitu2'
# folder next to this script).
BASE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'meizitu2')
# Filled by get_all_grils() with the URL of every gallery page to download;
# consumed by the __main__ block.
girl_list = []
def save_imgs(name, url):
    """Download the single image found on gallery page *url* into
    BASE_DIR/<name>/.

    Parameters:
        name: gallery title, used as the directory name; characters that
              are invalid in Windows paths ('?' and ':') are removed.
        url:  URL of one page of a gallery; its HTML contains the actual
              image file in an ``img src="..."`` attribute.
    """
    name = name.replace('?', '').replace(':', ' ')
    target_dir = os.path.join(BASE_DIR, name)
    if not os.path.exists(target_dir):
        print('create path', target_dir)
    # makedirs(exist_ok=True) also creates BASE_DIR itself on first run
    # (os.mkdir would raise FileNotFoundError there) and avoids the
    # exists()/mkdir() race of the original code.
    os.makedirs(target_dir, exist_ok=True)
    res = requests.get(url)
    match = re.search(r'img src="(.*?)"', res.text)
    if match is None:
        # Blocked request or layout change: skip this page instead of
        # crashing the whole greenlet with an AttributeError.
        print('no image found on', url)
        return
    source = match.group(1)
    # NOTE: the original header names/values contained stray spaces
    # ('cache - control': 'no - cache'), which makes them invalid HTTP
    # headers that servers ignore; they are fixed here.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'upgrade-insecure-requests': '1',
        # The image host appears to require a same-site Referer.
        'Referer': "https://www.mzitu.com/1",
    }
    # Image filename = last path component of the image URL.
    with open(os.path.join(target_dir, source.split('/')[-1]), 'wb') as f:
        f.write(requests.get(source, headers=headers, timeout=3).content)
    print('download successful:', source)
def get_girl_pics(url):
    """Download every image in the gallery rooted at *url*.

    The first request discovers the total page count (the numeric
    pagination ``<span>`` elements) and the gallery title; every page of
    the gallery is then handed to save_imgs(), which downloads the one
    image it contains.
    """
    res = requests.get(url)
    page_numbers = re.findall(r'<span>(\d+)</span>', res.text)
    title_match = re.search(r'<h2 class="main-title">(.*?)</h2>', res.text)
    if not page_numbers or title_match is None:
        # Unexpected markup (blocked request, layout change): skip this
        # gallery instead of killing the greenlet with an
        # IndexError/AttributeError as the original did.
        print('could not parse gallery page:', url)
        return
    title = title_match.group(1)
    # Page 1 lives at the bare gallery URL; pages 2..N at <url>/<page>.
    save_imgs(title, url)
    for page in range(2, int(page_numbers[-1]) + 1):
        save_imgs(title, ''.join([url, '/', str(page)]))
def get_all_grils(url):
    """Scrape listing page *url* and add every gallery link found there
    to the module-level ``girl_list``.

    (The typo in the function name is kept intact so existing callers
    keep working.)
    """
    global girl_list
    page = requests.get(url)
    # Each gallery entry on a listing page is an <li><a href="..."> link.
    links = re.findall(r'<li><a href="(.*?)" target="_blank">', page.text)
    girl_list.extend(links)
def get_url_lists():
    """Discover every listing page of the site and scrape all of them
    concurrently (one gevent greenlet per page) to populate ``girl_list``.
    """
    base = 'https://www.mzitu.com/mm'
    url_pages = [base]
    res = requests.get(base)
    # The text between the last page number's </span> and the
    # 'next page' arrow holds the final page index.
    page_counts = re.findall(r'</span>(.*?)<span class="meta-nav', res.text)
    for page in range(2, int(page_counts[-1]) + 1):
        # BUG FIX: the original joined base + 'page/N' into
        # 'https://www.mzitu.com/mmpage/N' (missing '/'), so every
        # listing page after the first pointed at a non-existent URL.
        url_pages.append(''.join([base, '/page/', str(page)]))
    print(url_pages)
    joinall([spawn(get_all_grils, page_url) for page_url in url_pages])
if __name__ == '__main__':
    # Phase 1: collect every gallery URL into girl_list (blocks until
    # all listing pages are scraped).
    get_url_lists()
    # Phase 2: download all discovered galleries concurrently, one
    # greenlet per gallery.
    joinall([spawn(get_girl_pics, i) for i in girl_list])