from multiprocessing import Pool
import os
import urllib.request
from lxml import etree
import requests
import re
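
# Scrape a channel listing page and return the URL of each gallery's detail
# page.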
def get_list(url):
    a = []
    response = requests.get(url).text
    html = etree.HTML(response)
    lis = html.xpath('//div[@class="box_left3"]/div[@class="channel_picbox"]/div[@class="channel_list"]/ul/li')
    for li in lis:
        # Each <li> links to one gallery's detail page.
        href = li.xpath('./a/@href')[0]
        a.append(href)
    print(a)
    return a
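
# Fetch one gallery: grab the image from the first page, read the total page
# count from the pager, then walk pages 2..N and collect every image URL.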
def get_page(url):
    a = []
    response = requests.get(url).text
    html = etree.HTML(response)
    # Image on the first page.
    src = html.xpath('//div[@class="photo"]/a/img/@src')[0]
    a.append(src)
    # The first pager entry contains the total page count; extract the digits.
    page = html.xpath('//div[@class="pages"]/ul/li[1]/a/text()')[0]
    total = int(re.findall(r"(\d+)", page)[0])
    for p in range(2, total + 1):
        # Page p of <id>.html lives at <id>_<p>.html, so splice "_p" in
        # just before the ".html" suffix.
        b = "{}_{}.html".format(url[:-5], p)
        print(b)
        response = requests.get(b).text
        html = etree.HTML(response)
        src = html.xpath('//div[@class="photo"]/a/img/@src')
        if not src:
            continue
        a.append(src[0])
    return a
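
# Save one image to disk under file_path, keeping its original basename.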
def download_image(url):
    file_path = 'D:/book/imge'
    print(url)
    # The URL's last path segment already includes the extension, e.g. "1.jpg".
    file_name = url.split('/')[-1]
    print(file_name)
    try:
        # Create the target directory if it does not exist yet.
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        # Build the full file name (including path); the basename already
        # carries the suffix, so it must not be appended a second time.
        filename = os.path.join(file_path, file_name)
        print(filename)
        # Download the image and save it into the folder.
        urllib.request.urlretrieve(url, filename=filename)
    except IOError as e:
        print("IOError:", e)
    except Exception as e:
        print("Exception:", e)
if __name__ == "__main__":
    # First pool: resolve every gallery's full list of image URLs in parallel.
    pool = Pool()
    a = []
    urllist = ['http://www.meituba.com/xinggan/93486.html', 'http://www.meituba.com/xinggan/93494.html', 'http://www.meituba.com/xinggan/93479.html', 'http://www.meituba.com/xinggan/93497.html', 'http://www.meituba.com/xinggan/93503.html', 'http://www.meituba.com/xinggan/93510.html', 'http://www.meituba.com/xinggan/93512.html', 'http://www.meituba.com/xinggan/93504.html', 'http://www.meituba.com/xinggan/93506.html', 'http://www.meituba.com/xinggan/93505.html', 'http://www.meituba.com/xinggan/106333.html', 'http://www.meituba.com/xinggan/105890.html', 'http://www.meituba.com/xinggan/105886.html', 'http://www.meituba.com/xinggan/105882.html', 'http://www.meituba.com/xinggan/106381.html', 'http://www.meituba.com/xinggan/105885.html']
    for i in urllist:
        a.append(pool.apply_async(get_page, (i,)))
    pool.close()
    pool.join()
    # Second pool: download each gallery's images in parallel.
    pool = Pool()
    for i in a:
        srcs = i.get()
        print(srcs)
        pool.map(download_image, srcs)
    pool.close()
    pool.join()