1. Create a new file named spider.py in the workspace folder.
2. Import the required packages. The code is as follows:
import requests, os, re, time  # time is used later to pause between downloads
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from functools import partial
3. Write the main function skeleton. The code is as follows:
def main():
    index_url = 'https://www.meitulu.com/'
    pass

if __name__ == '__main__':
    main()
4. Write the function that collects the gallery links and names from the home page. The code is as follows:
def get_all_page(index_url):
    response = requests.get(index_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')
        items = soup.find('ul', class_='img').find_all('li')
        # Return (link, gallery name) pairs; the name becomes the folder name later
        return [(item.find('a')['href'], item.find_all('p')[1].text) for item in items]
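To see how the find and find_all calls map onto the page structure, here is an offline sketch; the markup is hypothetical, modeled on what get_all_page assumes the home page looks like (a ul with class "img" whose li items each hold a link and two p tags, the second p being the gallery name):
from bs4 import BeautifulSoup

# Hypothetical markup, shaped like what get_all_page expects
html = '''
<ul class="img">
  <li>
    <a href="https://www.meitulu.com/item/1.html"><img src="1.jpg"></a>
    <p>28 pictures</p>
    <p>Gallery One</p>
  </li>
  <li>
    <a href="https://www.meitulu.com/item/2.html"><img src="2.jpg"></a>
    <p>30 pictures</p>
    <p>Gallery Two</p>
  </li>
</ul>
'''

soup = BeautifulSoup(html, 'lxml')
items = soup.find('ul', class_='img').find_all('li')
print([(item.find('a')['href'], item.find_all('p')[1].text) for item in items])
# [('https://www.meitulu.com/item/1.html', 'Gallery One'),
#  ('https://www.meitulu.com/item/2.html', 'Gallery Two')]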
5. Write the function that parses a gallery page. The code is as follows:
def parse_current_page(url, name):
    response = requests.get(url)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')  # parser
        img_items = soup.find('div', class_='content').find_all('img')  # every image tag on the page
        img_srcs = [item['src'] for item in img_items]  # every image URL
        dir_path = 'E:\\pictures\\' + name  # folder for this gallery
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)  # create the folder, including any missing parents
        pool = ThreadPool(4)  # a pool of 4 threads
        pool.map(partial(save_picture, referer=url, path=dir_path), img_srcs)  # partial passes the extra arguments through
        pool.close()
        pool.join()
        next_page = soup.find('div', id="pages").find_all('a')[-1]  # the "next page" item
        next_page = next_page['href']  # the next-page link
        current_page_num = soup.find('div', id="pages").find('span').text  # page number highlighted at the bottom
        next_page_num = re.findall(r'\d+', next_page)[1]  # extract the page number from the link
        if next_page_num != current_page_num:  # if they differ, this is not the last page yet
            next_page = "https://www.meitulu.com" + next_page  # prepend the site prefix
            return parse_current_page(next_page, name)  # recurse into the next page
        else:
            return  # stop
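To make the pagination check concrete, here is what the regular expression returns for a typical next-page href (the URL pattern is hypothetical, modeled on the /item/<gallery-id>_<page>.html shape the code relies on):
import re

next_page = '/item/5986_4.html'          # hypothetical next-page href
print(re.findall(r'\d+', next_page))     # ['5986', '4']
print(re.findall(r'\d+', next_page)[1])  # '4' -- the page number compared against the highlighted one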
Regarding the partial function in pool.map(partial(save_picture,referer=url,path=dir_path),img_srcs), here is an explanation (found online via Baidu):
map(arg1, arg2) takes two arguments: the first is a function name (note: without the trailing parentheses), and the second is an iterable, as in the following code:
In [18]: def fun(t):
    ...:     return t
    ...: [i for i in map(fun, ['a', 'b', 'c'])]
Out[18]: ['a', 'b', 'c']
In Python 2, map returns a list; in Python 3, it returns an iterator, which saves memory.
Normally a single iterable as the second argument is enough. Since a save path also has to be passed in here, partial is needed to supply the extra arguments, as the sketch below shows.
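Here is a minimal, self-contained sketch of how partial binds the referer and path keyword arguments so that pool.map only has to supply the one varying argument (save is a hypothetical stand-in for save_picture):
from functools import partial
from multiprocessing.pool import ThreadPool

def save(item, referer, path):
    # stand-in for save_picture: just show which arguments arrive
    print(item, referer, path)

# partial binds referer and path; map supplies only the varying first argument
task = partial(save, referer='https://example.com/item/1.html', path='demo_dir')

pool = ThreadPool(4)
pool.map(task, ['a.jpg', 'b.jpg', 'c.jpg'])
pool.close()
pool.join()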
6. Write the function that saves a picture. The code is as follows:
def save_picture(img_url, referer, path):
    file_name = path + os.sep + img_url.split('/')[-1]
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        'Connection': "keep-alive",
        'Cookie': 'UM_distinctid=169241b82e3d7-055c03980912f78-4c312f7f-e1000-169241b82e4370',
        'Referer': referer,
    }
    response = requests.get(img_url, headers=headers)
    with open(file_name, 'wb') as f:  # the with statement closes the file automatically
        f.write(response.content)
This touches on basic anti-scraping countermeasures: if you request the images without headers, every downloaded picture comes out at only about 1 KB, because the server has anti-hotlinking in place. The crucial header is Referer, which tells the server which page the request originated from; here I pass in the gallery link (see the sketch after this paragraph).
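A minimal sketch of the effect, assuming a hypothetical image URL (the real URLs come from img_srcs); it compares the response size with and without the Referer header:
import requests

img_url = 'https://example.com/images/12345/1.jpg'   # hypothetical image URL
referer = 'https://www.meitulu.com/item/12345.html'  # hypothetical gallery page the image belongs to

plain = requests.get(img_url)                        # no Referer header
with_referer = requests.get(img_url, headers={'Referer': referer})

# With anti-hotlinking enabled, the first response is a tiny placeholder (~1 KB),
# while the second is the real image.
print(len(plain.content), len(with_referer.content))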
7. The complete code is as follows:
import requests, os, re, time
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from functools import partial


def save_picture(img_url, referer, path):
    file_name = path + os.sep + img_url.split('/')[-1]
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        'Connection': "keep-alive",
        'Cookie': 'UM_distinctid=169241b82e3d7-055c03980912f78-4c312f7f-e1000-169241b82e4370',
        'Referer': referer,
    }
    response = requests.get(img_url, headers=headers)
    with open(file_name, 'wb') as f:
        f.write(response.content)
    time.sleep(1)  # be polite: pause one second between downloads


def parse_current_page(url, name):
    response = requests.get(url)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')
        img_items = soup.find('div', class_='content').find_all('img')
        img_srcs = [item['src'] for item in img_items]
        dir_path = 'E:\\pictures\\' + name
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        pool = ThreadPool(4)
        pool.map(partial(save_picture, referer=url, path=dir_path), img_srcs)
        pool.close()
        pool.join()
        next_page = soup.find('div', id="pages").find_all('a')[-1]
        next_page = next_page['href']
        current_page_num = soup.find('div', id="pages").find('span').text
        next_page_num = re.findall(r'\d+', next_page)[1]
        if next_page_num != current_page_num:
            next_page = "https://www.meitulu.com" + next_page
            return parse_current_page(next_page, name)
        else:
            return


def get_all_page(index_url):
    response = requests.get(index_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')
        _list = [(item.find('a')['href'], item.find_all('p')[1].text)
                 for item in soup.find('ul', class_='img').find_all('li')]
        return _list


def main():
    index_url = 'https://www.meitulu.com/'
    for page in get_all_page(index_url):
        parse_current_page(page[0], page[1])


if __name__ == '__main__':
    main()