Learning Python: Scraping Beauty Photos
This post walks through scraping the beauty photo gallery at http://sc.chinaz.com/tupian/meinvxiezhen.html with requests and BeautifulSoup.
First, define a download function (note that it relies on RequestException, imported from requests.exceptions in the complete code below):
def download(url, headers, num_retries=3):
    try:
        response = requests.get(url, headers=headers)
        # print(response.status_code)
        if response.status_code == 200:
            return response.content
        return None
    except RequestException as e:
        print(e.response)
        html = ''
        if hasattr(e.response, 'status_code'):
            code = e.response.status_code
            print('error code', code)
            if num_retries > 0 and 500 <= code < 600:
                # retry on 5xx server errors
                html = download(url, headers, num_retries - 1)
        else:
            code = None
        return html
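A quick way to sanity-check the function (a minimal sketch of my own; the real headers dict is defined later in the post):

# Sanity-check sketch: fetch the listing page and confirm we got bytes back.
html = download('http://sc.chinaz.com/tupian/meinvxiezhen.html',
                headers={'User-Agent': 'Mozilla/5.0'})
print(len(html) if html else 'download failed')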
Right-click an image and choose Inspect:
You can see that the detail-page link http://sc.chinaz.com/tupian/200414025394.htm sits in the a and p tags inside the div with class="text_left text_lefts":
def find_imgaddr(url, headers):
    r = download(url, headers=headers)
    page = BeautifulSoup(r, "lxml")
    all_links = page.find('div', attrs={'class': 'text_left text_lefts'}).find_all('p')  # grab the p tags
I originally used find_all('a') here, but the hrefs in the bare a tags duplicate the ones inside the p tags' a tags, so I switched to finding the p tags instead.
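For reference, if you would rather keep find_all('a'), one way to drop the duplicates is a seen set; this is my own sketch, not part of the original code:

# Alternative sketch: deduplicate hrefs from all a tags instead of
# narrowing the search to p tags.
seen = set()
for a in page.find('div', attrs={'class': 'text_left text_lefts'}).find_all('a'):
    href = a.get('href')
    if href and href not in seen:
        seen.add(href)  # each detail-page link is kept exactly once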
Now open the detail page http://sc.chinaz.com/tupian/200414025394.htm, right-click 'View full image', and inspect it:
Extract the jpg URL:
for i in all_links:
    url = i.find('a')['href']  # detail-page link inside each p tag's a tag
    h = download(url, headers)
    img_page = BeautifulSoup(h, "lxml")
    all_imgs = img_page.find('div', attrs={'class': 'down_wrap'}).find('span', attrs={'class': 'img_open'}).find('a')['href']  # jpg URL on the detail page
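Note that the chained find calls raise AttributeError whenever one of the tags is missing. A more defensive variant (my own addition, not the original code) checks each step before continuing:

# Defensive sketch: bail out gracefully if the expected tags are absent.
down_wrap = img_page.find('div', attrs={'class': 'down_wrap'})
img_open = down_wrap.find('span', attrs={'class': 'img_open'}) if down_wrap else None
link = img_open.find('a') if img_open else None
if link is None:
    continue  # skip detail pages that don't match the expected layout
all_imgs = link['href']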
Then write the file to disk:
filename = all_imgs.split('/')[-1]
with open(filename, 'wb') as f:
    content = download(all_imgs, headers)  # call download to fetch the jpg bytes
    f.write(content)
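Since download holds the whole response in memory, large images could instead be streamed in chunks. A sketch that bypasses the helper and uses requests' stream=True (my own variant, not the post's approach):

# Streaming sketch: write the image in fixed-size chunks.
resp = requests.get(all_imgs, headers=headers, stream=True)
if resp.status_code == 200:
    with open(filename, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)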
Looking at the site again, the first page's URL is:
http://sc.chinaz.com/tupian/meinvxiezhen.html
while the second and third pages are:
http://sc.chinaz.com/tupian/meinvxiezhen_2.html
http://sc.chinaz.com/tupian/meinvxiezhen_3.html
so the second page onward can be generated with:
for i in range(2, 19):
    url = 'http://sc.chinaz.com/tupian/meinvxiezhen_' + str(i) + '.html'
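Since page 1 has no numeric suffix, a small generator can yield every listing URL in one place. This is a sketch of my own, with a hypothetical page_urls name:

def page_urls(last_page=18):
    # Page 1 has no suffix; pages 2..last_page use the _N suffix.
    yield 'http://sc.chinaz.com/tupian/meinvxiezhen.html'
    for i in range(2, last_page + 1):
        yield 'http://sc.chinaz.com/tupian/meinvxiezhen_' + str(i) + '.html'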
Create an img folder to hold the downloads:
import os
os.mkdir('img')
os.chdir('img')
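On a second run, os.mkdir raises FileExistsError because the folder already exists; os.makedirs with exist_ok=True sidesteps that:

import os
os.makedirs('img', exist_ok=True)  # no error if img already exists
os.chdir('img')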
Define the request headers:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Referer': 'http://sc.chinaz.com/tupian/meinvxiezhen.html'
}
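Instead of threading the same headers dict through every call, a requests.Session can carry them once. This is a sketch; the download function above would need a small change to accept the session:

import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Referer': 'http://sc.chinaz.com/tupian/meinvxiezhen.html',
})
response = session.get('http://sc.chinaz.com/tupian/meinvxiezhen.html')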
The complete code:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import os


def download(url, headers, num_retries=3):
    try:
        response = requests.get(url, headers=headers)
        # print(response.status_code)
        if response.status_code == 200:
            return response.content
        return None
    except RequestException as e:
        print(e.response)
        html = ''
        if hasattr(e.response, 'status_code'):
            code = e.response.status_code
            print('error code', code)
            if num_retries > 0 and 500 <= code < 600:
                # retry on 5xx server errors
                html = download(url, headers, num_retries - 1)
        else:
            code = None
        return html


def find_imgaddr(url, headers):
    r = download(url, headers=headers)
    if not r:
        return  # nothing to parse if the page failed to download
    page = BeautifulSoup(r, "lxml")
    all_links = page.find('div', attrs={'class': 'text_left text_lefts'}).find_all('p')  # the p tags holding each thumbnail
    for i in all_links:
        url = i.find('a')['href']  # detail-page link inside each p tag's a tag
        h = download(url, headers)
        img_page = BeautifulSoup(h, "lxml")
        all_imgs = img_page.find('div', attrs={'class': 'down_wrap'}).find('span', attrs={'class': 'img_open'}).find('a')['href']  # jpg URL on the detail page
        filename = all_imgs.split('/')[-1]
        with open(filename, 'wb') as f:
            content = download(all_imgs, headers)  # call download to fetch the jpg bytes
            f.write(content)


def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        'Referer': 'http://sc.chinaz.com/tupian/meinvxiezhen.html'
    }
    url = 'http://sc.chinaz.com/tupian/meinvxiezhen.html'
    find_imgaddr(url=url, headers=headers)
    # loop over the remaining listing pages
    for i in range(2, 19):
        url = 'http://sc.chinaz.com/tupian/meinvxiezhen_' + str(i) + '.html'
        find_imgaddr(url=url, headers=headers)


if __name__ == '__main__':
    os.mkdir('img')
    os.chdir('img')
    main()
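One practical caveat: fetching eighteen listing pages plus every image back to back can get the crawler rate-limited or blocked. Adding a short pause between listing pages (my own addition, not in the original; it assumes the headers dict from main) is cheap insurance:

import time

# Polite-crawling sketch: same page loop as main(), with a one-second pause.
for i in range(2, 19):
    url = 'http://sc.chinaz.com/tupian/meinvxiezhen_' + str(i) + '.html'
    find_imgaddr(url=url, headers=headers)
    time.sleep(1)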