本文的主要目的是练习Beautiful Soup的使用,目标是抓取百思不得姐前几页的图片并保存在本地
我的电脑是win7 64位,IDE为PyCharm。
首先安装BeautifulSoup和requests,这里直接pip安装或者通过PyCharm安装即可
接下来开始写代码,代码分为几个部分:
- 生成url
- 获取响应,并判断响应状态码
- 用BeautifulSoup筛选得到的html代码,得到我们需要的图片名字和链接
- 保存到本地
全部代码如下:
import re
import requests
from bs4 import BeautifulSoup as SB
class Get_BaiSi_img():
    """Scrape images from budejie.com list pages and save them to the CWD.

    Usage: ``Get_BaiSi_img(base_url, n).main()`` fetches pages
    ``base_url + "1"`` .. ``base_url + str(n)`` and downloads every
    non-thumbnail ``<img class="lazy">`` image found on them.
    """

    # Thumbnails on the site have "_mini" in the URL path; skip those.
    # Compiled once instead of on every download_img() call.
    _MINI_PATTERN = re.compile(r"http://.*?_mini", re.S)
    # Characters that are illegal in Windows file names (alt text is used
    # as the file name, and may contain any of these).
    _BAD_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, ini_url, n):
        self.ini_url = ini_url  # base URL; the 1-based page number is appended
        self.url = ''           # URL of the page currently being fetched
        self.html = ''          # requests.Response of the current page fetch
        self.content = ''       # parsed BeautifulSoup document
        self.aims = ''          # matched <img> tags ('' when the fetch failed)
        self.error = 0          # <img> tags missing 'data-original'/'alt'
        # NOTE: attribute name keeps the original misspelling ("sucess")
        # because code outside the class reads it directly.
        self.sucess_num = 0     # pages fetched with HTTP 200
        self.fail_num = 0       # pages that failed to fetch
        self.n = n              # number of pages to crawl
        # These headers are sent with the *image* downloads: the Host value
        # matches the spriteapp.cn image CDN, not www.budejie.com, so they
        # must not be used for the HTML page request.
        self.headers = {'Host': 'mstatic.spriteapp.cn',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Language': ' zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                        'Connection': 'keep-alive'}

    def create_url(self, i):
        """Build the URL for page ``i`` (the site numbers pages from 1)."""
        self.url = self.ini_url + str(i + 1)

    def get_aim_url(self):
        """Fetch ``self.url`` and collect all ``<img class="lazy">`` tags.

        On any failure (network error or non-200 status) ``self.aims`` is
        reset to ``''`` and ``self.fail_num`` is incremented.
        """
        try:
            # Timeout so a dead server cannot hang the whole crawl.
            self.html = requests.get(url=self.url, timeout=10)
        except requests.RequestException as exc:
            print("连接错误", exc)
            self.aims = ''
            self.fail_num = self.fail_num + 1
            return
        if self.html.status_code == 200:
            self.content = SB(self.html.text, "lxml")
            self.aims = self.content.find_all("img", attrs={"class": "lazy"})
            self.sucess_num = self.sucess_num + 1
        else:
            print(self.html.status_code, "连接错误")
            self.aims = ''
            print(self.html.url)
            self.fail_num = self.fail_num + 1

    def download_img(self, i):
        """Download every image collected for page ``i``.

        Skips "_mini" thumbnail URLs, sanitizes the alt text into a legal
        file name, and counts tags that lack the expected attributes in
        ``self.error``.
        """
        if not self.aims:
            return
        j = 1  # per-page counter used only in the progress message
        for aim in self.aims:
            try:
                img_url = aim.attrs['data-original']
                name = aim.attrs['alt']
            except KeyError:
                # Tag without a real image URL / caption — count and move on.
                self.error = self.error + 1
                continue
            if self._MINI_PATTERN.match(img_url):
                continue  # thumbnail; the full-size image has its own tag
            filename = self._BAD_NAME_CHARS.sub('_', name) + ".jpg"
            try:
                resp = requests.get(img_url, headers=self.headers, timeout=10)
                with open(filename, 'wb') as f:
                    f.write(resp.content)
                print(str(i + 1) + "-" + str(j) + "写入成功!")
                j = j + 1
            except (requests.RequestException, OSError):
                # Narrowed from a bare except: only network and file errors
                # are expected here; anything else should surface.
                print("写入失败!")

    def main(self):
        """Crawl all ``self.n`` pages: build URL, fetch, download images."""
        for i in range(self.n):
            self.create_url(i)
            self.get_aim_url()
            self.download_img(i)
if __name__ == "__main__":
    # Entry point: crawl the first 100 pages and report the tally.
    base_url = "http://www.budejie.com/"
    scraper = Get_BaiSi_img(base_url, 100)
    scraper.main()
    print("成功数: " + str(scraper.sucess_num))
    print("失败数: " + str(scraper.fail_num))
大概就是这样了,算是比较简单的爬虫入门练习吧