# 初学python, 学着写了一个爬虫玩玩
# 这个爬虫能够爬取千图网上指定的一部分图片, 因为千图网更新速度很快, 所以本爬虫可能有时限性
# 这里用到的知识点有:
# 1. requests库
# 2. bs4库
# 3. re库
# 4. os库
import requests
from bs4 import BeautifulSoup
import re
import os
class bgCrawler():
    """Crawler that downloads background-image sets from 588ku.com.

    For every set listed on the index page a folder is created under the
    fixed base directory (see ``mkdir``); ``mkdir`` also chdir()s into that
    folder so ``save`` can write files into the current directory.

    NOTE(review): all CSS selectors below are tied to 588ku.com's current
    markup and will stop matching when the site layout changes.
    """

    def main(self):
        """Entry point: fetch the index page and download every listed set."""
        print("******任务开始!******")
        html = self.get('http://588ku.com')
        soup = self.soup(html)
        bg_list = soup.find('ul', class_='st-bgAll-list')
        if bg_list is None:
            # Guard against a changed page layout / blocked request instead
            # of crashing with AttributeError on the chained find_all().
            print("******任务结束!******")
            return
        for li in bg_list.find_all('li'):
            self.liLink(li)
            print('下载完成')
        print("******任务结束!******")

    def liLink(self, li):
        """Download one image set described by index-page list item *li*.

        Builds the target folder name from the set title and picture count,
        then downloads every pagination page of the set.
        """
        name = li.find('div', class_='st-name').find('h2').get_text()
        num = li.find('div', class_='st-number').find('div', class_='number-info').get_text()
        self.mkdir(name + ' ' + num)
        picHtml = li.find('div', class_='st-list-box st-list-l').find('a')['href']
        picSoup = self.soup(self.get(picHtml))
        pages = self.page(picSoup)
        if pages is None:
            # Single-page set: only the first page exists.
            pages = [picHtml]
        else:
            # Multi-page set: the first page is not in the pager links.
            pages.insert(0, picHtml)
        for pg in pages:
            self.img(pg)

    def soup(self, html):
        """Parse a requests.Response *html* into a BeautifulSoup tree (lxml)."""
        return BeautifulSoup(html.text, 'lxml')

    def get(self, url):
        """GET *url* with a browser-like User-Agent and return the Response.

        A timeout is set so a stalled connection cannot hang the crawl
        forever (``requests`` has no default timeout).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
            'referer': "www.baidu.com",
        }
        return requests.get(url, headers=headers, timeout=30)

    def img(self, src):
        """Download every image linked from the listing page at URL *src*."""
        imgSoup = self.soup(self.get(src))
        container = imgSoup.find('ul', class_='clearfix bg-vertical-box bg-endways-hoverBtn')
        if container is None:
            # Layout changed or empty page: skip instead of crashing.
            return
        for item in container.find_all('li', class_='pic-list fl'):
            detailSoup = self.soup(self.get(item.find('a')['href']))
            link = detailSoup.find('div', class_='img-l-box').find('img')['src']
            self.save(link)

    def page(self, soup):
        """Return the hrefs of the extra pagination links of a set page.

        Returns None when the page has no pager (single-page set).
        """
        pager = soup.find('div', class_='page-con w1200')
        if pager is None:
            return None
        # Drop the first and last anchors (prev/next arrows), keep page links.
        return [a['href'] for a in pager.find_all('a')[1:-1]]

    def mkdir(self, path):
        """Create the folder for *path* under the base directory and chdir into it.

        Returns True when the folder was newly created, False when it
        already existed.
        """
        # Bug fix: str.strip() returns a new string; the original discarded
        # the result, so padded names were never actually stripped.
        path = path.strip()
        target = os.path.join(r"C:\背景", path)
        if os.path.exists(target):
            print(path, '文件夹已存在, 直接保存图片')
            os.chdir(target)
            return False
        print("建立了一个名字叫做", path, "的文件夹")
        os.makedirs(target)
        os.chdir(target)
        return True

    def save(self, img):
        """Download image URL *img* and write it into the current directory."""
        content = self.get(img)
        # File name is a fixed slice of the URL with characters that are
        # illegal in Windows file names removed.
        name = img[40:55].replace(':', '').replace(' ', '')
        # 'wb' instead of the original 'ab': appending on a re-download
        # produced a corrupt file holding two concatenated JPEGs; the
        # context manager guarantees the handle is closed on error too.
        with open(name + '.jpg', 'wb') as fh:
            fh.write(content.content)
if __name__ == '__main__':
    # Run the crawler only when executed as a script, not on import.
    bgCrawler().main()