Python crawler based on Python 3

I'm new to Python, and wrote this crawler as a learning exercise.

The crawler downloads a chosen subset of images from 千图网 (588ku.com). The site is updated frequently, so the page structure this crawler depends on may change and break it over time.

The main libraries involved are:

1. the requests library

2. the BeautifulSoup library

3. the os library
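
As a quick illustration of how the three fit together (a minimal sketch; 'lxml' assumes the lxml parser is installed, and the 'downloads' folder name is just an example):

import os

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://588ku.com', headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'lxml')    # parse the returned HTML
print(soup.title.get_text())               # e.g. print the page title
os.makedirs('downloads', exist_ok=True)    # prepare a folder for saved images

The complete crawler source follows.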

import os

import requests
from bs4 import BeautifulSoup

class bgCrawler:
	def main(self):
		print("****** Task started! ******")

		html = self.get('http://588ku.com')

		soup = self.soup(html)

		# each <li> in the background list is one image category
		bgUl = soup.find('ul', class_='st-bgAll-list').find_all('li')

		for li in bgUl:
			self.liLink(li)
			print('Category downloaded')
		print("****** Task finished! ******")
	def liLink(self, li):
		# build a folder name from the category title and its image count
		name = li.find('div', class_='st-name').find('h2').get_text()
		num = li.find('div', class_='st-number').find('div', class_='number-info').get_text()
		name = name + ' ' + num
		self.mkdir(name)

		# open the category page and collect its pagination links
		picHtml = li.find('div', class_='st-list-box st-list-l').find('a')['href']
		picWeb = self.get(picHtml)
		picSoup = self.soup(picWeb)

		page = self.page(picSoup)
		if page is None:
			page = [picHtml]
		else:
			page.insert(0, picHtml)
		# download the images on every page
		for pg in page:
			self.img(pg)

	def soup(self, html):
		# turn a requests response into a BeautifulSoup tree
		return BeautifulSoup(html.text, 'lxml')

	def get(self, url):
		# spoof a browser User-Agent and a referer so the site serves the page
		headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
			'referer': 'www.baidu.com'
		}
		return requests.get(url, headers=headers)

	def img(self, src):
		# each thumbnail on a list page links to a detail page
		# that holds the full-size image
		imgHtml = self.get(src)
		imgSoup = self.soup(imgHtml)
		img = imgSoup.find('ul', class_='clearfix bg-vertical-box bg-endways-hoverBtn').find_all('li', class_='pic-list fl')
		for i in img:
			imgAnc = i.find('a')['href']
			imgHtm = self.get(imgAnc)
			imgHtmSoup = self.soup(imgHtm)
			link = imgHtmSoup.find('div', class_='img-l-box').find('img')['src']
			self.save(link)

	def page(self, soup):
		# collect the pagination links; return None when there is only one page
		pageAnchor = soup.find('div', class_='page-con w1200')
		if pageAnchor is None:
			return None
		# skip the first and last anchors, which are not plain page links
		anchors = pageAnchor.find_all('a')[1:-1]
		return [anc['href'] for anc in anchors]

	def mkdir(self, path):
		# str.strip() returns a new string, so the result must be assigned back
		path = path.strip()
		folder = os.path.join(r"C:\背景", path)
		if os.path.exists(folder):
			print(path, 'folder already exists, saving images into it')
			os.chdir(folder)
			return False
		else:
			print('Created a folder named', path)
			os.makedirs(folder)
			os.chdir(folder)
			return True
			return True

	def save(self, img):
		content = self.get(img)
		# derive a filename from a fixed slice of the image URL and strip
		# characters that are illegal in Windows filenames
		name = img[40:55].replace(':', '').replace(' ', '')
		# 'wb' rather than 'ab': appending to an existing file would corrupt the image
		with open(name + '.jpg', 'wb') as file:
			file.write(content.content)

if __name__ == '__main__':
	crawler = bgCrawler()
	crawler.main()
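
The save() above reads each image fully into memory and derives the filename from a hard-coded URL slice, which only works while the site's URL scheme keeps the same length. A more defensive variant could stream the download and name the file after the URL path; this is a sketch of that idea, not part of the original crawler, and the helper name, chunk size, and fallback filename are my own choices:

import os
import requests

def save_streamed(url, headers, dest_dir):
	# name the file after the last URL path component, ignoring any query string
	name = os.path.basename(url.split('?')[0]) or 'image.jpg'
	resp = requests.get(url, headers=headers, stream=True)
	resp.raise_for_status()  # fail loudly on HTTP errors instead of saving garbage
	with open(os.path.join(dest_dir, name), 'wb') as f:
		for chunk in resp.iter_content(chunk_size=8192):
			f.write(chunk)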