# -*- coding:utf-8 -*-
from tool.HtmlManager import getHtml,getBinaryHtml
import time
import os.path
from tool.ExcelManager import validateTitle
import re
from tool.ProxyManager import makeProxyAddress
from tool.DbManager import DbManager
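# getHtml and getBinaryHtml come from this series' tool package; their source is
# not shown in this post. A minimal sketch of what they might look like, assuming
# urllib.request with a fixed timeout to guard against 504 hangs (names match the
# imports above, but bodies and signatures are assumptions):
#
#     import urllib.request
#
#     def getHtml(url, timeout=30):
#         # fetch a page and decode it as UTF-8 text
#         with urllib.request.urlopen(url, timeout=timeout) as resp:
#             return resp.read().decode('utf-8', 'ignore')
#
#     def getBinaryHtml(url, proxy, timeout=30):
#         # fetch a page through an HTTP proxy and return the raw bytes
#         opener = urllib.request.build_opener(
#             urllib.request.ProxyHandler({'http': 'http://' + proxy}))
#         return opener.open(url, timeout=timeout).read()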
web504 = ['1010668', '1023322', '10459781', '1915375']  # booknos known to return 504
# Crawl the book detail pages
'''
# Read the book and booktag tables, crawl each book page, and copy it into the
# directory of every tag it belongs to
def catchbook(requreip=0, v=0, startbook=0):
    """
    Parameters:
    requreip  - whether to use a proxy; default no
    v         - crawl throttle in seconds; default off, 1 second mimics a human
    startbook - query offset; if a run fails part-way, resume crawling from the
                startbook-th row of the result set
    """
'''
requreip = 0
v = 2
startbook = 0
# Step 7: crawl the books.
# Read the book and booktag tables, crawl each book page, and copy it into the
# directory of every tag it belongs to.
# Start timing
start = time.perf_counter()  # time.clock() was removed in Python 3.8
webe = []  # booknos that failed with 404 etc.
selecttotal = 'select count(distinct bookno) from booktag'
selectsql = 'SELECT bookname,bookkind,bookno FROM booktag group by bookno'
dbManager = DbManager()
total = dbManager.execQuery(selecttotal)  # total number of distinct books
total = int(total[0][0])
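# DbManager is another helper from this series whose implementation is not shown
# here. A minimal sketch of what execQuery might look like, assuming a MySQL
# backend via pymysql (driver, credentials, and db name are assumptions):
#
#     import pymysql
#
#     class DbManager:
#         def __init__(self):
#             self.conn = pymysql.connect(host='localhost', user='root',
#                                         password='', db='douban',
#                                         charset='utf8')
#
#         def execQuery(self, sql):
#             # run a SELECT and return all rows as a sequence of tuples
#             with self.conn.cursor() as cur:
#                 cur.execute(sql)
#                 return cur.fetchall()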
daili0 = makeProxyAddress()  # list of proxy addresses
dailino = 0  # how many times we have switched proxies
changeip = 0  # index of the current proxy
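# makeProxyAddress (from tool.ProxyManager) is expected to return a list of
# 'host:port' strings. A minimal sketch, assuming the proxies live in a local
# text file with one address per line (file name and format are assumptions):
#
#     def makeProxyAddress(path='proxy.txt'):
#         with open(path) as f:
#             return [line.strip() for line in f if line.strip()]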
# Crawl the books in pages of 100 rows
while startbook < total:
    selectsql1 = selectsql + ' limit ' + str(startbook) + ',100'  # e.g. '... limit 200,100'
    taglist = dbManager.execQuery(selectsql1)
    for i in range(len(taglist)):
        bookname = taglist[i][0]
        kinds = taglist[i][1]   # category
        bookno = taglist[i][2]  # book id
        url = 'http://book.douban.com/subject/' + bookno  # page to crawl
        # e.g. http://book.douban.com/subject/25862578
        mulu0 = '../book/' + kinds
        # Create the top-level category directory unless it already exists
        if not os.path.exists(mulu0):
            print('Creating category directory: ' + mulu0)
            os.makedirs(mulu0)
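        # validateTitle (from tool.ExcelManager) sanitizes the book title for use
        # in a file name. A minimal sketch, assuming it simply replaces the
        # characters that are illegal in Windows file names (regex is an assumption):
        #
        #     def validateTitle(title):
        #         return re.sub(r'[\\/:*?"<>|\r\n]+', '_', title)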
        # Skip books whose file already exists, to save time
        try:
            filename = mulu0 + '/' + bookno + validateTitle(bookname) + '.html'
            if os.path.exists(filename):
                print(filename + ': already exists')
                continue
            elif bookno in web504:
                print('----' * 5)
                print('Known 504 error, skipping: ' + bookno)
                print('----' * 5)
                continue
            else:
                print('About to crawl: ' + url + ' category: ' + kinds)
        except:
            # filename may be unbound if validateTitle() raised, so report bookno
            print('Bad file name for book ' + bookno)
            continue
        iprefuse = 1  # set to 0 once the page has been fetched successfully
        # Retry the fetch until it succeeds
        while iprefuse:
            try:
                daili1 = daili0[changeip]  # current proxy
                # Throttle the crawl speed
                if v:
                    a = time.perf_counter()
                    time.sleep(v)
                    b = time.perf_counter()
                    print('Requested pause: ' + str(v))
                    print('Measured pause (wall clock): ' + str(b - a))
                # Fetch without a proxy
                if requreip == 0:
                    webcontent = getHtml(url).encode('utf-8')  # fetched with a timeout to guard against 504s
                    # Genuine Douban pages contain this nav div; its absence means
                    # we were served an error page or a captcha instead
                    if not re.search(r'<div class="top-nav-doubanapp">',
                                     webcontent.decode('utf-8', 'ignore')):
                        raise Exception('Fetched page is not a valid book page: ' + filename)
                    # Write the page to the local file
                    webfile = open(filename, 'wb')
                    webfile.write(webcontent)
                    webfile.close()
                    print('Crawled: ' + url + ' category: ' + kinds)
                    iprefuse = 0  # success, leave the retry loop
                else:  # fetch through a proxy
                    print('Proxy: ' + daili1)
                    webcontent = getBinaryHtml(url, daili1)
                    if not re.search(r'<div class="top-nav-doubanapp">',
                                     webcontent.decode('utf-8', 'ignore')):
                        raise Exception('Fetched page is not a valid book page: ' + filename)
                    webfile = open(filename, 'wb')
                    webfile.write(webcontent)
                    webfile.close()
                    print('Crawled: ' + url + ' category: ' + kinds)
                    iprefuse = 0
                    dailino = dailino + 1
                    print('Proxy switches so far: ' + str(dailino))
                    if dailino > 20:
                        dailino = 0
                        requreip = 0  # fall back to direct fetching after 20 proxy uses
            except Exception as e:
                # urllib.error.HTTPError carries .code, urllib.error.URLError carries .reason
                print(url)
                if hasattr(e, 'code'):
                    print('Page missing or request took too long.')
                    print('Error code:', e.code)
                    if e.code == 404:
                        print('404 error, giving up on this book')
                        webe.append(bookno)
                        break
                elif hasattr(e, 'reason'):
                    print('Could not reach the host.')
                    print('Reason: ', e.reason)
                print(e)
                if requreip:
                    changeip = changeip + 1  # advance to the next proxy
                    if changeip == len(daili0):  # wrap around at the end of the list
                        changeip = 0
                    print('Switching proxy to: ' + daili0[changeip])
                    dailino = dailino + 1
                    print('Proxy switches so far: ' + str(dailino))
                    if dailino > 20:
                        dailino = 0
                        requreip = 0  # fall back to direct fetching after 20 proxy uses
                else:
                    print('IP banned or network down, switching to proxy mode')
                    requreip = 1
    print('Crawled ' + str(startbook + 100) + ' books so far')
    print()
    startbook = startbook + 100
    # Once more than 20 failures have piled up, append them to a log file
    if len(webe) > 20:
        print(webe)
        webep = open('../book/book.txt', 'a+')
        webep.write(','.join(webe) + '\n')
        webep.close()
        webe = []
# Flush any remaining failed booknos that never reached the >20 threshold
if webe:
    webep = open('../book/book.txt', 'a+')
    webep.write(','.join(webe) + '\n')
    webep.close()
end = time.perf_counter()
print('Total crawl time: %.03f seconds' % (end - start))