使用urllib爬取起点书籍信息
import urllib.request
from urllib import error
from urllib.parse import quote
from bs4 import BeautifulSoup
from mylog import MyLog
import string
import codecs
import time
import random
from myresource import UserAgents
from myresource import PROXIES
from save2excel import SaveQiDianBooksInfo as sdbi
from save2mysql import SaveQiDianBooksInfo2Mysql as sqdni2m
import pybloom_live
# Module-level Bloom filter shared by the spider: holds up to 10000 bookIds
# with a 0.01% false-positive rate, used to skip books already crawled.
bloom=pybloom_live.BloomFilter(capacity=10000,error_rate=0.0001) # dedupe bookIds
class BookItem(object):
    """Plain container for one book scraped from the Qidian listing page.

    All fields default to None at class level; the spider fills them in
    per instance, and the txt/Excel/MySQL pipelines read them back.
    """
    categoryName = None    # main type + sub type, joined with ','
    middleUrl = None       # book detail URL with '#Catalog' appended
    bookName = None
    newestChapter = None
    wordsNum = None        # word count (the spider currently stores 0)
    updateTime = None
    authorName = None
    # Declared explicitly so every item carries them even if assignment is
    # skipped; both save2excel and save2mysql read these two fields.
    bookId = None
    authorId = None
class GetQiDian(object):
    """Crawl the Qidian 'all books' listing, dedupe books via the module-level
    Bloom filter, and hand the items to the txt / Excel / MySQL pipelines."""

    def __init__(self):
        # The base URL deliberately omits the page parameter so the first
        # request can double as the total-page-count probe.
        self.urlBase = ('https://www.qidian.com/all?orderId=&style=2&pagesize=20'
                        '&siteid=1&pubflag=0&hiddenField=0&')
        self.mylog = MyLog()
        self.pages = self.getPages(self.urlBase)
        self.items = self.spider(self.urlBase, self.pages)
        self.pipelines(self.items)  # plain-text file
        sdbi(self.items)            # Excel
        sqdni2m(self.items)         # MySQL

    def getPages(self, urlBase):
        """Return the total number of listing pages parsed from the pager."""
        htmlResponse = self.getResponseContent(urlBase)
        soup = BeautifulSoup(htmlResponse, "lxml")
        liTags = soup.find_all('li', attrs={"class": "lbf-pagination-item"})
        # The second-to-last pager item holds the last page number (the very
        # last item is the "next page" arrow). NOTE(review): the trailing
        # space in the class name matches the site markup — keep it.
        tags = liTags[-2].find("a", attrs={"class": "lbf-pagination-page "})
        totalPages = int(tags.get_text().strip())
        # Only emitted when MyLog's configured threshold is DEBUG or lower.
        self.mylog.debug("总页数为" + str(totalPages))
        return totalPages

    def getResponseContent(self, url):
        """Fetch *url* through a random proxy with a random User-Agent.

        Returns the raw response bytes, or None when the request fails
        (callers must check for None).
        """
        try:
            time.sleep(random.randint(0, 5))  # polite random delay
            # Percent-encode non-ASCII characters so GET requests containing
            # Chinese text do not break.
            url = quote(url, safe=string.printable)
            proxy = {'http': self.getRandomProxy()}
            proxy_support = urllib.request.ProxyHandler(proxy)
            opener = urllib.request.build_opener(proxy_support)
            opener.addheaders = [('User-Agent', self.getRandomHeader())]
            # Installing the opener makes all subsequent urlopen calls in
            # this process use the proxy as well.
            urllib.request.install_opener(opener)
            response = urllib.request.urlopen(url)
        except error.URLError as e:
            # BUG FIX: the original wrote `'…%s…%s' % url, e`, which raises
            # TypeError inside the handler — the tuple must wrap both args.
            self.mylog.error('爬取%s失败,原因%s' % (url, e))
            # Explicit None so callers can tell the fetch failed.
            return None
        else:
            self.mylog.debug('爬取%s成功' % url)
            return response.read()

    def getRandomHeader(self):
        """Pick one User-Agent string at random."""
        return random.choice(UserAgents)

    def getRandomProxy(self):
        """Pick one 'host:port' proxy at random."""
        return random.choice(PROXIES)

    def spider(self, baseUrl, pages):
        """Parse up to 10 listing pages into a list of BookItem objects."""
        items = []
        # BUG FIX: the original looped range(10) => page=0..9, ignoring the
        # discovered page count. Qidian pages are 1-based; cap at 10 pages.
        for i in range(1, min(pages, 10) + 1):
            url = baseUrl + 'page=' + str(i)
            self.mylog.debug("开始爬取%s" % url)
            responseContent = self.getResponseContent(url)
            if responseContent is None:
                continue  # fetch failed — skip this page instead of crashing
            soup = BeautifulSoup(responseContent, "lxml")
            tableTag = soup.find('table', attrs={"class": "rank-table-list all"})
            tbodyTag = tableTag.find('tbody')
            trTags = tbodyTag.find_all('tr')
            for tag in trTags:
                bookItem = BookItem()
                bookItem.bookName = tag.find("a", attrs={"class": "name"}).get_text().strip()
                bookItem.categoryName = (tag.find('a', attrs={'class': 'type'}).get_text()
                                         + ',' + tag.find('a', attrs={'class': 'go-sub-type'}).get_text())
                bookItem.middleUrl = tag.find('a', attrs={'class': 'name'}).get('href') + "#Catalog"
                # The word count is not scraped on this layout; default to 0.
                bookItem.wordsNum = 0
                bookItem.updateTime = tag.find('td', attrs={'class': 'date'}).get_text()
                bookItem.authorName = tag.find('a', attrs={'class': 'author'}).get_text()
                bookItem.bookId = tag.find('a', attrs={'class': 'name'}).get('data-bid')
                # Bloom-filter dedupe: a repeated id means this row was seen
                # before, so stop scanning the rest of this page.
                if bookItem.bookId in bloom:
                    self.mylog.debug('爬取到重复的bookId' + bookItem.bookId)
                    break
                bloom.add(bookItem.bookId)
                bookItem.authorId = tag.find('a', attrs={'class': 'author'}).get('href').split('/')[-1]
                bookItem.newestChapter = tag.find('a', attrs={'class': 'chapter'}).get_text()
                items.append(bookItem)
                self.mylog.debug("爬取书名为<<%s>>信息成功" % bookItem.bookName)
        return items

    def pipelines(self, items):
        """Write all items to a UTF-8 text file, one tab-separated line each."""
        bookName = '起点完本小说.txt'
        nowTime = time.strftime('%Y-%m-%d %H:%M:%S\r\n', time.localtime())
        with codecs.open(bookName, 'w', 'utf8') as fp:
            fp.write('run time:%s' % nowTime)
            for item in items:
                fp.write('%s \t %s \t %s \t %s \t %s \t %s \r\n'
                         % (item.categoryName, item.bookName, item.middleUrl,
                            item.wordsNum, item.updateTime, item.authorName))
                self.mylog.info(u'将书名为<<%s>>的数据存入"%s"...' % (item.bookName, bookName))
if __name__ == '__main__':
    # Constructing GetQiDian runs the entire crawl-and-store pipeline.
    gqd = GetQiDian()
日志模块 mylog.py 如下:
import logging
import getpass
import sys
class MyLog(object):
    """Thin convenience wrapper around the stdlib logging module.

    Logs at DEBUG level both to '<script name>.log' and to the console.
    Safe to instantiate multiple times: handlers are attached to the shared
    logger only once, so repeated construction no longer duplicates output.
    """

    def __init__(self):
        self.user = getpass.getuser()
        # getLogger returns the SAME logger object for the same name.
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # Derive the log file from the running script: 'spider.py' -> 'spider.log'.
        self.logName = sys.argv[0][0:-3] + '.log'
        self.formatter = logging.Formatter(
            '%(asctime)-12s %(filename)s %(funcName)s %(name)s %(message)s\r\n')
        self.fileHandler = logging.FileHandler(self.logName, encoding='utf8')
        self.fileHandler.setFormatter(self.formatter)
        self.fileHandler.setLevel(logging.DEBUG)
        self.streamHandler = logging.StreamHandler()
        self.streamHandler.setFormatter(self.formatter)
        self.streamHandler.setLevel(logging.DEBUG)
        # BUG FIX: the original added a fresh pair of handlers on EVERY
        # MyLog() call, so each message was emitted once per instantiation.
        # Attach handlers only if the shared logger has none yet.
        if not self.logger.handlers:
            self.logger.addHandler(self.fileHandler)
            self.logger.addHandler(self.streamHandler)

    # Per-level passthroughs so callers never touch self.logger directly.
    def debug(self, msg):
        self.logger.debug(msg)

    def error(self, msg):
        self.logger.error(msg)

    def warn(self, msg):
        # logger.warn() is deprecated; warning() is the supported spelling.
        self.logger.warning(msg)

    def info(self, msg):
        self.logger.info(msg)

    def critical(self, msg):
        self.logger.critical(msg)
if __name__ == '__main__':
    # Smoke test: constructing MyLog wires up both handlers.
    m1 = MyLog()
User-Agent 与代理资源模块 myresource.py 如下:
# Pool of desktop-browser User-Agent strings; the spider picks one at random
# per request so consecutive requests do not share an identical fingerprint.
UserAgents=[
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]
# HTTP proxy servers ('host:port'); one is chosen at random per request.
# A fresh proxy list can be obtained with getProxy() from the Scrapy project.
# NOTE(review): public proxies go stale quickly — verify these before a run.
PROXIES=[
'58.20.238.103:9797',
'123.7.115.141:9797',
'121.12.149.18:2226',
'176.31.96.198:3128',
'61.129.129.72:8080',
'115.238.228.9:8080',
'124.232.148.3:3128',
'124.88.67.19:80',
'60.251.63.159:8080',
'118.180.15.152:8102'
]
爬取数据存储到excel save2excel.py如下:
import xlwt
class SaveQiDianBooksInfo(object):
    """Persist scraped BookItem objects to an Excel (.xls) workbook via xlwt."""

    def __init__(self, items, fileName='起点.xls'):
        """*fileName* is new but defaulted, so existing callers are unaffected."""
        self.items = items
        self.run(self.items, fileName)

    def run(self, items, fileName='起点.xls'):
        """Write a header row plus one row per item into *fileName*."""
        wb = xlwt.Workbook()
        ws = wb.add_sheet('书籍信息')
        # Column order shared by the header row and the data rows below.
        headers = ('类别名', '书籍地址', '书籍编号', '书籍名', '最新章节',
                   '字数', '更新时间', '作者名', '作者编号')
        for col, title in enumerate(headers):
            ws.write(0, col, title)
        # Row 0 is the header, so data rows start at 1.
        for row, item in enumerate(items, start=1):
            values = (item.categoryName, item.middleUrl, item.bookId,
                      item.bookName, item.newestChapter, item.wordsNum,
                      item.updateTime, item.authorName, item.authorId)
            for col, value in enumerate(values):
                ws.write(row, col, value)
        wb.save(fileName)
爬取数据存储到MySQL save2mysql.py如下:
import pymysql
class SaveQiDianBooksInfo2Mysql(object):
    """Bulk-insert scraped BookItem objects into the MySQL table `bookitem`."""

    # Single definition of the insert statement (the original duplicated it).
    _INSERT_SQL = ('insert into bookitem(bookId,categoryName,middleUrl,bookName,'
                   'newestChapter,wordsNum,updateTime,authorName,authorId) '
                   'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)')

    def __init__(self, items, host='localhost', port=3306, user='你的用户名',
                 passwd='你的密码', db='qidiandb'):
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.db = db
        self.run(items)

    def run(self, items):
        """Insert all items in batches of 1000 rows per executemany call."""
        conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                               passwd=self.passwd, db=self.db, charset='utf8')
        # try/finally guarantees the cursor and connection are closed even if
        # an unexpected exception escapes (the original leaked both).
        try:
            cur = conn.cursor()
            try:
                params = []
                for i, item in enumerate(items):
                    params.append((item.bookId, item.categoryName, item.middleUrl,
                                   item.bookName, item.newestChapter, item.wordsNum,
                                   item.updateTime, item.authorName, item.authorId))
                    # Flush every 1000 rows so one statement stays well under
                    # MySQL's default max_allowed_packet.
                    if (i + 1) % 1000 == 0:
                        params = self._flush(conn, cur, params)
                # Final partial batch (also retries rows kept after a failure).
                self._flush(conn, cur, params)
            finally:
                cur.close()
        finally:
            conn.close()

    def _flush(self, conn, cur, params):
        """executemany + commit *params*; rollback on failure.

        Returns [] on success; on failure the rows are returned unchanged so
        a later flush retries them (matching the original's accumulation).
        """
        if not params:
            return []
        try:
            cur.executemany(self._INSERT_SQL, params)
            conn.commit()
            return []
        except Exception as e:
            print(e)
            conn.rollback()
            return params