Scraping Qidian Book Information with urllib

import urllib.request
from urllib import error 
from urllib.parse import quote

from bs4 import BeautifulSoup
from mylog import MyLog
import string
import codecs
import time
import random

from myresource import UserAgents
from myresource import PROXIES

from save2excel import SaveQiDianBooksInfo as sdbi
from save2mysql import SaveQiDianBooksInfo2Mysql as sqdni2m
 
import pybloom_live


bloom=pybloom_live.BloomFilter(capacity=10000,error_rate=0.0001)  # Bloom filter for de-duplicating bookIds

class BookItem(object):
    categoryName=None
    middleUrl=None
    bookName=None
    newestChapter=None
    wordsNum=None
    updateTime=None
    authorName=None
    bookId=None     # set in spider() from the data-bid attribute
    authorId=None   # set in spider() from the author link
    

class GetQiDian(object):
    def __init__(self):
        # The base URL omits the page parameter, so the first page can be fetched first to read the total page count
        self.urlBase='https://www.qidian.com/all?orderId=&style=2&pagesize=20&siteid=1&pubflag=0&hiddenField=0&'
        self.mylog=MyLog()
        self.pages=self.getPages(self.urlBase)
        self.items=self.spider(self.urlBase,self.pages)
        self.pipelines(self.items)  # save to a txt file
        # save to Excel
        sdbi(self.items)
        # save to MySQL
        sqdni2m(self.items)
        
    # Get the total number of pages
    def getPages(self,urlBase):
        
        htmlResponse=self.getResponseContent(urlBase)
        soup=BeautifulSoup(htmlResponse,"lxml")
        liTags=soup.find_all('li',attrs={"class":"lbf-pagination-item"})
        tags=liTags[-2].find("a",attrs={"class":"lbf-pagination-page "})    
        totalPages=int(tags.get_text().strip())
        self.mylog.debug("总页数为"+str(totalPages))  #这里设置错误级别要高于或等于mylog中设置的标准错误级别  如果mylog中设置的标准级别为error,则不会输出
        return totalPages
    
    
    # Send the request and return the response body
    def getResponseContent(self,url):
        try:
            time.sleep(random.randint(0,5))
            url=quote(url,safe=string.printable)  # percent-encode the URL to avoid garbled non-ASCII characters in the GET request
            # response=urllib.request.urlopen(url)
           
            # Set up the proxy; the target site is https, so register it for both schemes
            p=self.getRandomProxy()
            proxy={'http':p,'https':p}
            # Create a ProxyHandler
            proxy_support=urllib.request.ProxyHandler(proxy)
            # Build an opener with the proxy handler
            opener=urllib.request.build_opener(proxy_support)
            # Add a random User-Agent header
            opener.addheaders=[('User-Agent',self.getRandomHeader())]
            # Install the opener globally; every later urlopen call then goes through the proxy
            urllib.request.install_opener(opener)
            # Open the URL via the installed opener
            response=urllib.request.urlopen(url)
        except error.URLError as e:
            self.mylog.error('Failed to fetch %s: %s' % (url, e))
        else:
            self.mylog.debug('Fetched %s successfully' % url)
            return response.read()
        
    # Return a random User-Agent
    def getRandomHeader(self):
        return random.choice(UserAgents)
    
    # Return a random proxy
    def getRandomProxy(self):
        return random.choice(PROXIES)
        
        
    def spider(self,baseUrl,pages):
        url=''
        items=[]
        for i in range(10):  # crawl only the first 10 pages (self.pages holds the full total)
            url=baseUrl+'page='+str(i)
            self.mylog.debug("Start crawling %s" % url)
            responseContent=self.getResponseContent(url)
            soup=BeautifulSoup(responseContent,"lxml")
            tableTag=soup.find('table',attrs={"class":"rank-table-list all"})
            tbodyTag=tableTag.find('tbody')
            trTags=tbodyTag.find_all('tr')
            for tag in trTags:
                bookItem=BookItem()
                bookItem.bookName=tag.find("a",attrs={"class":"name"}).get_text().strip()
                
                bookItem.categoryName=tag.find('a',attrs={'class':'type'}).get_text()+','+tag.find('a',attrs={'class':'go-sub-type'}).get_text()
                bookItem.middleUrl=tag.find('a',attrs={'class':'name'}).get('href')+"#Catalog"
                #bookItem.wordsNum=tag.find('span',attrs={'class':'total'}).get_text()
                bookItem.wordsNum=0
                bookItem.updateTime=tag.find('td',attrs={'class':'date'}).get_text()
                bookItem.authorName=tag.find('a',attrs={'class':'author'}).get_text()
                bookItem.bookId=tag.find('a',attrs={'class':'name'}).get('data-bid')
                
                if bookItem.bookId in bloom:
                    self.mylog.debug('Duplicate bookId encountered: '+bookItem.bookId)
                    break
                bloom.add(bookItem.bookId)
                
                bookItem.authorId=tag.find('a',attrs={'class':'author'}).get('href').split('/')[-1]
                bookItem.newestChapter=tag.find('a',attrs={'class':'chapter'}).get_text()
                items.append(bookItem)
                self.mylog.debug("爬取书名为<<%s>>信息成功" %bookItem.bookName)
        return items

    
    def pipelines(self,items):
        bookName='起点完本小说.txt'
        nowTime=time.strftime('%Y-%m-%d %H:%M:%S\r\n',time.localtime())
        with codecs.open(bookName,'w','utf8') as fp:
            fp.write('run time:%s' %nowTime)
            for item in items:
                fp.write('%s \t %s \t %s \t %s \t %s \t %s \r\n'
                         %(item.categoryName,item.bookName,item.middleUrl,item.wordsNum,item.updateTime,item.authorName))
                self.mylog.info('Saved data for book <<%s>> to "%s"...' %(item.bookName,bookName))
    
    
if __name__=='__main__':
    gqd=GetQiDian()
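
The Bloom filter created at the top of the script is what keeps duplicate bookIds out of items. A minimal standalone sketch of how pybloom_live behaves here (the sample ids are made up for illustration):

import pybloom_live

# A Bloom filter never gives false negatives; with error_rate=0.0001 roughly
# 1 in 10000 unseen ids might be wrongly reported as already seen.
seen = pybloom_live.BloomFilter(capacity=10000, error_rate=0.0001)

for book_id in ['1004608738', '1004608738', '1013328580']:   # hypothetical ids
    if book_id in seen:
        print('duplicate:', book_id)
        continue
    seen.add(book_id)
    print('new:', book_id)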

mylog.py is as follows:

import logging
import getpass
import sys


class MyLog(object):
    def __init__(self):
        self.user=getpass.getuser()
        self.logger=logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG) 
        
        
        # Derive the log file name from the script name
        self.logName=sys.argv[0][0:-3]+'.log'
        
        # Define the log format
        self.formatter=logging.Formatter('%(asctime)-12s %(filename)s %(funcName)s %(name)s %(message)s\r\n')
        
        # Define the handlers
        self.fileHandler=logging.FileHandler(self.logName,encoding='utf8')
        self.fileHandler.setFormatter(self.formatter)
        self.fileHandler.setLevel(logging.DEBUG)
        
        self.streamHandler=logging.StreamHandler()
        self.streamHandler.setFormatter(self.formatter)
        self.streamHandler.setLevel(logging.DEBUG)
        
        # Attach the handlers
        self.logger.addHandler(self.fileHandler)
        self.logger.addHandler(self.streamHandler)
      
    # Output methods, one per log level
    def debug(self,msg):
        self.logger.debug(msg)
        
    def error(self,msg):
        self.logger.error(msg)
        
    def warn(self,msg):
        self.logger.warning(msg)  # logger.warn() is deprecated; warning() is the supported name
        
    def info(self,msg):
        self.logger.info(msg)
        
    def critical(self,msg):
        self.logger.critical(msg)
        

if __name__=='__main__':
    m1=MyLog()
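
As the comment in getPages notes, a record only appears in the log when its level is at or above the level configured on the logger and its handlers. A quick sketch of that filtering, assuming the logger level were raised to ERROR:

from mylog import MyLog
import logging

log = MyLog()
log.logger.setLevel(logging.ERROR)             # raise the threshold for this demonstration
log.debug('Total pages: 100')                  # below ERROR, silently dropped
log.error('Failed to fetch the page listing')  # at ERROR, written to both file and console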

User agents and proxies, myresource.py is as follows:

UserAgents=[
  "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
  "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
  "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
  "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
  "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
  "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
  "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
  "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
  "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
  "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]

# The getProxy() from the Scrapy project can be used to fetch a list of proxy servers.
PROXIES=[
'58.20.238.103:9797',
'123.7.115.141:9797',
'121.12.149.18:2226',
'176.31.96.198:3128',
'61.129.129.72:8080',
'115.238.228.9:8080',
'124.232.148.3:3128',
'124.88.67.19:80',
'60.251.63.159:8080',
'118.180.15.152:8102'    
]
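
Free proxies such as the ones above go stale quickly. A minimal sketch for filtering PROXIES down to the entries that still answer; the test URL and 5-second timeout are arbitrary choices, not part of the original script:

import urllib.request
from myresource import PROXIES

def alive_proxies(proxies, test_url='https://www.qidian.com', timeout=5):
    good = []
    for p in proxies:
        handler = urllib.request.ProxyHandler({'http': p, 'https': p})
        opener = urllib.request.build_opener(handler)
        try:
            opener.open(test_url, timeout=timeout)   # any response means the proxy is reachable
            good.append(p)
        except Exception:
            pass                                     # timed out or refused: drop this proxy
    return good

if __name__ == '__main__':
    print(alive_proxies(PROXIES))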

Saving the scraped data to Excel, save2excel.py is as follows:

import xlwt

class SaveQiDianBooksInfo(object):
    def __init__(self,items):
        self.items=items
        self.run(self.items)
        
    def run(self,items):
        fileName='起点.xls'
        wb=xlwt.Workbook()
        ws=wb.add_sheet('书籍信息')
        
        # Write the header row
        ws.write(0,0,'类别名')
        ws.write(0,1,'书籍地址')
        ws.write(0,2,'书籍编号')
        ws.write(0,3,'书籍名')
        ws.write(0,4,'最新章节')
        ws.write(0,5,'字数')
        ws.write(0,6,'更新时间')
        ws.write(0,7,'作者名')
        ws.write(0,8,'作者编号')
        
        for i in range(len(items)):
            item=items[i]
            ws.write(i+1,0,item.categoryName)
            ws.write(i+1,1,item.middleUrl)
            ws.write(i+1,2,item.bookId)
            ws.write(i+1,3,item.bookName)
            ws.write(i+1,4,item.newestChapter)
            ws.write(i+1,5,item.wordsNum)
            ws.write(i+1,6,item.updateTime)
            ws.write(i+1,7,item.authorName)
            ws.write(i+1,8,item.authorId)
            
        wb.save(fileName)
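
A quick way to check the Excel output without running the whole spider is to feed SaveQiDianBooksInfo a hand-built object carrying the same attributes as BookItem; the values below are made up for illustration:

from types import SimpleNamespace
from save2excel import SaveQiDianBooksInfo

fake = SimpleNamespace(categoryName='fantasy', middleUrl='//book.qidian.com/info/1#Catalog',
                       bookId='1', bookName='sample book', newestChapter='chapter 1',
                       wordsNum=0, updateTime='2018-01-01', authorName='someone', authorId='100')
SaveQiDianBooksInfo([fake])   # writes 起点.xls with one data row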

Saving the scraped data to MySQL, save2mysql.py is as follows:

import pymysql


class SaveQiDianBooksInfo2Mysql(object):
    
    def __init__(self,items,host='localhost',port=3306,user='your_username',passwd='your_password',db='qidiandb'):
        self.host=host
        self.port=port
        self.user=user
        self.passwd=passwd
        self.db=db
        self.run(items)
        
    def run(self,items):
        conn=pymysql.connect(host=self.host,port=self.port,user=self.user,passwd=self.passwd,db=self.db,charset='utf8')
        cur=conn.cursor()
        
       
        
        # TODO: the default SQL statement buffer of a fresh MySQL install is not very large, hence the batched inserts below
        params=[]
        for i in range(len(items)):
            item=items[i]
            params.append((item.bookId,item.categoryName,item.middleUrl,item.bookName,item.newestChapter,
                           item.wordsNum,item.updateTime,item.authorName,item.authorId))
            
            if (i+1)%1000==0:
                print('Committing a batch of 1000 records')
                try:
                    sql='insert into bookitem(bookId,categoryName,middleUrl,bookName,newestChapter,wordsNum,updateTime,authorName,authorId) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
                    cur.executemany(sql,params)
                    conn.commit()
                    params=[]
                except Exception as e:
                    print(e)
                    conn.rollback()
        
        try:
            sql='insert into bookitem(bookId,categoryName,middleUrl,bookName,newestChapter,wordsNum,updateTime,authorName,authorId) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            cur.executemany(sql,params)
            conn.commit()
        except Exception as e:
            print(e)
            conn.rollback()
            
        
        cur.close()
        conn.close()
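
The INSERT statements above assume a bookitem table already exists in the qidiandb database. A minimal sketch of a table definition matching the nine columns; the column types and lengths are assumptions, not taken from the original post:

import pymysql

ddl = '''
CREATE TABLE IF NOT EXISTS bookitem (
    bookId        VARCHAR(20),
    categoryName  VARCHAR(50),
    middleUrl     VARCHAR(200),
    bookName      VARCHAR(100),
    newestChapter VARCHAR(200),
    wordsNum      VARCHAR(20),
    updateTime    VARCHAR(50),
    authorName    VARCHAR(50),
    authorId      VARCHAR(20)
) DEFAULT CHARSET=utf8
'''

conn = pymysql.connect(host='localhost', port=3306, user='your_username',
                       passwd='your_password', db='qidiandb', charset='utf8')
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()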
