朋友的爬虫实例（采用协程池的方式），运用了 Python 的 class

最新推荐文章于 2023-04-19 16:44:40 发布

星星火_

最新推荐文章于 2023-04-19 16:44:40 发布

阅读量830

点赞数

文章标签：爬虫 xpath Python 类

本文链接：https://blog.csdn.net/qq_42276808/article/details/83115073

版权

import gevent
from gevent import monkey
# Apply gevent's monkey patches here, ahead of the remaining imports
# (requests is imported after this call). gevent's documentation advises
# patching as early as possible, so keep this line above the imports below.
monkey.patch_all()
import xlwt,xlrd,xlutils
from xlutils.copy import copy
import os,time
import requests
from parsel import Selector
class spider():
    """Crawl the novel listings on all.17k.com into an .xls workbook.

    Workflow (see ``main``): ``getLinks`` collects one listing URL per novel
    category, ``createExcel`` prepares a workbook with one sheet per category,
    and ``getStoryInfo`` is then run in gevent greenlets, one listing page per
    queued entry, until no "next page" link is left in the queue.

    Attributes:
        temp: Zero-based counter of crawl rounds; each round fetches one
            listing page per queued category.
        headers: HTTP headers sent with every request.
        baseUrl: Site root that relative links are appended to.
        pageSize: Number of worksheet rows reserved per listing page
            (presumably the number of novels the site shows per page).
        columnTitles: Header row written on the first page of each sheet.
        NextLink: Work queue of ``[url, categoryName]`` pairs still to fetch.
        book: The ``xlwt`` workbook the category sheets are added to.
    """

    temp = 0
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    }
    baseUrl = "http://all.17k.com"
    pageSize = 30
    columnTitles = ('storyCategory', 'storyName', 'storyLink',
                    'wordCounts', 'storyAuthor', 'storyState')

    def __init__(self):
        # Mutable state lives on the instance: as class attributes the queue
        # and the workbook were shared by every spider object.
        self.NextLink = []
        self.book = xlwt.Workbook()

    def getLinks(self):
        """Fetch the category index page and queue one listing URL per category.

        Returns:
            A tuple ``(NextLink, mainNames, mainLinks)``: the work queue, the
            category names, and the absolute category URLs.
        """
        response = requests.get('http://all.17k.com/lib/book/2_0_0_0_2_0_1_0_1.html', headers=self.headers)
        response.encoding = response.apparent_encoding
        page = Selector(text=response.text)
        relativeLinks = page.xpath('//dd[@class="allzplb"]/a[position() > 1 ]/@href').extract()
        self.mainNames = page.xpath('//dd[@class="allzplb"]/a[position() > 1 ]/text()').extract()
        self.mainLinks = [self.baseUrl + link for link in relativeLinks]
        # zip() pairs links with names without indexing past the shorter list.
        for link, name in zip(self.mainLinks, self.mainNames):
            self.NextLink.append(['{}'.format(link), '{}'.format(name)])
        return (self.NextLink, self.mainNames, self.mainLinks)

    def createExcel(self, path):
        """Create ``story.xls`` inside *path* with one sheet per category.

        The saved file is re-opened with ``xlrd`` and copied with
        ``xlutils.copy`` so that ``getStoryInfo`` can look sheets up by name
        on ``self.newBook``. The file location is remembered in
        ``self.savePath`` and reused for every later save.

        Args:
            path: Output directory; created if it does not exist.
        """
        os.makedirs(path, exist_ok=True)
        for name in self.mainNames:
            self.book.add_sheet("{}".format(name))
        self.savePath = os.path.join(path, "story.xls")
        self.book.save(self.savePath)
        # Re-open the file that was just written instead of a hard-coded
        # location, so any *path* works.
        self.book1 = xlrd.open_workbook(self.savePath)
        self.newBook = copy(self.book1)

    def _startRow(self):
        """Return the first worksheet row for the page fetched in this round."""
        if self.temp == 0:
            return 0
        # Row 0 of every sheet holds the header, hence the +1 on later pages.
        return self.temp * self.pageSize + 1

    def _buildRows(self, categories, names, links, wordCounts, authors, states):
        """Combine the per-column lists into worksheet rows.

        The header row is included only in the first round (``temp == 0``).
        Columns are paired with ``zip``, so lists of unequal length are
        truncated to the shortest one instead of raising ``IndexError``.
        """
        rows = [list(self.columnTitles)] if self.temp == 0 else []
        for category, name, link, count, author, state in zip(
                categories, names, links, wordCounts, authors, states):
            rows.append(['{}'.format(category), '{}'.format(name), '{}'.format(link),
                         '{}'.format(count), '{}'.format(author), '{}'.format(state.strip())])
        return rows

    def getStoryInfo(self, storyLink, storyName, List):
        """Scrape one listing page and write its novels to the category sheet.

        Args:
            storyLink: Absolute URL of the listing page.
            storyName: Category name; also the name of the target sheet.
            List: Queue from which the ``[storyLink, storyName]`` entry is
                removed.

        Returns:
            ``self.NextLink``, which has gained the category's next-page
            entry when the page advertises one.
        """
        entry = ['{}'.format(storyLink), '{}'.format(storyName)]
        # Dequeue first: if the download or parsing below fails, the entry
        # must not stay queued, otherwise main() would retry it forever.
        if entry in List:
            List.remove(entry)

        response = requests.get(storyLink, headers=self.headers)
        response.encoding = response.apparent_encoding
        page = Selector(text=response.text)
        self.storyCategory = page.xpath('//tbody/tr[position() > 1]/td[@class="td2"]/a/text()').extract()
        self.storyLinks = page.xpath('//span/a[@class="jt"]/@href').extract()
        self.storyNames = page.xpath('//span/a[@class="jt"]/text()').extract()
        self.wordCounts = page.xpath('//tbody/tr[position() > 1]/td[@class="td5"]/text()').extract()
        self.storyAuthors = page.xpath('//tbody/tr[position() > 1]/td[@class="td6"]/a/@title').extract()
        self.storyState = page.xpath('//tbody/tr[position() > 1]/td[@class="td8"]/em/text()').extract()

        # Read label and href from the same <a> element so the two can never
        # get out of step when some pager anchors have no <span> label.
        for anchor in page.xpath('//div[@class="page"]/a'):
            label = anchor.xpath('span/text()').extract_first()
            href = anchor.xpath('@href').extract_first()
            if label == "下一页":
                if href and href != "javascript:void(0);":
                    self.nextLink = self.baseUrl + href
                    self.NextLink.append(['{}'.format(self.nextLink), '{}'.format(storyName)])
                    break

        rows = self._buildRows(self.storyCategory, self.storyNames, self.storyLinks,
                               self.wordCounts, self.storyAuthors, self.storyState)
        sheet = self.newBook.get_sheet("{}".format(storyName))
        firstRow = self._startRow()
        for rowOffset, record in enumerate(rows):
            for col, value in enumerate(record):
                sheet.write(firstRow + rowOffset, col, value)
        print("{}第{}页储存完成!".format(storyName, str(self.temp + 1)))
        # Save next to the template created by createExcel rather than to a
        # hard-coded drive location.
        self.newBook.save(self.savePath)
        return self.NextLink

    def main(self, path):
        """Run the whole crawl and print the elapsed time in seconds.

        Args:
            path: Directory in which ``story.xls`` is created and updated.
        """
        start = time.time()
        self.getLinks()
        self.createExcel(path)
        while self.NextLink:
            # Snapshot the queue: the greenlets remove and append entries
            # while they run.
            pending = list(self.NextLink)
            jobs = [gevent.spawn(self.getStoryInfo, link, name, self.NextLink)
                    for link, name in pending]
            gevent.joinall(jobs)
            self.temp += 1
            time.sleep(1)
        end = time.time()
        print(end - start)
if __name__ == '__main__':
    # Give the instance its own name so the `spider` class is not rebound
    # to an object, which would make the class unusable afterwards.
    crawler = spider()
    crawler.main("E:/spiderResult")