import gevent
from gevent import monkey
monkey.patch_all()
import xlwt,xlrd,xlutils
from xlutils.copy import copy
import os,time
import requests
from parsel import Selector
class spider():
    """Crawl the 17k.com book library and store each category's listing in an .xls workbook.

    Typical use is ``spider().main(path)``: ``getLinks`` collects the category
    pages, ``createExcel`` prepares a workbook with one sheet per category and
    ``getStoryInfo`` is then run, one greenlet per queued page, until the
    ``NextLink`` queue is empty.
    """

    # Headers sent with every HTTP request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:30.0) Gecko/20100101 Firefox/30.0','Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'}
    # Prefix put in front of every href scraped from the site.
    baseUrl = "http://all.17k.com"
    # Number of crawl rounds completed so far; advanced by main().
    temp = 0
    # File the filled workbook is saved to after every stored page.
    resultFile = "E:/story.xls"
    # Seconds to wait for a response; without a limit one stalled download
    # would keep gevent.joinall() in main() waiting forever.
    timeout = 10
    # How many times main() lets the same page fail before dropping it.
    maxRetries = 3
    # Header row written at the top of every sheet.
    columns = ('storyCategory','storyName','storyLink','wordCounts','storyAuthor','storyState')

    def __init__(self):
        # Queue of [page URL, category name] pairs that still have to be
        # crawled. Created per instance so two spiders never share a queue.
        self.NextLink = []
        # Absolute category URLs and category names, filled by getLinks().
        self.mainLinks = []
        self.mainNames = []
        # Workbooks are built by createExcel(); None until it has run.
        self.book = None
        self.book1 = None
        self.newBook = None
        # Next free row of each sheet, keyed by category name.
        self._nextRow = {}

    def _fetch(self, url):
        """Download *url* and return the page parsed as a parsel ``Selector``.

        Raises:
            requests.RequestException: on connection errors, timeouts and
                HTTP error statuses.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # An error page would otherwise be parsed as a listing with no rows.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return Selector(text=response.text)

    def getLinks(self):
        """Collect the category pages of the book library.

        Fills ``self.mainLinks`` and ``self.mainNames`` and appends one
        ``[url, name]`` pair per category to ``self.NextLink``.

        Returns:
            The tuple ``(self.NextLink, self.mainNames, self.mainLinks)``.
        """
        html = self._fetch('http://all.17k.com/lib/book/2_0_0_0_2_0_1_0_1.html')
        links = []
        names = []
        # Read href and text from the same <a> node so that a link and its
        # name can never drift apart; the XPath skips the first anchor.
        for anchor in html.xpath('//dd[@class="allzplb"]/a[position() > 1 ]'):
            href = anchor.xpath('@href').extract_first()
            name = anchor.xpath('text()').extract_first()
            if href is None or name is None:
                continue
            links.append(self.baseUrl + href)
            names.append(name)
        self.mainLinks = links
        self.mainNames = names
        for link, name in zip(links, names):
            self.NextLink.append([link, name])
        return (self.NextLink, self.mainNames, self.mainLinks)

    def createExcel(self,path):
        """Create ``story.xls`` in *path* with one empty sheet per category.

        The saved file is read back with xlrd and copied with xlutils; the
        resulting writable workbook is kept in ``self.newBook`` for
        ``getStoryInfo``.

        Args:
            path: Directory for the blank workbook; created when missing.
        """
        os.makedirs(path, exist_ok=True)
        templateFile = os.path.join(path, "story.xls")
        # Start from a fresh workbook so that calling this method again does
        # not try to add sheets that already exist.
        self.book = xlwt.Workbook()
        for name in self.mainNames:
            self.book.add_sheet("{}".format(name))
        self.book.save(templateFile)
        # Reopen the file that was just written rather than a fixed location.
        self.book1 = xlrd.open_workbook(templateFile)
        self.newBook = copy(self.book1)
        self._nextRow = {}

    def _parseRows(self, html):
        """Return one ``[category, name, link, words, author, state]`` list per novel found in *html*."""
        table = '//tbody/tr[position() > 1]'
        categories = html.xpath(table + '/td[@class="td2"]/a/text()').extract()
        links = html.xpath('//span/a[@class="jt"]/@href').extract()
        names = html.xpath('//span/a[@class="jt"]/text()').extract()
        words = html.xpath(table + '/td[@class="td5"]/text()').extract()
        authors = html.xpath(table + '/td[@class="td6"]/a/@title').extract()
        states = html.xpath(table + '/td[@class="td8"]/em/text()').extract()
        # zip() stops at the shortest column, so a page with a missing cell
        # yields fewer rows instead of raising IndexError.
        return [
            [category, name, link, count, author, state.strip()]
            for category, name, link, count, author, state
            in zip(categories, names, links, words, authors, states)
        ]

    def _nextPageUrl(self, html):
        """Return the URL of the page behind the "下一页" pager link, or ``None`` when there is none."""
        for anchor in html.xpath('//div[@class="page"]/a'):
            if "下一页" not in anchor.xpath('span/text()').extract():
                continue
            href = anchor.xpath('@href').extract_first()
            # The pager link carries this placeholder href when it leads nowhere.
            if href is None or href == "javascript:void(0);":
                return None
            return self.baseUrl + href
        return None

    def getStoryInfo(self,storyLink,storyName,List):
        """Scrape one listing page and append its novels to the category sheet.

        Args:
            storyLink: URL of the listing page.
            storyName: Category name; also the name of the target sheet.
            List: Queue the ``[storyLink, storyName]`` pair is removed from
                once the page has been downloaded and parsed.

        Returns:
            ``self.NextLink``, which has received the following page of the
            category when one exists.

        Raises:
            requests.RequestException: if the page cannot be downloaded; the
                queue is left untouched in that case.
            ValueError: if ``[storyLink, storyName]`` is not in *List*.
        """
        html = self._fetch(storyLink)
        rows = self._parseRows(html)
        nextUrl = self._nextPageUrl(html)

        # Update the queue only after download and parsing succeeded, so a
        # failed page stays queued exactly once and can be retried.
        List.remove(['{}'.format(storyLink),'{}'.format(storyName)])
        if nextUrl is not None:
            self.NextLink.append(['{}'.format(nextUrl),'{}'.format(storyName)])

        sheet = self.newBook.get_sheet("{}".format(storyName))
        # Continue below whatever this sheet already holds instead of
        # assuming that every page has a fixed number of rows.
        row = self._nextRow.get(storyName, 0)
        if row == 0:
            rows.insert(0, list(self.columns))
        for cells in rows:
            for col, value in enumerate(cells):
                sheet.write(row, col, value)
            row += 1
        self._nextRow[storyName] = row

        # The page number reported is the current crawl round plus one.
        print("{}第{}页储存完成!".format(storyName,str(self.temp + 1)))
        self.newBook.save(self.resultFile)
        return (self.NextLink)

    def main(self,path):
        """Run the whole crawl and print the elapsed time in seconds.

        Args:
            path: Directory in which ``createExcel`` builds the blank workbook.
        """
        start = time.time()
        self.getLinks()
        self.createExcel(path)
        failures = {}
        while self.NextLink:
            # Snapshot the queue: the greenlets add and remove entries while
            # they run.
            batch = [(link, name) for link, name in self.NextLink]
            jobs = [gevent.spawn(self.getStoryInfo, link, name, self.NextLink) for link, name in batch]
            gevent.joinall(jobs)
            for (link, name), job in zip(batch, jobs):
                if job.successful():
                    continue
                failures[link] = failures.get(link, 0) + 1
                entry = [link, name]
                # A page that keeps failing would otherwise stay queued and
                # keep this loop running forever.
                if failures[link] >= self.maxRetries and entry in self.NextLink:
                    self.NextLink.remove(entry)
                    print("{}抓取失败,已放弃:{}".format(name, link))
            self.temp += 1
            time.sleep(1)
        end = time.time()
        print(end - start)
if __name__ == '__main__':
    # Give the instance its own name: rebinding ``spider`` would hide the
    # class behind the object created from it.
    crawler = spider()
    crawler.main("E:/spiderResult")
http://www.cnblogs.com/BigFishFly/p/6380016.html xpath用法
#分析定位html节点是获取信息的关键。这里用的是lxml模块,利用其lxml.html的xpath;
etree提供了HTML这个解析函数,例如:
nodes=tree.xpath(u"//div[@id='leftmenu']/h3[text()='text']/following-sibling::ul[1]")
至于“following-sibling::”前缀,就如其名所说,表示同一层的下一个节点:“following-sibling::*”就是任意下一个节点,而“following-sibling::ul”就是下一个ul节点。
为了缩小定位范围,往往还需要增加过滤条件。过滤的方法就是用“[”“]”把过滤条件加上
函数text()的意思则是取得节点包含的文本。比如:在<div>hello<p>world</p></div>中,用“div[text()='hello']”即可取得这个div,而world则是p的text()。
函数position()的意思是取得节点的位置。比如“li[position()=2]”表示取得第二个li节点,它也可以被省略为“li[2]”。