# -*- coding: utf-8 -*-
import os

import scrapy

from qidian.items import QidianItem, Zhangjie, BookItem
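
# The three item classes imported above live in qidian/items.py, which is not
# shown here. A minimal sketch of what that file would need to declare, based
# solely on the fields this spider assigns (an assumption, not the project's
# actual code), might look like:
#
#     import scrapy
#
#     class QidianItem(scrapy.Item):        # one listing entry per book
#         book_name = scrapy.Field()
#         book_author = scrapy.Field()
#         book_type = scrapy.Field()
#         book_status = scrapy.Field()
#         book_img = scrapy.Field()
#         book_url = scrapy.Field()
#
#     class Zhangjie(scrapy.Item):          # one table-of-contents entry
#         book_name = scrapy.Field()
#         book_author = scrapy.Field()
#         zhangjie_id = scrapy.Field()
#         book_zhangjie = scrapy.Field()
#         book_zhangjie_url = scrapy.Field()
#
#     class BookItem(scrapy.Item):          # one chapter's text
#         book_name = scrapy.Field()
#         book_author = scrapy.Field()
#         book_mulu = scrapy.Field()
#         zhangjie_id = scrapy.Field()
#         book_text = scrapy.Field()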


class XiaoshuoSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ["qidian.com"]
    # page the crawl starts from
    start_urls = ['https://www.qidian.com/free/all?orderId=&page=1&vip=hidden&style=1&pageSize=50&siteid=1&pubflag=0&hiddenField=1']

    def parse(self, response):
        # locate the book entries in the HTML with XPath
        li_list = response.xpath("//div[@class='book-img-text']//ul/li")
        for li in li_list:  # iterate over every book on the current page
            item = QidianItem()  # one item per book
            # book title
item["book_name"] = li.xpath("./div[@class='book-mid-info']/h4/a/text()").extract_first()#作者
item["book_author"] = li.xpath("./div[@class='book-mid-info']/p[@class='author']/a[@class='name']/text()").extract_first()#书的类型,玄幻,修真,都市之类的
item["book_type"] = li.xpath("./div[@class='book-mid-info']/p[@class='author']/a[@data-eid='qd_B60']/text()").extract_first()#书的状态,,连载还是完结
item["book_status"] = li.xpath("./div[@class='book-mid-info']/p[@class='author']/span/text()").extract_first()#书的封面图片
item["book_img"] = "http:" + li.xpath("./div[@class='book-img-box']/a/img/@src").extract_first()#书的url
item["book_url"] ="http:" + li.xpath("./div[@class='book-img-box']/a/@href").extract_first()yieldscrapy.Request(
item["book_url"],
                callback=self.parseBookUrl,  # parseBookUrl scrapes each book's table of contents from its URL
                # pass the data parseBookUrl needs via meta, as a dict; only the
                # book title and author are forwarded here
                meta={"book_name": item["book_name"].strip(), "book_author": item["book_author"].strip()}
            )
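
        # Note: this spider hands values between callbacks through request.meta.
        # Recent Scrapy versions (>= 1.7) also offer cb_kwargs, which delivers
        # the values as plain keyword arguments to the callback. A sketch, not
        # part of the original code:
        #
        #     yield scrapy.Request(item["book_url"], callback=self.parseBookUrl,
        #                          cb_kwargs={"book_name": item["book_name"].strip(),
        #                                     "book_author": item["book_author"].strip()})
        #
        # with the callback declared as:
        #
        #     def parseBookUrl(self, response, book_name, book_author): ...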
        # follow the remaining pages
        page_num = len(response.xpath("//div[@class='pagination fr']//ul/li"))
        for i in range(page_num - 2):  # there are only 5 pages, so this rough count is good enough
            next_url = 'http://www.qidian.com/free/all?orderId=&vip=hidden&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=1&page={}'.format(i + 1)
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
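
        # An alternative (a sketch, not the original approach) is to follow the
        # pagination's own "next" link instead of rebuilding the URL by hand,
        # which would also keep pageSize consistent with start_urls. The
        # selector below is a guess and would need checking against the real
        # markup:
        #
        #     next_href = response.xpath("//div[@class='pagination fr']//a[contains(@class, 'next')]/@href").extract_first()
        #     if next_href:
        #         yield response.follow(next_href, callback=self.parse)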

    def parseBookUrl(self, response):
        # unpack the data forwarded via meta
        bookname = response.meta["book_name"]
        bookauthor = response.meta["book_author"]
        # a single Zhangjie item carries the fields; its current values are
        # copied into the meta dict at each yield below, so reusing the one
        # instance across iterations is safe here
        zhangjie = Zhangjie()
        zhangjie["book_name"] = bookname
        zhangjie["book_author"] = bookauthor
        book_div = response.xpath("//div[@class='volume-wrap']//div[@class='volume']")
        i = 0  # running chapter counter across all volumes
        for div in book_div:  # iterate over the novel's volumes
            li_list = div.xpath("./ul/li")
            for li in li_list:  # iterate over the volume's chapters
                i = i + 1
                zhangjie["zhangjie_id"] = i
                zhangjie["book_zhangjie"] = li.xpath("./a/text()").extract_first()
                # chapter URL; the chapter text is scraped from it below
                zhangjie["book_zhangjie_url"] = "http:" + li.xpath("./a/@href").extract_first()
                yield scrapy.Request(
                    zhangjie["book_zhangjie_url"],
                    callback=self.parseZhangjieUrl,
                    # pass the data parseZhangjieUrl needs via meta
                    meta={"book_name": zhangjie["book_name"].strip(), "zhangjie": zhangjie["book_zhangjie"].strip(), "book_author": bookauthor, "zhangjie_id": zhangjie["zhangjie_id"]}
                )

    def parseZhangjieUrl(self, response):
        # unpack the data forwarded via meta
        bookname = response.meta["book_name"]
        zhangjie = response.meta["zhangjie"]
        book_author = response.meta["book_author"]
        zhangjie_id = response.meta["zhangjie_id"]
        bookitem = BookItem()
        bookitem["book_name"] = bookname
        bookitem["book_mulu"] = zhangjie
        bookitem["book_author"] = book_author
        bookitem["zhangjie_id"] = zhangjie_id
        div_list = response.xpath("//div[@class='main-text-wrap']")
        p_list = div_list.xpath("./div[@class='read-content j_readContent']/p")
        file_path = "./books/" + bookname + "/" + bookitem["book_mulu"] + ".txt"
        content = ""
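        # Caveat: the chapter title goes straight into the file name, so a
        # title containing a path separator would break the open() below. A
        # small sanitizer (hypothetical helper, not in the original) could
        # guard against that:
        #
        #     import re
        #     safe_mulu = re.sub(r'[\\/:*?"<>|]', '_', bookitem["book_mulu"])
        #     file_path = "./books/" + bookname + "/" + safe_mulu + ".txt"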
        # walk every <p> tag, collect its text, and stitch the chapter together
        for p in p_list:
            # extract_first(default="") avoids a TypeError on empty paragraphs
            content += p.xpath("./text()").extract_first(default="") + "\n\n"
        bookitem["book_text"] = content
        # create the book's directory if it does not exist yet
        if not os.path.exists("./books/" + bookname):
            os.makedirs("./books/" + bookname)
        # append the chapter to its txt file
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(" " + bookitem["book_mulu"] + "\n\n\n\n" + content)
        yield bookitem  # hand the item on to pipelines.py
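
# The final yield hands each BookItem to the project's pipelines.py, which is
# not shown here. A minimal sketch of a pipeline that would receive these items
# (an assumption about the project, not its actual code):
#
#     class QidianPipeline:
#         def process_item(self, item, spider):
#             # e.g. persist item["book_name"], item["book_mulu"], item["book_text"]
#             return item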