# -*- coding: utf-8 -*-import scrapy
from epub.items import EpubItem
class EpubdownloadSpider(scrapy.Spider):
    """Crawl ixdzs.com fiction listing pages and yield epub download metadata.

    parse()    walks each listing page and follows every epub-download link.
    newparse() extracts the book title and the direct download URL prefix.
    """

    name = 'epubdownload'
    # allowed_domains = ['https://www.ixdzs.com/sort/1/index_0_2_0_1.html']

    # Listing pages 1..50 of category 1.
    start_urls = [
        'http://www.ixdzs.com/sort/1/index_0_2_0_' + str(i) + '.html/'
        for i in range(1, 51)
    ]

    def parse(self, response):
        """Follow every epub download-page link found on a listing page."""
        hrefs = response.xpath(
            '//a[contains(@href,"/d") and contains(@href,"epub_down")]/@href'
        ).extract()
        # Iterate the extracted hrefs directly instead of range(len(...)).
        for href in hrefs:
            yield scrapy.Request(
                url='http://www.ixdzs.com/' + href,
                callback=self.newparse,
            )

    def newparse(self, response):
        """Build an EpubItem from a book detail page.

        The download link looks like ".../down?id=NNN&p=6"; everything after
        the final '=' is dropped so the stored URL ends with "=", ready for a
        download id/parameter to be appended later.
        """
        item = EpubItem()
        links = response.xpath(
            '//a[contains(@href,"down?id=") and contains(@href,"p=6")]/@href'
        ).extract()
        url = 'http://www.ixdzs.com/' + links[0]
        # Keep everything up to and including the last '='.
        parts = url.split('=')  # no longer shadows the builtin `list`
        item['down_url'] = '='.join(parts[:-1]) + '='
        nam = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        # Strip the fixed 5-character suffix from the <h1> title
        # (presumably a "下载" style suffix — TODO confirm against the site).
        item['name'] = nam[:-5]
        yield item
pipelines section (epub/pipelines.py)
# -*- coding: utf-8 -*-# Define your item pipelines here## Don't forget to add your pipeline to the ITEM_PIPELINES setting# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.htmlimport pymongo
import os
import json
class EpubPipeline(object):
    """Default no-op pipeline: hands every item through unchanged."""

    def process_item(self, item, spider):
        return item
class MongoPipeline(object):
    """Persist each scraped item into MongoDB.

    Items land in a collection named after the item class (e.g. "EpubItem").
    Connection settings come from MONGO_URI / MONGO_DB in settings.py.
    """

    def __init__(self, mongo_uri, mongo_db):
        # Only the parameters are stored here; the client is created lazily
        # in open_spider so construction has no side effects.
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy hook: build the pipeline from the project settings.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Collection name mirrors the item class name.
        name = item.__class__.__name__
        # insert_one replaces Collection.insert, deprecated in pymongo 3.x.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


class JsonPipeline(object):
    """Append each item as one JSON line to news.json in the working directory."""

    def process_item(self, item, spider):
        filename = os.path.join(os.getcwd(), 'news.json')
        # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable
        # instead of escaping it to \uXXXX sequences; write UTF-8 explicitly
        # so the output does not depend on the platform locale.
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
settings section (epub/settings.py)
# Scrapy project identity.
BOT_NAME = 'epub'

SPIDER_MODULES = ['epub.spiders']
NEWSPIDER_MODULE = 'epub.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'epub (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enabled pipelines, run in ascending priority order (Json before Mongo).
ITEM_PIPELINES = {
    # 'tutorial.pipelines.TextPipeline': 300,
    'epub.pipelines.JsonPipeline': 350,
    'epub.pipelines.MongoPipeline': 400,
}

# MongoDB connection settings read by MongoPipeline.from_crawler.
MONGO_URI = 'localhost'
MONGO_DB = 'epubdownload'
items section (epub/items.py)
import scrapy
class EpubItem(scrapy.Item):
    """One downloadable epub: its title and the download-URL prefix."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()      # book title, trailing suffix stripped by the spider
    down_url = scrapy.Field()  # download URL ending in '=' (id appended later)