scrapy spider文件
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import sys
# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str/unicode conversions use UTF-8. reload(sys) is required because
# sys.setdefaultencoding is removed from the module during interpreter
# start-up. Python 3 has neither name (and already defaults to UTF-8), so the
# hack is skipped there instead of failing with NameError at import time.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')
class SplashSpider(scrapy.Spider):
    """Crawl hkxs.net from the home page down to individual chapter pages.

    Callback chain:
        parse          -- home page; follows every category link in the nav bar
        parse_cate     -- category page; follows every listed book
        parse_book     -- book page; collects book metadata and follows every
                          chapter link, passing the metadata along in ``meta``
        parse_chapter  -- chapter page; returns one plain dict per chapter,
                          which the configured item pipeline receives

    Scraped text fields are UTF-8 encoded with ``.encode('utf-8')`` before
    they are printed or placed in the item.
    """

    name = 'hkxs'  # spider name
    allowed_domains = ['hkxs.net']
    start_urls = [
        'http://www.hkxs.net/',
    ]
    # Per-spider settings: route items to the hkxs pipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.HkxsPipeline': 300,
        }
    }

    @staticmethod
    def _first(node, query, default=''):
        """Return the first string matched by XPath *query* under *node*.

        *node* is a response or selector. *default* is returned when nothing
        matches, so an optional element that is absent does not raise.
        """
        matches = node.xpath(query).extract()
        return matches[0] if matches else default

    def start_requests(self):
        """Yield one request per entry in ``start_urls`` (default callback: ``parse``)."""
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        """Follow every category link found in the navigation bar."""
        cates_href = response.xpath('//div[@class="nav"]/ul/li//a/@href').extract()
        for url in cates_href:
            # javascript: pseudo-links (e.g. 'javascript:showbook1();') are
            # not pages and cannot be requested.
            if url.strip().lower().startswith('javascript:'):
                continue
            yield Request(response.urljoin(url), callback=self.parse_cate)

    def parse_cate(self, response):
        """Follow every book listed in the two book lists of a category page."""
        print('cate_url')
        print(response.url)
        # Category name, taken from the page title (only used for the log line).
        cate = self._first(response, '//title/text()')
        # The page has two lists: div.l (first part) and div.r (second part).
        # In both, the book link is the <a> inside the first <span> of each <li>.
        for css_class, count_label in (('l', 'count1:'), ('r', 'count2:')):
            rows = response.xpath('//div[@class="%s"]/ul/li' % css_class)
            print(count_label)
            print(len(rows))
            for row in rows:
                # Query relative to the row instead of re-scanning the whole
                # document by index for every field.
                book_url = self._first(row, './span[1]/a/@href')
                if not book_url:
                    # A row without a link must not abort the rest of the list.
                    continue
                print(self._first(row, './span[1]/a/text()').encode('utf-8'))
                # NOTE: a SplashRequest with the same arguments was previously
                # tried here; a plain Request is used.
                yield Request(
                    response.urljoin(book_url),
                    meta={'source': 'hkxs'},
                    callback=self.parse_book,
                )
        print('cate' + cate + 'over')

    def parse_book(self, response):
        """Collect book metadata and request every chapter of the book.

        The metadata travels to ``parse_chapter`` through ``meta``, together
        with ``sort``, the 1-based position of the chapter in the list.
        Missing optional fields become an empty string instead of aborting
        the whole book.
        """
        print('book_url')
        print(response.url)
        source = response.meta['source']
        # Book title (only used for the log line at the end).
        book_name = self._first(response, '//div[@id="info"]/h1/text()').encode('utf-8')
        author = self._first(response, '//div[@id="info"]/p[1]/text()').encode('utf-8')
        # Latest chapter title and its update date.
        last_chapter_title = self._first(response, '//div[@id="info"]/p[4]/a/text()').encode('utf-8')
        last_chapter_date = self._first(response, '//div[@id="info"]/p[3]/text()').encode('utf-8')
        # Book introduction, kept as the list of text nodes.
        book_intro = response.xpath('//div[@id="intro"]/p[1]/text()').extract()
        # Chapter URLs: every <dd> link following the second <dt> of the list.
        book_chapter_urls = response.xpath('//div[@id="list"]//dt[2]//following::dd//a/@href').extract()
        # Cover image URL.
        book_cover = self._first(response, '//*[@id="fmimg"]/img/@src')

        shared_meta = {
            'book_cover': book_cover,
            'source': source,
            'author': author,
            'last_chapter_date': last_chapter_date,
            'last_chapter_title': last_chapter_title,
            'book_intro': book_intro,
        }
        for chapter_sort, book_chapter_url in enumerate(book_chapter_urls, 1):
            # Each request gets its own meta dict.
            yield Request(
                response.urljoin(book_chapter_url),
                meta=dict(shared_meta, sort=chapter_sort),
                callback=self.parse_chapter,
            )
        print('book:' + book_name + ' over')

    def parse_chapter(self, response):
        """Build the chapter record that is handed to the item pipeline.

        Returns a plain dict (no Item class is used) combining the metadata
        passed down from ``parse_book`` with the category, book name and
        chapter title read from the chapter page. The chapter text itself is
        not stored; the record carries ``chapter_url`` instead.

        Raises IndexError when the category, book name or chapter title is
        missing from the page: these fields identify the record, so such a
        page is left to fail rather than being stored with blanks.
        """
        url = response.url
        print('chapter_url')
        print(url)
        meta = response.meta
        # Chapter position passed down from parse_book.
        chapter_sort = meta['sort']
        print(chapter_sort)
        # Category and book name come from the second and third links under
        # div.con_top.
        cate = response.xpath('//div[@class="con_top"]//a[2]/text()').extract()[0].encode('utf-8')
        book_name = response.xpath('//div[@class="con_top"]//a[3]/text()').extract()[0].encode('utf-8')
        chapter_title = response.xpath('//div[@class="bookname"]/h1/text()').extract()[0].encode('utf-8')
        return {
            'book_cover': meta['book_cover'],
            'cate': cate,
            'book_name': book_name,
            'chapter_title': chapter_title,
            'chapter_sort': chapter_sort,
            'chapter_url': url,
            'author': meta['author'],
            'last_chapter_date': meta['last_chapter_date'],
            'source': meta['source'],
            'last_chapter_title': meta['last_chapter_title'],
            'book_intro': meta['book_intro'],
        }
mongo_novel.py:负责将数据存入 MongoDB 的 py 文件
from mininova.mongodb import Mongo
from mininova.settings import mongo_setting
class MongoCar():
    """Store car records in MongoDB as brand -> brand item -> car documents.

    All three collections live in the ``car`` database. Lookup methods return
    the matching document, or ``False`` when there is none; insert methods
    return the document read back after insertion.
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        """Open the Mongo wrapper using the connection values in ``mongo_setting``."""
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password'],
        )

    def _find(self, db, set_name, where):
        """Return the first document in *set_name* matching *where*, else False."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert(self, db, set_name, doc):
        """Insert *doc* into *set_name* and return it as read back from the set."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def insert(self, item):
        """Upsert the brand, brand item and car described by *item*.

        Reads the keys ``brand``, ``first_word``, ``brand_item``, ``name``,
        ``url``, ``max_price`` and ``min_price``; ``brand_logo_url`` is read
        only when the brand already exists. Returns True unless the car
        lookup/insert result is ``False``.
        """
        brand_where = {'name': item['brand']}
        brand = self.brand_exist(self.db, brand_where)
        if brand is False:
            brand = self.insert_brand(
                self.db,
                {'name': item['brand'], 'first_word': item['first_word']},
            )
            print('brand insert ok!')
        else:
            # An existing brand is refreshed and gains its logo URL.
            brand = self.update_brand(
                self.db,
                brand_where,
                {
                    'name': item['brand'],
                    'first_word': item['first_word'],
                    'logo_url': item['brand_logo_url'],
                },
            )
            print('brand_exist!')

        brand_item = self.brand_item_exist(self.db, {'name': item['brand_item']})
        if brand_item is False:
            brand_item = self.insert_brand_item(
                self.db,
                {
                    'name': item['brand_item'],
                    'first_word': item['first_word'],
                    'brand_id': brand['_id'],
                },
            )
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        # NOTE(review): this filter used to be written with two 'name' keys
        # (item['brand_item'] and item['name']); the second silently replaced
        # the first, so only the car name was ever matched. That effective
        # behaviour is kept. If cars were meant to be unique per brand item,
        # add 'brand_item_id': brand_item['_id'] here -- confirm intent.
        car_where = {'name': item['name']}
        car = self.car_exist(self.db, car_where)
        if car is False:
            car = self.insert_car(
                self.db,
                {
                    'name': item['name'],
                    'url': item['url'],
                    'max_price': item['max_price'],
                    'min_price': item['min_price'],
                    'first_word': item['first_word'],
                    'brand_id': brand['_id'],
                    'brand_item_id': brand_item['_id'],
                },
            )
            print('car insert ok!')
        else:
            print('car_exist!')
        return car is not False

    def update_brand(self, db, brand_where, brand):
        """Apply *brand* as a ``$set`` to the brand matching *brand_where*.

        Returns the brand document after the update, or False if none matches.
        """
        db.set(self.db_name, self.brand_set_name).update_one(brand_where, {'$set': brand})
        return self._find(db, self.brand_set_name, brand_where)

    def brand_exist(self, db, brand):
        """Return the brand document matching the filter *brand*, else False."""
        return self._find(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert *brand* and return the stored brand document."""
        return self._insert(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the brand-item document matching the filter *brand_item*, else False."""
        return self._find(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert *brand_item* and return the stored brand-item document."""
        return self._insert(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching the filter *car*, else False."""
        return self._find(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert *car* and return the stored car document."""
        return self._insert(db, self.car_set_name, car)
pipelines.py:item pipeline 文件
from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
import copy
class HkxsPipeline(object):
    """Item pipeline for http://www.hkxs.net: stores each chapter item via MongoNovel."""

    # MongoNovel instance, created on first use and then reused for every
    # item instead of being rebuilt once per item.
    _mongo_novel = None

    def process_item(self, item, spider):
        """Store *item* and pass it on.

        Args:
            item: chapter record; ``book_name`` and ``chapter_title`` are
                printed after the insert.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so that any pipeline running after this one
            still receives it.
        """
        if self._mongo_novel is None:
            self._mongo_novel = MongoNovel()
        self._mongo_novel.insert(item)
        print(item['book_name'])
        print(item['chapter_title'])
        print('item insert ok!')
        return item
settings.py 中相关的设置
# MongoDB connection settings: host, port and login credentials.
# NOTE(review): the values below look like placeholders -- replace them with
# the real server address and account before running.
mongo_setting = dict(
    mongo_host='xxx.xxx.xxx.xxx',
    mongo_port=27017,
    mongo_user='username',
    mongo_password='password',
)