首先展现最终实现的效果:
首先是建立scrapy项目:
scrapy startproject novelcrawl #我的项目名为novelcrawl
用pycharm打开项目:
这是我的items.py文件:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NovelcrawlItem(scrapy.Item):
    """Container for one scraped novel record from hongxiu.com."""

    # Book metadata scraped from the listing page.
    author = scrapy.Field()   # author display name
    title = scrapy.Field()    # book title (taken from the cover <img alt=...>)
    kind = scrapy.Field()     # category / genre label
    status = scrapy.Field()   # serialization status (e.g. ongoing / finished)
    count = scrapy.Field()    # word/chapter count text
NovelCrawls.py文件:
# encoding:utf-8
from scrapy.spiders import CrawlSpider
import scrapy
import re
from bs4 import BeautifulSoup
from NovelCrawl.items import NovelcrawlItem
class NovelCrawl(CrawlSpider):
    """Spider that walks the first 20 listing pages of hongxiu.com and
    extracts title/author/kind/status/count tags from each page."""

    name = "novelcrawl"
    # FIX: allowed_domains must hold bare domain names, not URLs with a
    # scheme -- 'https://www.hongxiu.com' makes the offsite middleware
    # drop every request.
    allowed_domains = ['www.hongxiu.com']

    def start_requests(self):
        # Build the 20 paginated listing URLs directly instead of the
        # original space-riddled string + .replace(' ', '') workaround.
        url_tpl = ("https://www.hongxiu.com/all?pageSize=10&gender=1"
                   "&catId=-1&isFinish=-1&isVip=-1&size=-1&updT=-1"
                   "&orderBy=0&pageNum=%d")
        for page in range(1, 21):
            yield scrapy.Request(url_tpl % page)

    def parse(self, response):
        print("start crawling")  # parenthesized: valid on Python 2 and 3
        item = NovelcrawlItem()
        soup = BeautifulSoup(response.body, 'html.parser',
                             from_encoding='utf-8')
        # Each field holds the full list of matching tags for this page;
        # the pipeline zips them back into per-book rows.
        # Raw string for the regex so \d is not a (deprecated) escape.
        item['title'] = soup.find_all(
            'img', src=re.compile(r'//qidian.qpic.cn/qdbimg/\d+/c_\d+/\d+'))
        item['author'] = soup.find_all('a', class_='default')
        item['kind'] = soup.find_all('span', class_='org')
        item['status'] = soup.find_all('span', class_='pink')
        item['count'] = soup.find_all('span', class_='blue')
        yield item
关于上面的BeautifulSoup为什么这么写,请参考我的这篇 实战爬虫-爬取红袖添香并存入数据库
下面是我的pipelines.py文件(用来处理爬取到的数据):
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class NovelcrawlPipeline(object):
    """Writes each scraped item out as tab-separated rows in novel.csv."""

    def __init__(self):
        # One output file for the whole crawl; closed in close_spider.
        self.file = open('novel.csv', 'w+')

    def process_item(self, item, spider):
        # The spider stores parallel lists of tags; zip them back into
        # one (title, author, status, count, kind) tuple per book.
        rows = zip(item['title'], item['author'], item['status'],
                   item['count'], item['kind'])
        for title, author, status, count, kind in rows:
            # title is an <img> tag: the book name lives in its alt attr;
            # the rest are text-bearing tags. Encoded to UTF-8 bytes as
            # in the original (Python 2 file semantics).
            fields = [title['alt'].encode('utf-8'),
                      author.get_text().encode('utf-8'),
                      status.get_text().encode('utf-8'),
                      count.get_text().encode('utf-8'),
                      kind.get_text().encode('utf-8')]
            self.file.write('\t'.join(fields) + '\n')
        return item

    def close_spider(self, spider):
        # FIX: the original never closed the file, so buffered rows could
        # be lost when the crawl ended.
        self.file.close()
取消settings.py文件中对管道文件的注释:
# Enable the pipeline; the value (0-1000, lower runs first) sets its order.
ITEM_PIPELINES = {
'NovelCrawl.pipelines.NovelcrawlPipeline': 300,
}
最终得到一个.csv文件,打开得到最上面的结果。