In this project, we use Spark to perform both offline and online analysis of novels from the Aliwenxue (阿里文学) website. The technologies involved are a Scrapy crawler + Kafka + MongoDB + Spark Streaming + Spark MLlib + Spark SQL. Below we walk through the concrete code.
1. Acquiring the data
In this project we use novels of various categories on Aliwenxue as the data source. We first crawl the site with Scrapy, extracting four fields per novel: title, author, category, and synopsis. On the site these fields appear as shown in the figure below:
We need to pick the novel category up front, because the site's catalogue is split into many fine-grained sub-categories, which would make labeling the data later more difficult.
Next we crawl the data with Scrapy. If you are not familiar with Scrapy, refer to an introductory Scrapy tutorial.
First, define the item class that encapsulates the data: items.py
import scrapy

class NovelItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # link = scrapy.Field()  # URL
    category = scrapy.Field()   # novel category
    bookname = scrapy.Field()   # novel title
    author = scrapy.Field()     # author
    content = scrapy.Field()    # synopsis
The item pipelines: pipelines.py
import pymongo
from kafka import SimpleClient, SimpleProducer  # legacy kafka-python (<2.0) API
from scrapy.utils.serialize import ScrapyJSONEncoder

class NovelPipeline(object):
    """Append each record as a space-separated line to a local text file."""
    def process_item(self, item, spider):
        # Skip items whose synopsis could not be extracted
        if not item['content']:
            return item
        fields = [
            item['category'],
            item['author'],
            # item['link'],
            item['bookname'],
            item['content'][0].strip(),
        ]
        with open("E://xuanhuan.txt", 'a', encoding='utf-8') as fp:
            fp.write(" ".join(fields) + '\n')
        print("written to file")
        return item
class MongoDBPipeline(object):
    collection_name = 'aliwenxue'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.collection = self.db[self.collection_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Only store items whose synopsis was extracted successfully
        if not item['content']:
            return item
        item['content'] = item['content'][0].strip()
        self.collection.insert_one(dict(item))
        print("inserted into MongoDB")
        return item
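After a crawl it is worth checking that the records actually landed in MongoDB. The snippet below is only a quick verification sketch, assuming the MONGO_URI, database and collection configured later in settings.py and pymongo 3.7+ (for count_documents):
import pymongo

# Connect with the same URI / database / collection used by MongoDBPipeline
client = pymongo.MongoClient("192.168.177.13")
collection = client["novels"]["aliwenxue"]

print("stored novels:", collection.count_documents({}))  # total number of crawled records
print("sample record:", collection.find_one())           # inspect one document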
class KafkaPipeline(object):
    # Initialise the producer and the target topic
    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    # Encode each record as JSON and send it to Kafka
    def process_item(self, item, spider):
        # Only forward items whose synopsis was extracted successfully
        if not item['content']:
            return item
        payload = dict(item)
        payload['content'] = item['content'][0].strip()
        payload['spider'] = spider.name
        msg = self.encoder.encode(payload).encode('utf-8')
        self.producer.send_messages(self.topic, msg)
        return item

    # Read the broker list and topic from settings, then create the client and producer
    @classmethod
    def from_settings(cls, settings):
        k_hosts = settings.get('KAFKA_HOSTS',
                               ['192.168.177.11:9092', '192.168.177.12:9092', '192.168.177.13:9092'])
        topic = settings.get('KAFKA_TOPIC', 'novel')
        client = SimpleClient(k_hosts)       # legacy kafka-python (<2.0) client
        producer = SimpleProducer(client)
        return cls(producer, topic)
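To confirm that messages are reaching Kafka (and will later be available to Spark Streaming), a simple console consumer built with kafka-python's KafkaConsumer can be used. This is only an illustrative check, assuming the broker list and the novel topic configured above:
import json
from kafka import KafkaConsumer

# Read the 'novel' topic from the beginning and print each JSON record
consumer = KafkaConsumer(
    'novel',
    bootstrap_servers=['192.168.177.11:9092', '192.168.177.12:9092', '192.168.177.13:9092'],
    auto_offset_reset='earliest',
    value_deserializer=lambda m: json.loads(m.decode('utf-8')),
    consumer_timeout_ms=10000,   # stop after 10 s of silence
)
for message in consumer:
    print(message.value)         # the dict produced by KafkaPipeline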
The spider class:
import scrapy
from novel.items import NovelItem

# Category list and the number of listing pages per category; i selects the category to crawl in this run
categorys = ["都市小说", "玄幻小说", "仙侠小说", "灵异推理", "历史架空",
             "游戏竞技", "科幻小说", "武侠小说", "古代言情"]
pages = [32, 32, 32, 32, 32, 11, 32, 27, 32]
i = 8

class SolveSpider(scrapy.Spider):
    name = "solve"
    allowed_domains = ["aliwx.com.cn"]
    # One listing URL per page of the selected category
    start_urls = [
        "https://www.aliwx.com.cn/store?sz=0&fc=" + categorys[i]
        + "&wd=0&tm=0&st=0&&page=" + str(j)
        for j in range(1, pages[i] + 1)
    ]

    def parse(self, response):
        novels = response.xpath('//ul[@class="store-ul clear"]/li')
        for each_novel in novels:
            item = NovelItem()
            item['bookname'] = each_novel.xpath('./a/h3/text()').extract()[0]
            item['content'] = each_novel.xpath('./a/p/text()').extract()
            item['author'] = each_novel.xpath('./p/a/span/text()').extract()[0]
            item['category'] = categorys[i]
            yield item
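With items.py, pipelines.py and the spider in place, the crawl can be started from the project root with the standard scrapy crawl solve command (solve being the spider name defined above); the pipelines enabled in settings.py then decide where each item is written.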
Configuration file: settings.py
BOT_NAME = 'novel'
SPIDER_MODULES = ['novel.spiders']
NEWSPIDER_MODULE = 'novel.spiders'
ITEM_PIPELINES = {'novel.pipelines.MongoDBPipeline':100}
MONGO_URI = "192.168.177.13"
MONGO_DB = "novels"
MONGO_COLLECTION = "aliwenxue"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'novel (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Do not verify SSL certificates
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
}
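Note that only the MongoDB pipeline is enabled in ITEM_PIPELINES above. If the text-file and Kafka pipelines should also run, all three can be listed with their priorities, and the keys read by KafkaPipeline.from_settings need to be defined as well. A possible sketch (the priorities and broker addresses below are only examples):
ITEM_PIPELINES = {
    'novel.pipelines.NovelPipeline': 100,     # write to the local text file
    'novel.pipelines.MongoDBPipeline': 200,   # store in MongoDB
    'novel.pipelines.KafkaPipeline': 300,     # publish to Kafka for Spark Streaming
}
# Settings read by KafkaPipeline.from_settings
KAFKA_HOSTS = ['192.168.177.11:9092', '192.168.177.12:9092', '192.168.177.13:9092']
KAFKA_TOPIC = 'novel'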