基于Spark Streaming预测股票走势的例子（一）-CSDN博客

　　最近学习Spark Streaming，不知道是不是我搜索的姿势不对，总找不到具体的、完整的例子，一怒之下就决定自己写一个出来。下面以预测股票走势为例，总结了用Spark Streaming开发的具体步骤以及方法。

　　一、数据源。

　　既然预测股票走势，当然要从网上找一下股票数据的接口，具体可以参考 http://blog.sina.com.cn/s/blog_540f22560100ba2k.html、http://apistore.baidu.com/apiworks/servicedetail/115.html 。下面简单分析一下各种数据接口的优劣以抛砖引玉：

　　1、Sina股票数据接口。以字符串数据的形式返回，简单易用且直观。

　　2、百度数据接口。以API集市形式提供json形式的数据，比较规范，但使用起来比较繁琐。

　　简单起见，作者使用新浪的数据接口。

　　二、测试数据源

　　有了股票的数据接口，以下代码提供简单的测试，以解析返回的数据。

/**
  * Created by gabry.wu on 2016/2/18.
  */
package com.gabry.stock

import scala.io.Source
/**
  * One real-time quote as returned by the Sina stock interface (http://hq.sinajs.cn).
  *
  * A raw response line looks like
  * `var hq_str_sh601006="NAME,27.55,27.25,...,2008-01-11,15:05:32";`
  * i.e. a header carrying the stock code followed by a double-quoted, comma-separated payload.
  *
  * The class is deliberately simple: every field is a public mutable `var` that starts at an
  * empty/zero default and is filled in by the `this(stockInfo)` auxiliary constructor.
  * A more general design is left for a later refactoring.
  */
class SinaStock {
  var code: String = ""            // stock code, e.g. "sh601006"
  var name: String = ""            // stock name
  var curOpenPrice: Float = 0      // today's opening price, e.g. 27.55
  var lstOpenPrice: Float = 0      // yesterday's closing price, e.g. 27.25 (despite the field name)
  var curPrice: Float = 0          // current price, e.g. 26.91
  var highestPrice: Float = 0      // today's highest price
  var lowestPrice: Float = 0       // today's lowest price
  var bidBuyPrice: Float = 0       // best bid ("buy 1") price
  var bidSalePrice: Float = 0      // best ask ("sell 1") price
  var dealNum: Long = 0            // traded volume in shares; usually divided by 100 to get lots
  var dealAmount: Float = 0        // traded amount in yuan; usually divided by 10000 for display
  var bidBuy1Num: Long = 0         // shares requested at "buy 1", e.g. 4695 shares (47 lots)
  var bidBuy1Amount: Float = 0     // "buy 1" price
  var bidBuy2Num: Long = 0
  var bidBuy2Amount: Float = 0
  var bidBuy3Num: Long = 0
  var bidBuy3Amount: Float = 0
  var bidBuy4Num: Long = 0
  var bidBuy4Amount: Float = 0
  var bidBuy5Num: Long = 0
  var bidBuy5Amount: Float = 0
  var bidSale1Num: Long = 0        // shares offered at "sell 1", e.g. 3100 shares (31 lots)
  var bidSale1Amount: Float = 0    // "sell 1" price
  var bidSale2Num: Long = 0
  var bidSale2Amount: Float = 0
  var bidSale3Num: Long = 0
  var bidSale3Amount: Float = 0
  var bidSale4Num: Long = 0
  var bidSale4Amount: Float = 0
  var bidSale5Num: Long = 0
  var bidSale5Amount: Float = 0
  var date: String = ""            // quote date, e.g. "2008-01-11"
  var time: String = ""            // quote time, e.g. "15:05:32"

  // Raw line this quote was parsed from; stays empty for the no-argument constructor.
  private var stockInfo: String = ""

  /** Returns the raw line that was handed to the parsing constructor. */
  def getStockInfo: String = stockInfo

  /** Renders every field as `label[value]`, separated by commas; meant for debugging only. */
  def toDebugString: String = {
    val pattern =
      "code[%s],name[%s],curOpenPrice [%f],lstOpenPrice[%f],curPrice [%f],highestPrice  [%f],lowestPrice [%f]," +
        "bidBuyPrice[%f],bidSalePrice[%f],dealNum [%d],dealAmount  [%f]," +
        // A duplicated comma used to follow bidBuy1Amount here; it has been removed.
        "bidBuy1Num [%d],bidBuy1Amount [%f],bidBuy2Num [%d],bidBuy2Amount [%f],bidBuy3Num [%d],bidBuy3Amount [%f]," +
        "bidBuy4Num [%d],bidBuy4Amount [%f],bidBuy5Num [%d],bidBuy5Amount [%f]," +
        "bidSale1Num [%d],bidSale1Amount [%f],bidSale2Num [%d],bidSale2Amount [%f],bidSale3Num [%d],bidSale3Amount [%f]," +
        "bidSale4Num [%d],bidSale4Amount [%f],bidSale5Num [%d],bidSale5Amount [%f],date [%s],time [%s]"
    pattern.format(
      code, name, curOpenPrice, lstOpenPrice, curPrice, highestPrice, lowestPrice,
      bidBuyPrice, bidSalePrice, dealNum, dealAmount,
      bidBuy1Num, bidBuy1Amount, bidBuy2Num, bidBuy2Amount, bidBuy3Num, bidBuy3Amount,
      bidBuy4Num, bidBuy4Amount, bidBuy5Num, bidBuy5Amount,
      bidSale1Num, bidSale1Amount, bidSale2Num, bidSale2Amount, bidSale3Num, bidSale3Amount,
      bidSale4Num, bidSale4Amount, bidSale5Num, bidSale5Amount,
      date, time)
  }

  /** Renders all fields, in declaration order, as one comma-separated line. */
  override def toString: String =
    Seq[Any](
      code, name, curOpenPrice, lstOpenPrice, curPrice, highestPrice, lowestPrice,
      bidBuyPrice, bidSalePrice, dealNum, dealAmount,
      bidBuy1Num, bidBuy1Amount, bidBuy2Num, bidBuy2Amount, bidBuy3Num, bidBuy3Amount,
      bidBuy4Num, bidBuy4Amount, bidBuy5Num, bidBuy5Amount,
      bidSale1Num, bidSale1Amount, bidSale2Num, bidSale2Amount, bidSale3Num, bidSale3Amount,
      bidSale4Num, bidSale4Amount, bidSale5Num, bidSale5Amount,
      date, time
    ).mkString(",")

  /**
    * Parses one raw response line of the Sina quote interface.
    *
    * The header (everything before the first double quote) and the quoted payload are split
    * separately, so a stock name that itself contains a space or an underscore no longer
    * shifts the remaining fields. When the line does not carry a complete quote (no quoted
    * payload, no stock code in the header, or fewer than 32 payload fields) all fields keep
    * their defaults, which lets callers detect the case through an empty `name`.
    *
    * @param stockInfo raw line, e.g. `var hq_str_sh601006="NAME,27.55,...,2008-01-11,15:05:32";`
    * @throws NumberFormatException if a numeric payload field cannot be parsed
    */
  def this(stockInfo: String) = {
    this()
    this.stockInfo = stockInfo
    // quoted(0) is the header `var hq_str_<code>=`, quoted(1) the comma-separated payload.
    val quoted = stockInfo.split('"')
    if (quoted.length >= 2) {
      // Header tokens are "var", "hq", "str", "<code>".
      val header = quoted(0).split(Array(' ', '_', '='))
      val fields = quoted(1).split(',')
      // A full quote has 32 payload fields: name, 7 prices, volume, amount,
      // 5 bid levels, 5 ask levels (number + price each), date and time.
      if (header.length > 3 && fields.length >= 32) {
        this.code = header(3)
        this.name = fields(0)
        this.curOpenPrice = fields(1).toFloat
        this.lstOpenPrice = fields(2).toFloat
        this.curPrice = fields(3).toFloat
        this.highestPrice = fields(4).toFloat
        this.lowestPrice = fields(5).toFloat
        this.bidBuyPrice = fields(6).toFloat
        this.bidSalePrice = fields(7).toFloat
        this.dealNum = fields(8).toLong
        this.dealAmount = fields(9).toFloat
        this.bidBuy1Num = fields(10).toLong
        this.bidBuy1Amount = fields(11).toFloat
        this.bidBuy2Num = fields(12).toLong
        this.bidBuy2Amount = fields(13).toFloat
        this.bidBuy3Num = fields(14).toLong
        this.bidBuy3Amount = fields(15).toFloat
        this.bidBuy4Num = fields(16).toLong
        this.bidBuy4Amount = fields(17).toFloat
        this.bidBuy5Num = fields(18).toLong
        this.bidBuy5Amount = fields(19).toFloat
        this.bidSale1Num = fields(20).toLong
        this.bidSale1Amount = fields(21).toFloat
        this.bidSale2Num = fields(22).toLong
        this.bidSale2Amount = fields(23).toFloat
        this.bidSale3Num = fields(24).toLong
        this.bidSale3Amount = fields(25).toFloat
        this.bidSale4Num = fields(26).toLong
        this.bidSale4Amount = fields(27).toFloat
        this.bidSale5Num = fields(28).toLong
        this.bidSale5Amount = fields(29).toFloat
        this.date = fields(30)
        this.time = fields(31)
      }
    }
  }
}
/** Companion of the SinaStock class; its `apply` lets callers write `SinaStock(line)` instead of `new SinaStock(line)`. */
object SinaStock {
  /** Builds a quote from one raw line by delegating to the class's parsing constructor. */
  def apply(stockInfo: String): SinaStock = new SinaStock(stockInfo)
}
/**
  * Manual smoke test for the Sina quote interface: downloads the quotes of two stocks,
  * parses every response line into a `SinaStock` and prints the parsed result.
  */
object StockRetrivor {
  /**
    * Queries sh601006 and sh601007 and prints one comma-separated line per stock.
    *
    * @param args ignored
    */
  def main(args: Array[String]): Unit = {
    println("查询新浪股票（每小时更新） http://hq.sinajs.cn/list=sh601006,sh601007")
    // The interface answers in GBK, hence the explicit encoding.
    val sinaStockStream = Source.fromURL("http://hq.sinajs.cn/list=sh601006,sh601007", "gbk")
    try {
      // Parse each response line into a SinaStock and print the corresponding quote.
      sinaStockStream.getLines().foreach { line =>
        println(SinaStock(line).toString)
      }
    } finally {
      // Close the connection even when parsing or printing fails.
      sinaStockStream.close()
    }
  }
}

　　三、Spark Streaming编程

　　数据接口调试完毕，股票数据也解析好了，下面就开始Streaming。Spark Streaming一定会涉及数据源，且该数据源是一个主动推送的过程，即spark被动接收该数据源的数据进行分析。但Sina的接口是一个很简单的HttpResponse，无法主动推送数据，所以我们需要实现一个Custom Receiver，可参考 http://spark.apache.org/docs/latest/streaming-custom-receivers.html

　　下面是具体的代码，其实定制化一个Receiver简单来说就是实现onStart/onStop。onStart用来初始化资源，给获取数据做准备，获取到的数据用store发送给SparkStreaming即可；onStop用来释放资源

package com.gabry.stock

import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

import scala.io.Source

/**
  * Created by gabry.wu on 2016/2/19.
  *
  * Custom Spark Streaming receiver that polls the Sina quote interface over HTTP and hands
  * every response line to Spark through `store`. For simplicity only Sina quotes are
  * fetched; a more general design is left for a later refactoring.
  *
  * @param pollIntervalMs pause in milliseconds between two polls; the default `0` polls
  *                       back-to-back without any delay
  */
class SinaStockReceiver(pollIntervalMs: Long = 0L)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  /** Starts a background thread that queries the quotes and forwards them to Spark Streaming. */
  def onStart(): Unit = {
    new Thread("Socket Receiver") {
      override def run(): Unit = receive()
    }.start()
  }

  /** Nothing to release: the polling thread ends by itself once `isStopped` returns true. */
  def onStop(): Unit = {
  }

  /**
    * Polling loop executed on the background thread: fetches the quote of sh601001, stores
    * every response line, and repeats until the receiver is stopped. Any failure asks Spark
    * to restart the receiver.
    */
  private def receive(): Unit = {
    try {
      while (!isStopped) {
        // Only a single stock (sh601001) is polled.
        val stockCode = 601001
        val url = "http://hq.sinajs.cn/list=sh%d".format(stockCode)
        logInfo(url)
        val sinaStockStream = Source.fromURL(url, "gbk")
        try {
          for (line <- sinaStockStream.getLines()) {
            logInfo(line)
            store(line)
          }
        } finally {
          // Release the connection even if reading or store() throws.
          sinaStockStream.close()
        }
        if (pollIntervalMs > 0) Thread.sleep(pollIntervalMs)
      }
      // The loop is left only after the receiver has been stopped, so no restart is
      // requested here; restarts are reserved for the failure paths below.
      logInfo("Stopped receiving")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to", e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}

　　Receiver搞定之后就可以开始编写股票预测的main函数了，贴代码之前说明一下，股票预测的方法之一，就是统计一段时间内股票上涨的次数，并展示上涨次数TopN的股票信息，但本文一切从简，并没有实现全部的功能，只是统计了股票上涨的次数，也就是对上涨与否进行WordCount。

/**
  * Created by gabry.wu on 2016/2/19.
  */
package com.gabry.stock

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Streaming job that tracks, per stock name, whether the price went up or down between
  * batches and sums those movements over a sliding window.
  */
object StockTrend {
  /**
    * State-update function for `updateStateByKey`.
    *
    * The state of a key is `(last seen price, trend)`, where trend is 1 when the price rose
    * compared with the previous state, -1 when it fell and 0 when it is unchanged.
    *
    * @param newValue `(price, placeholder)` pairs that arrived for the key in the current batch;
    *                 only the first pair is used and its second component is ignored
    * @param preValue state computed for the key in the previous batch, if any
    * @return the new state; when the batch brought no quote the previous price is kept and the
    *         trend is reported as 0 (unchanged), so a stale movement is not counted again by
    *         downstream aggregations; `None` when there is neither a new value nor a state
    */
  def updatePriceTrend(newValue: Seq[(Float, Int)], preValue: Option[(Float, Int)]): Option[(Float, Int)] =
    newValue.headOption match {
      case Some((price, _)) =>
        // With no previous state the price is compared with itself, giving a trend of 0.
        val previousPrice = preValue.map(_._1).getOrElse(price)
        // signum yields exactly -1, 0 or 1.
        Some((price, math.signum(price - previousPrice).toInt))
      case None =>
        preValue.map { case (price, _) => (price, 0) }
    }

  /**
    * Wires the pipeline: custom receiver -> parsed quotes -> per-stock trend state ->
    * windowed sum of the trend values, then runs until the streaming context terminates.
    *
    * @param args ignored
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("CustomReceiver").setMaster("local[4]")
    val ssc = new StreamingContext(sparkConf, Seconds(1))
    Logger.getRootLogger.setLevel(Level.WARN)
    ssc.checkpoint("./tmp")

    // Input stream backed by the custom receiver; every element is one line passed to store().
    val lines = ssc.receiverStream(new SinaStockReceiver())
    // Turn every raw line into a SinaStock object.
    val stocks = lines.map(SinaStock(_))

    import scala.util.Random

    // Combines two entries of the window: keeps the right-hand price and adds up the trends.
    // The println traces the operands of every reduce step.
    def reduceFunc(left: (Float, Int), right: (Float, Int)): (Float, Int) = {
      println(s"left ${left}right ${right}")
      (right._1, left._2 + right._2)
    }

    // A quote only carries the current price, so the previous price has to be remembered in
    // order to tell whether the stock rose; updateStateByKey keeps that state per stock name.
    // Random noise is added to the price for testing, because real prices stop changing
    // after the market closes at 3 pm. Lines that did not parse (empty name) are dropped.
    val stockState = stocks
      .map(sinaStock => (sinaStock.name, (sinaStock.curPrice + Random.nextFloat, -1)))
      .filter(stock => stock._1.nonEmpty)
      .updateStateByKey(updatePriceTrend)

    // Every 3 seconds, sum the trend values of the last 6 seconds.
    val stockTrend = stockState.reduceByKeyAndWindow(reduceFunc(_, _), Seconds(6), Seconds(3))
    // Variant that only accumulates non-negative movements:
    // val stockPosTrend = stockState.filter(x => x._2._2 >= 0).reduceByKeyAndWindow(reduceFunc(_, _), Seconds(6), Seconds(3))

    stockState.print()
    stockTrend.print()
    // stockPosTrend.print()
    ssc.start()
    ssc.awaitTermination()
    println("StockTrend")
  }
}

　　四、运行结果分析

　　下面是某次运行的打印结果，对其进行简单的分析。

　　由于ssc的时间间隔为1，所以每秒都会查询大同煤业的股票数据，这就是下面每个Time打印的第一行数据（因为stockState先进行print，所以每次查询的股票数据是第一行）；又因为slide设置为3，所以每隔3秒会进行reduceFunc计算，该函数处理windowsize个RDD（此处设置为6），对这6个RDD按照时间先后顺序进行reduce。

　　需要特别说明的是spark的reduce默认从左到右进行fold（折叠），从最左边取两个数进行reduce计算产生临时结果，再与后面的数据进行reduce，以此类推进行计算，其实就是foldLeft。

　　下面标红色的数据，其实就是对(5.387682,0),(5.9087195,1),(5.7605586,-1),(5.278526,-1),(5.4471517,1),(5.749305,1)进行reduce的过程。

-------------------------------------------

Time: 1455888254000 ms

-------------------------------------------

(大同煤业,(5.387682,0))

-------------------------------------------

Time: 1455888255000 ms

-------------------------------------------

(大同煤业,(5.9087195,1))

-------------------------------------------

Time: 1455888256000 ms

-------------------------------------------

(大同煤业,(5.7605586,-1))

left (5.387682,0)right (5.9087195,1)

left (5.9087195,1)right (5.7605586,-1)

-------------------------------------------

Time: 1455888256000 ms

-------------------------------------------

(大同煤业,(5.7605586,0))

-------------------------------------------

Time: 1455888257000 ms

-------------------------------------------

(大同煤业,(5.278526,-1))

-------------------------------------------

Time: 1455888258000 ms

-------------------------------------------

(大同煤业,(5.4471517,1))

-------------------------------------------

Time: 1455888259000 ms

-------------------------------------------

(大同煤业,(5.749305,1))

left (5.387682,0)right (5.9087195,1)

left (5.9087195,1)right (5.7605586,-1)

left (5.7605586,0)right (5.278526,-1)

left (5.278526,-1)right (5.4471517,1)

left (5.4471517,0)right (5.749305,1)

-------------------------------------------

Time: 1455888259000 ms

-------------------------------------------

(大同煤业,(5.749305,1))

-------------------------------------------

Time: 1455888260000 ms

-------------------------------------------

(大同煤业,(5.749305,1))

-------------------------------------------

Time: 1455888261000 ms

-------------------------------------------

(大同煤业,(5.748391,-1))

-------------------------------------------

Time: 1455888262000 ms

-------------------------------------------

(大同煤业,(5.395269,-1))

left (5.278526,-1)right (5.4471517,1)

left (5.4471517,0)right (5.749305,1)

left (5.749305,1)right (5.749305,1)

left (5.749305,2)right (5.748391,-1)

left (5.748391,1)right (5.395269,-1)

-------------------------------------------

Time: 1455888262000 ms

-------------------------------------------

(大同煤业,(5.395269,0))

-------------------------------------------

Time: 1455888263000 ms

-------------------------------------------

(大同煤业,(5.5215807,1))

-------------------------------------------

Time: 1455888264000 ms

-------------------------------------------

(大同煤业,(5.945005,1))

-------------------------------------------

Time: 1455888265000 ms

-------------------------------------------

(大同煤业,(5.2400274,-1))

left (5.749305,1)right (5.748391,-1)

left (5.748391,0)right (5.395269,-1)

left (5.395269,-1)right (5.5215807,1)

left (5.5215807,0)right (5.945005,1)

left (5.945005,1)right (5.2400274,-1)

-------------------------------------------

Time: 1455888265000 ms

-------------------------------------------

(大同煤业,(5.2400274,0))

-------------------------------------------

Time: 1455888266000 ms

-------------------------------------------

(大同煤业,(5.1895638,-1))

-------------------------------------------

Time: 1455888267000 ms

-------------------------------------------

(大同煤业,(5.1885605,-1))

-------------------------------------------

Time: 1455888268000 ms

-------------------------------------------

(大同煤业,(5.9881735,1))

Process finished with exit code -1

　　五、总结

　　本文以股票预测为例简单描述了SparkStreaming编程的步骤及其注意点，希望抛砖引玉，也算弥补了网上没有完整例子的遗憾。但由于作者重代码、轻描述，估计会有一些不易理解的地方，还望各位读者留言讨论。最后附上源码的git地址：http://git.oschina.net/gabry_wu/BigDataPractice

PS：未经允许，禁止转载，否则将追究法律责任！