I. Experiment Environment
Hadoop 2.7
Spark 3.0.0
PyCharm
II. Experiment Content
In this experiment the stock data is first filtered, then a socket source sends the prepared records to the client. Structured Streaming computes running averages of the stock price to obtain MA10 and MA40, and the two moving averages are compared to decide whether to buy or sell.
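As a quick preview, the same MA10/MA40 crossover rule can be checked with plain batch pandas. This is only a minimal sketch, assuming the 'Long Date' and 'Close' columns of the dj30.csv file used later; note that rolling() here counts rows, whereas the streaming jobs below use calendar-day windows.
# Batch preview of the crossover rule; not the streaming implementation.
import pandas as pd

df = pd.read_csv('dj30.csv').dropna()            # same file the socket source reads
df = df[['Long Date', 'Close']]
df['MA10'] = df['Close'].rolling(10).mean()      # average of the last 10 rows
df['MA40'] = df['Close'].rolling(40).mean()      # average of the last 40 rows
df['signal'] = (df['MA10'] > df['MA40']).map({True: 'buy', False: 'sell'})
print(df.tail())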
1. Initial data
The data must be filtered so that only the date and the closing price are kept.
2. Socket source that sends the stock data
#!/usr/bin/env python3
import sys
import pandas as pd
import socket
def socket_service():
    try:
        # Create the listening socket
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(('192.168.126.1', 9999))
        s.listen(1)
    except socket.error as msg:
        print(msg)
        sys.exit(1)
    print('Waiting client connection...')
    conn, addr = s.accept()
    print('Accept new connection from {0}'.format(addr))
    try:
        file = 'dj30.csv'
        dj30_df = pd.read_csv(file)
        dj30_df.dropna(inplace=True)
        # Keep only the date and the closing price
        dj30_df = dj30_df.loc[:, ['Long Date', 'Close']]
        # Send one record per line: "<id> <date> <close>"
        for index, row in dj30_df.iterrows():
            line = str(index) + " " + str(row['Long Date']) + " " + str(row['Close']) + '\r\n'
            print(line)
            conn.send(line.encode('utf-8'))
    except socket.error:
        print('Error occurred.\n\nClient disconnected.\n')
    conn.close()
    s.close()

if __name__ == '__main__':
    socket_service()
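To check the socket source locally, a small client such as the one below can connect and print the first few records. The host and port match the server above; this helper script is an illustration only and is not part of the lab code.
#!/usr/bin/env python3
# Minimal test client for the socket source above (hypothetical helper).
import socket

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
    client.connect(('192.168.126.1', 9999))
    data = b''
    for _ in range(5):          # read a few chunks from the server
        data += client.recv(1024)
print(data.decode('utf-8'))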
3. Building the real-time processing system with Structured Streaming
(1) Define the method format_date_string, which parses and normalizes the date format
(2) Define the method f, which prints and stores the MA10 results
(3) Connect to the socket source and read the data it produces
(4) Use the window function to aggregate the stock values and compute the average for each window
First, process the MA10 data:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import window
import datetime

@udf(returnType=TimestampType())
def format_date_string(val):
    # Convert "M/D/YY"-style dates to a timestamp; two-digit years >= 80
    # are treated as 19xx, the rest as 20xx.
    lst = val.split("/")
    month = lst[0]
    day = lst[1]
    year = lst[2]
    if len(month) < 2:
        month = '0' + month
    if len(day) < 2:
        day = '0' + day
    if len(year) < 4:
        if year >= '80':
            year = '19' + year
        else:
            year = '20' + year
    return datetime.datetime.strptime(year + '-' + month + '-' + day + ' 00:00:00', '%Y-%m-%d %H:%M:%S')

@udf(returnType=StringType())
def get_date(val):
    # Use the window end as the date label (YYYY-MM-DD).
    return str(val[1])[0:10]

def f(data, id):
    # Print and persist the MA10 result of each micro-batch.
    print(id)
    data.select(["dt", "avg_point_10"]) \
        .orderBy('dt', ascending=False) \
        .show()
    data.select(["dt", "avg_point_10"]) \
        .write.csv("file:///mycode/res1", header=True, mode='overwrite')

if __name__ == '__main__':
    spark = SparkSession \
        .builder \
        .appName("StructuredStreamingDemo") \
        .master("local[*]") \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    # Read the lines sent by the socket server.
    lines = spark.readStream \
        .format("socket") \
        .option("host", "192.168.126.1") \
        .option("port", 9999) \
        .load()

    # Each line is "<id> <date> <close>"; split it into separate columns.
    split_DF = F.split(lines.value, " ")
    add_id_column = split_DF.getItem(0)
    add_date_column = split_DF.getItem(1)
    add_points_column = split_DF.getItem(2)
    merged_df = lines.withColumn("id", add_id_column) \
        .withColumn("date", add_date_column) \
        .withColumn("points", add_points_column) \
        .drop("value")
    merged_df = merged_df.withColumn("points", merged_df["points"].cast("float")) \
        .withColumn("date", format_date_string(merged_df["date"]))

    # Average the closing price over a 10-day sliding window (slide = 1 day).
    merged_df2 = merged_df.withWatermark("date", "40 days")
    merged_df2.groupBy(window("date", "10 days", "1 day")) \
        .agg(F.avg("points").alias("avg_point_10")) \
        .withColumn("dt", get_date(F.col('window'))) \
        .writeStream.foreachBatch(f) \
        .outputMode("complete") \
        .start()
    spark.streams.awaitAnyTermination()
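To see how groupBy(window(...)) assigns rows to overlapping windows, the same call can be run on a small static DataFrame; with a 3-day window and a 1-day slide each row appears in three windows. This is a standalone sketch, independent of the streaming jobs in this section.
# Standalone sketch of sliding-window grouping on a static DataFrame.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import window

spark = SparkSession.builder.master("local[*]").appName("WindowSketch").getOrCreate()
df = spark.createDataFrame(
    [("2016-03-01", 100.0), ("2016-03-02", 110.0), ("2016-03-03", 120.0)],
    ["date_str", "points"],
).withColumn("date", F.to_timestamp("date_str"))

# Each row contributes to the three 3-day windows that cover its date.
df.groupBy(window("date", "3 days", "1 day")) \
  .agg(F.avg("points").alias("avg_point")) \
  .orderBy("window") \
  .show(truncate=False)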
Then process the MA40 data in the same way; only the window length, the result column, and the output path change:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import window
import datetime

@udf(returnType=TimestampType())
def format_date_string(val):
    # Convert "M/D/YY"-style dates to a timestamp; two-digit years >= 80
    # are treated as 19xx, the rest as 20xx.
    lst = val.split("/")
    month = lst[0]
    day = lst[1]
    year = lst[2]
    if len(month) < 2:
        month = '0' + month
    if len(day) < 2:
        day = '0' + day
    if len(year) < 4:
        if year >= '80':
            year = '19' + year
        else:
            year = '20' + year
    return datetime.datetime.strptime(year + '-' + month + '-' + day + ' 00:00:00', '%Y-%m-%d %H:%M:%S')

@udf(returnType=StringType())
def get_date(val):
    # Use the window end as the date label (YYYY-MM-DD).
    return str(val[1])[0:10]

def f2(data, id):
    # Print and persist the MA40 result of each micro-batch.
    print(id)
    data.select(["dt", "avg_point_40"]) \
        .orderBy('dt', ascending=False).show(10)
    data.select(["dt", "avg_point_40"]) \
        .write.csv("file:///mycode/res2", header=True, mode='overwrite')

if __name__ == '__main__':
    spark = SparkSession \
        .builder \
        .appName("StructuredStreamingDemo") \
        .master("local[*]") \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    # Read the lines sent by the socket server.
    lines = spark.readStream \
        .format("socket") \
        .option("host", "192.168.126.1") \
        .option("port", 9999) \
        .load()

    # Each line is "<id> <date> <close>"; split it into separate columns.
    split_DF = F.split(lines.value, " ")
    add_id_column = split_DF.getItem(0)
    add_date_column = split_DF.getItem(1)
    add_points_column = split_DF.getItem(2)
    merged_df = lines.withColumn("id", add_id_column) \
        .withColumn("date", add_date_column) \
        .withColumn("points", add_points_column) \
        .drop("value")
    merged_df = merged_df.withColumn("points", merged_df["points"].cast("float")) \
        .withColumn("date", format_date_string(merged_df["date"]))

    # Average the closing price over a 40-day sliding window (slide = 1 day).
    merged_df2 = merged_df.withWatermark("date", "40 days")
    s = merged_df2.groupBy(window("date", "40 days", "1 day")) \
        .agg(F.avg("points").alias("avg_point_40")) \
        .withColumn("dt", get_date(F.col('window'))) \
        .writeStream.foreachBatch(f2) \
        .outputMode("complete") \
        .start()
    spark.streams.awaitAnyTermination()
4. Final presentation of the buy/sell signals with pandas and matplotlib
(1) Read the MA40 results
(2) Read the MA10 results
(3) Compare MA10 with MA40 and decide whether to buy or sell
(4) Plot the MA10 and MA40 curves of the Dow Jones index to show the price trend
import os
import pandas as pd
import matplotlib.pyplot as plt

# Read the MA40 results
df1 = None
dir_path = "./res2/"
for file in os.listdir(dir_path):
    if file.startswith("part"):
        tmp = pd.read_csv(dir_path + file)
        if df1 is None:
            df1 = tmp
        else:
            df1 = pd.concat([df1, tmp])
df1 = df1.loc[df1['dt'] <= '2016-03-24'].sort_values("dt")

# Read the MA10 results
df2 = None
dir_path = "./res1/"
for file in os.listdir(dir_path):
    if file.startswith("part"):
        tmp = pd.read_csv(dir_path + file)
        if df2 is None:
            df2 = tmp
        else:
            df2 = pd.concat([df2, tmp])
df2 = df2.loc[df2['dt'] <= '2016-03-24'].sort_values("dt")

# Join the MA10 and MA40 results on the date
df = df1.merge(df2, on='dt')

# Plot the MA10 and MA40 curves and save the figure
plt.figure(figsize=(15, 5))
plt.plot(df['dt'], df['avg_point_40'], label='MA40')
plt.plot(df['dt'], df['avg_point_10'], label='MA10')
plt.xticks([df['dt'][i] for i in range(len(df['dt'].values)) if i % 1000 == 0 or i == len(df) - 1],
           rotation=30)
plt.legend()
plt.xlabel("date")
plt.ylabel("avg_point")
plt.show()
# plt.savefig("./avg_point.png", dpi=300)

# Compare MA10 with MA40 and decide whether to buy or sell on each date
with open("./result.txt", "w") as f:
    for idx, item in df.iterrows():
        dt = item['dt']
        avg10 = item['avg_point_10']
        avg40 = item['avg_point_40']
        s = ""
        if avg10 > avg40:
            s = dt + " " + "buy"
        else:
            s = dt + " " + "sell"
        print(s)
        f.write(s + "\n")
5. Running the stock buy/sell analysis
(1) Start the Spark node
(2) Start the socket server to generate the data source
(3) Start the clients to process the data
Copy the MMA10.py file to the virtual machine to compute the MA10 data.
After receiving the data from the socket server, the client computes the MA10 average; the output is shown below.
Copy the MMA40.py file to the virtual machine to compute the MA40 data.
After receiving the data from the socket server, the client computes the MA40 average; the output is shown below.
(4) Display of the moving-average results
Sample of the MA10 averages:
Sample of the MA40 averages:
(5) Compare MA10 with MA40 to decide the buy or sell action for each date; the output after local processing is shown below:
The Dow Jones index chart is shown below:
It reflects the trend of the index from 1990 to 2016.
In this experiment, the socket source successfully sent data to the clients, the clients processed the data, the buy/sell analysis was completed, and the trend of the index was visualized.