"""Load the white-wine quality CSV into a Spark DataFrame and plot `quality`.

The script reads a ';'-delimited text file whose first line is a header,
turns every remaining line into a typed ``Row``, registers the result as the
temporary view ``white_wine``, and plots the ``quality`` column with
matplotlib.
"""
from pyspark.sql import SparkSession, Row
from matplotlib import pyplot as plt

# Column names for fields 0-10 of each record, in file order. These are parsed
# as floats; field 11 (quality) is handled separately as an integer.
_MEASUREMENT_COLUMNS = (
    "fixedAcidity",
    "volatileAcidity",
    "citricAcid",
    "residualSugar",
    "chlorides",
    "freeSulfurDioxide",
    "totalSulfurDioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
)


def _to_wine_row(fields):
    """Build a numerically typed ``Row`` from one split CSV record.

    Args:
        fields: The strings produced by splitting one data line on ';'.
            At least twelve entries are required.

    Returns:
        A ``Row`` with the eleven measurement columns as ``float`` and
        ``quality`` as ``int``.

    Raises:
        IndexError: If the record has fewer than twelve fields.
        ValueError: If a field is not a valid number.
    """
    values = {
        name: float(raw) for name, raw in zip(_MEASUREMENT_COLUMNS, fields)
    }
    # NOTE(review): assumes quality is stored as a whole number - confirm
    # against the data file.
    values["quality"] = int(fields[11])
    return Row(**values)


spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("spark_01")
    .getOrCreate()
)
try:
    sc = spark.sparkContext
    lines = sc.textFile(
        "/Users/f7689781/Downloads/winequality/winequality-white.csv"
    )

    # The first line is the header; drop it (and any blank lines, which would
    # otherwise fail in the row conversion) before parsing.
    first_ = lines.first()
    filter_rdd = lines.filter(lambda r: r.strip() != "" and r != first_)
    map_rdd = filter_rdd.map(lambda r: r.split(";"))

    # Convert to numeric types here so that Spark infers double/long columns
    # instead of strings, and so that matplotlib plots numbers rather than
    # treating each distinct string as a category.
    row_rdd = map_rdd.map(_to_wine_row)

    data_df = spark.createDataFrame(row_rdd)
    data_df.show()

    data_df.createOrReplaceTempView("white_wine")
    quality_df = spark.sql("select quality from white_wine")
    quality = quality_df.rdd.map(lambda r: r.quality).collect()

    plt.plot(quality)
    plt.show()
finally:
    # Always release the local Spark session, even if loading or plotting
    # fails part-way through.
    spark.stop()
python、spark整合
最新推荐文章于 2024-03-26 14:51:46 发布