from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row, SQLContext
from pyspark.sql.functions import udf, col, concat, concat_ws, explode, split
from pyspark.sql.types import MapType, IntegerType, StringType, FloatType, ArrayType
from pyspark.ml.feature import VectorAssembler
from itertools import chain
from pyspark.ml.linalg import Vectors, VectorUDT
import json
# Build the local SparkSession used by every example below.
_builder = SparkSession.builder.appName('test1').master('local')
for _key, _val in (('spark.executor.memory', '4g'),
                   ('spark.executor.cores', '4')):
    _builder = _builder.config(_key, _val)
spark = _builder.getOrCreate()
sc = spark.sparkContext
# NOTE(review): SQLContext is deprecated since Spark 2.0 — `spark` itself
# exposes the same API. Kept here for behavioral parity with the original.
sqlContext = SQLContext(sc)
构建示例数据
# Sample data: one JSON string of subject->score per student.
_rows = [
    ['王强', '{"语文":119,"数学":125,"外语":120,"物理":96,"化学":70,"生物":40}'],
    ['刘明', '{"语文":105,"数学":121,"外语":110,"物理":71,"化学":74,"生物":45}'],
    ['张磊', '{"语文":110,"数学":103,"外语":124,"物理":86,"化学":52,"生物":55}'],
]
test_score = spark.createDataFrame(_rows, ['name', 'score'])
test_score.show(5, truncate=False, vertical=False)
test_score.printSchema()
+----+----------------------------------------------------------------+
|name|score |
+----+----------------------------------------------------------------+
|王强|{"语文":119,"数学":125,"外语":120,"物理":96,"化学":70,"生物":40}|
|刘明|{"语文":105,"数学":121,"外语":110,"物理":71,"化学":74,"生物":45}|
|张磊|{"语文":110,"数学":103,"外语":124,"物理":86,"化学":52,"生物":55}|
+----+----------------------------------------------------------------+
root
|-- name: string (nullable = true)
|-- score: string (nullable = true)
将数据中 score
列转换成映射
def udf_array_to_map(array):
    """Parse a JSON-object string into a dict; None passes through unchanged.

    json.loads already returns a dict for a JSON object, so the original
    ``dict((i, j) for i, j in ....items())`` rebuild was redundant.
    """
    if array is None:
        return array
    return json.loads(array)
# Register the JSON-parsing function as a Spark UDF returning map<string,int>.
# The `lambda x: udf_array_to_map(x)` wrapper was redundant — the function
# can be handed to udf() directly.
score_udf = udf(udf_array_to_map, MapType(StringType(), IntegerType(), True))
test_score_map = test_score.withColumn('score_map', score_udf(col('score'))).select('name', 'score_map')
test_score_map.show(truncate=False, vertical=False)
test_score_map.printSchema()
+----+---------------------------------------------------------------------------+
|name|score_map |
+----+---------------------------------------------------------------------------+
|王强|[物理 -> 96, 生物 -> 40, 化学 -> 70, 数学 -> 125, 语文 -> 119, 外语 -> 120]|
|刘明|[物理 -> 71, 生物 -> 45, 化学 -> 74, 数学 -> 121, 语文 -> 105, 外语 -> 110]|
|张磊|[物理 -> 86, 生物 -> 55, 化学 -> 52, 数学 -> 103, 语文 -> 110, 外语 -> 124]|
+----+---------------------------------------------------------------------------+
root
|-- name: string (nullable = true)
|-- score_map: map (nullable = true)
| |-- key: string
| |-- value: integer (valueContainsNull = true)
利用 explode
函数将 score_map
列中每行分成多行
# Explode the map column: one (subject, score) row per map entry.
test_score_explode = test_score_map.select(
    'name', explode(col('score_map')).alias('subject', 'score'))
test_score_explode.show()
test_score_explode.printSchema()
+----+-------+-----+
|name|subject|score|
+----+-------+-----+
|王强| 物理| 96|
|王强| 生物| 40|
|王强| 化学| 70|
|王强| 数学| 125|
|王强| 语文| 119|
|王强| 外语| 120|
|刘明| 物理| 71|
|刘明| 生物| 45|
|刘明| 化学| 74|
|刘明| 数学| 121|
|刘明| 语文| 105|
|刘明| 外语| 110|
|张磊| 物理| 86|
|张磊| 生物| 55|
|张磊| 化学| 52|
|张磊| 数学| 103|
|张磊| 语文| 110|
|张磊| 外语| 124|
+----+-------+-----+
root
|-- name: string (nullable = true)
|-- subject: string (nullable = false)
|-- score: integer (nullable = true)
将 score_map
中的数据拆分成多个列
# Unfold score_map into one column per subject via getItem + alias,
# instead of six chained withColumn calls.
test_score_unfold = test_score_map.select(
    'name',
    *[test_score_map['score_map'].getItem(s).alias(s)
      for s in ('语文', '数学', '外语', '物理', '化学', '生物')])
test_score_unfold.show()
test_score_unfold.printSchema()
+----+----+----+----+----+----+----+
|name|语文|数学|外语|物理|化学|生物|
+----+----+----+----+----+----+----+
|王强| 119| 125| 120| 96| 70| 40|
|刘明| 105| 121| 110| 71| 74| 45|
|张磊| 110| 103| 124| 86| 52| 55|
+----+----+----+----+----+----+----+
root
|-- name: string (nullable = true)
|-- 语文: integer (nullable = true)
|-- 数学: integer (nullable = true)
|-- 外语: integer (nullable = true)
|-- 物理: integer (nullable = true)
|-- 化学: integer (nullable = true)
|-- 生物: integer (nullable = true)
用 concat
将多列合并成一列
# Merge the per-subject columns back into a single string with concat
# (no separator between values).
test_score_fold = test_score_unfold.select(
    'name',
    concat(*[test_score_unfold[s]
             for s in ('语文', '数学', '外语', '物理', '化学', '生物')]
           ).alias('score_concat'))
test_score_fold.show()
test_score_fold.printSchema()
+----+---------------+
|name| score_concat|
+----+---------------+
|王强|119125120967040|
|刘明|105121110717445|
|张磊|110103124865255|
+----+---------------+
root
|-- name: string (nullable = true)
|-- score_concat: string (nullable = true)
用 concat_ws
将多列合并成一列 concat_ws可选择分割符
# Same merge, but concat_ws lets us pick a separator (a comma here).
test_score_fold_ = test_score_unfold.select(
    'name',
    concat_ws(',', *[test_score_unfold[s]
                     for s in ('语文', '数学', '外语', '物理', '化学', '生物')]
              ).alias('score_concat'))
test_score_fold_.show()
test_score_fold_.printSchema()
+----+--------------------+
|name| score_concat|
+----+--------------------+
|王强|119,125,120,96,70,40|
|刘明|105,121,110,71,74,45|
|张磊|110,103,124,86,52,55|
+----+--------------------+
root
|-- name: string (nullable = true)
|-- score_concat: string (nullable = false)
用 split
将上面的数据转化成 array
# Turn the comma-joined string back into an array<string> column.
test_score_split = test_score_fold_.withColumn(
    'score_array', split(col('score_concat'), ","))
test_score_split.show(truncate=False, vertical=False)
test_score_split.printSchema()
+----+--------------------+---------------------------+
|name|score_concat |score_array |
+----+--------------------+---------------------------+
|王强|119,125,120,96,70,40|[119, 125, 120, 96, 70, 40]|
|刘明|105,121,110,71,74,45|[105, 121, 110, 71, 74, 45]|
|张磊|110,103,124,86,52,55|[110, 103, 124, 86, 52, 55]|
+----+--------------------+---------------------------+
root
|-- name: string (nullable = true)
|-- score_concat: string (nullable = false)
|-- score_array: array (nullable = false)
| |-- element: string (containsNull = true)
另一种方法
def div(data):
    """Split a comma-separated string into a list of tokens.

    None passes through unchanged so the UDF does not raise
    AttributeError on null rows.
    """
    if data is None:
        return None
    return data.split(',')
# Alternative to the built-in split(): do the same with a Python UDF.
_string_array = ArrayType(StringType())
udf_split = udf(div, _string_array)
test_score_split_ = test_score_fold_.withColumn(
    'score_array', udf_split(col('score_concat')))
test_score_split_.show(truncate=False)
test_score_split_.printSchema()
+----+--------------------+---------------------------+
|name|score_concat |score_array |
+----+--------------------+---------------------------+
|王强|119,125,120,96,70,40|[119, 125, 120, 96, 70, 40]|
|刘明|105,121,110,71,74,45|[105, 121, 110, 71, 74, 45]|
|张磊|110,103,124,86,52,55|[110, 103, 124, 86, 52, 55]|
+----+--------------------+---------------------------+
root
|-- name: string (nullable = true)
|-- score_concat: string (nullable = false)
|-- score_array: array (nullable = true)
| |-- element: string (containsNull = true)
将 array 转成 vector
def to_vec(data):
    """Build a DenseVector from the elements of an array column value."""
    dense = Vectors.dense(data)
    return dense

# Convert the string array into an ML vector column.
u_t = udf(to_vec, VectorUDT())
test_score_vector = test_score_split_.withColumn(
    'score_vector', u_t(col('score_array')))
test_score_vector.show(truncate=False)
test_score_vector.printSchema()
root
|-- name: string (nullable = true)
|-- score_concat: string (nullable = false)
|-- score_array: array (nullable = true)
| |-- element: string (containsNull = true)
|-- score_vector: vector (nullable = true)
将 vector 类型 转成 array
def to_arr(data):
    """Convert an iterable of numbers (e.g. a vector) into a list of floats."""
    return list(map(float, data))
# Map the vector column back to an array<float> column via a UDF.
_float_array = ArrayType(FloatType())
udfage = udf(to_arr, _float_array)
test_score_array = test_score_vector.withColumn(
    'score_vector_to_array', udfage(col('score_vector')))
test_score_array.show(truncate=False)
test_score_array.printSchema()
+----+--------------------+---------------------------+----------------------------------+---------------------------------------+
|name|score_concat |score_array |score_vector |score_vector_to_array |
+----+--------------------+---------------------------+----------------------------------+---------------------------------------+
|王强|119,125,120,96,70,40|[119, 125, 120, 96, 70, 40]|[119.0,125.0,120.0,96.0,70.0,40.0]|[119.0, 125.0, 120.0, 96.0, 70.0, 40.0]|
|刘明|105,121,110,71,74,45|[105, 121, 110, 71, 74, 45]|[105.0,121.0,110.0,71.0,74.0,45.0]|[105.0, 121.0, 110.0, 71.0, 74.0, 45.0]|
|张磊|110,103,124,86,52,55|[110, 103, 124, 86, 52, 55]|[110.0,103.0,124.0,86.0,52.0,55.0]|[110.0, 103.0, 124.0, 86.0, 52.0, 55.0]|
+----+--------------------+---------------------------+----------------------------------+---------------------------------------+
root
|-- name: string (nullable = true)
|-- score_concat: string (nullable = false)
|-- score_array: array (nullable = true)
| |-- element: string (containsNull = true)
|-- score_vector: vector (nullable = true)
|-- score_vector_to_array: array (nullable = true)
| |-- element: float (containsNull = true)
将上面转化后的数据变成多列
# Split score_concat into one column per subject.
first_row = test_score_fold_.first()
numAttrs = len(first_row['score_concat'].split(","))
print("新增列的个数", numAttrs)
# Local enumerate replaces the original sc.parallelize(...).zipWithIndex()
# round trip — same (name, index) pairs, no Spark job needed.
subjects = ["语文", "数学", "外语", "物理", "化学", "生物"]
attrs = [(s, i) for i, s in enumerate(subjects)]
print("列名:", attrs)
# BUG fix: df_split was referenced without ever being defined (NameError).
# Build it here: column 's' holds score_concat split into an array, matching
# the name|score_concat|s layout shown in the output below.
df_split = test_score_fold_.withColumn('s', split(test_score_fold_['score_concat'], ","))
for name, index in attrs:
    df_split = df_split.withColumn(name, df_split['s'][index])
df_split.show(truncate=False, vertical=False)
新增列的个数 6
列名: [('语文', 0), ('数学', 1), ('外语', 2), ('物理', 3), ('化学', 4), ('生物', 5)]
+----+--------------------+---------------------------+----+----+----+----+----+----+
|name|score_concat |s |语文|数学|外语|物理|化学|生物|
+----+--------------------+---------------------------+----+----+----+----+----+----+
|王强|119,125,120,96,70,40|[119, 125, 120, 96, 70, 40]|119 |125 |120 |96 |70 |40 |
|刘明|105,121,110,71,74,45|[105, 121, 110, 71, 74, 45]|105 |121 |110 |71 |74 |45 |
|张磊|110,103,124,86,52,55|[110, 103, 124, 86, 52, 55]|110 |103 |124 |86 |52 |55 |
+----+--------------------+---------------------------+----+----+----+----+----+----+
将单行数据变成多行
# One output row per comma-separated value: split, then explode.
df_explode = test_score_fold_.withColumn(
    "e", explode(split(col('score_concat'), ",")))
df_explode.show()
df_explode.printSchema()
+----+--------------------+---+
|name| score_concat| e|
+----+--------------------+---+
|王强|119,125,120,96,70,40|119|
|王强|119,125,120,96,70,40|125|
|王强|119,125,120,96,70,40|120|
|王强|119,125,120,96,70,40| 96|
|王强|119,125,120,96,70,40| 70|
|王强|119,125,120,96,70,40| 40|
|刘明|105,121,110,71,74,45|105|
|刘明|105,121,110,71,74,45|121|
|刘明|105,121,110,71,74,45|110|
|刘明|105,121,110,71,74,45| 71|
|刘明|105,121,110,71,74,45| 74|
|刘明|105,121,110,71,74,45| 45|
|张磊|110,103,124,86,52,55|110|
|张磊|110,103,124,86,52,55|103|
|张磊|110,103,124,86,52,55|124|
|张磊|110,103,124,86,52,55| 86|
|张磊|110,103,124,86,52,55| 52|
|张磊|110,103,124,86,52,55| 55|
+----+--------------------+---+
root
|-- name: string (nullable = true)
|-- score_concat: string (nullable = false)
|-- e: string (nullable = true)
# A one-row DataFrame with two array columns (note: show() orders
# columns alphabetically for Row-built frames).
sample_row = Row(id='ID1', tokens=['one', 'two', 'two'], bigrams=['one two', 'two two'])
a = [sample_row]
b = spark.createDataFrame(a)
b.show()
b.printSchema()
+------------------+---+---------------+
| bigrams| id| tokens|
+------------------+---+---------------+
|[one two, two two]|ID1|[one, two, two]|
+------------------+---+---------------+
root
|-- bigrams: array (nullable = true)
| |-- element: string (containsNull = true)
|-- id: string (nullable = true)
|-- tokens: array (nullable = true)
| |-- element: string (containsNull = true)
def concat_(*args):
    """Concatenate any number of lists into one flat list.

    chain.from_iterable is the idiomatic form — it consumes the argument
    tuple lazily instead of re-unpacking it into chain(*args).
    """
    return list(chain.from_iterable(args))
# Merge the two array columns into a single array column with the UDF.
lll = udf(concat_, ArrayType(StringType()))
merged = lll(col("tokens"), col("bigrams"))
c = b.withColumn('bein', merged)
c.show(truncate=False)
c.printSchema()
+------------------+---+---------------+---------------------------------+
|bigrams |id |tokens |bein |
+------------------+---+---------------+---------------------------------+
|[one two, two two]|ID1|[one, two, two]|[one, two, two, one two, two two]|
+------------------+---+---------------+---------------------------------+
root
|-- bigrams: array (nullable = true)
| |-- element: string (containsNull = true)
|-- id: string (nullable = true)
|-- tokens: array (nullable = true)
| |-- element: string (containsNull = true)
|-- bein: array (nullable = true)
| |-- element: string (containsNull = true)