获取一个列的方法
- 直接列的名称
- col()
- apply()
- expr():通过表达式字符串构建列
- lit():用一个常量值创建列
- column()
其它api
withColumn 增加一列
withColumnRenamed 列重命名
drop 删除列
toDF 批量修改列的名称(新列名的数量和顺序必须与原有列一致)
代码示例
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.apache.spark.sql.functions.*;
/**
 * Walk-through of the ways a column of a Spark {@code Dataset<Row>} can be
 * referenced ({@code select} by name, {@code col}, {@code apply}, {@code expr},
 * {@code lit}, {@code column}) and of the basic column-level operations
 * ({@code withColumn}, {@code withColumnRenamed}, {@code drop}, {@code toDF}).
 *
 * <p>The input is {@code people.json} under {@code Utils.BASE_PATH}. The tables in
 * the block comments are the outputs recorded for that file.
 */
public class test_26 {

    /**
     * Runs every demo step against a local Spark session and prints each result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .config("spark.driver.host", "localhost")
                .appName("ColumnInDFTest")
                .master("local")
                .getOrCreate();

        // Everything after session creation runs inside try/finally so that the
        // session is stopped even when reading the file or a show() call fails.
        try {
            spark.sparkContext().setLogLevel("ERROR");

            Dataset<Row> personDf = spark.read().json(Utils.BASE_PATH + "/people.json");

            // Select columns directly by name.
            personDf.select("name", "age").show();
            /*
            +-------+---+
            |   name|age|
            +-------+---+
            |Michael| 29|
            |   Andy| 30|
            | Justin| 19|
            +-------+---+
            */

            // List the names of all columns (printed so the result is actually visible).
            System.out.println(String.join(", ", personDf.columns()));

            // Dataset.col(...) returns a Column bound to this DataFrame.
            Column nameColumn = personDf.col("name");
            Column ageColumn = personDf.col("age");
            personDf.select(nameColumn, ageColumn).show();
            /*
            +-------+---+
            |   name|age|
            +-------+---+
            |Michael| 29|
            |   Andy| 30|
            | Justin| 19|
            +-------+---+
            */

            // Dataset.apply(...) delegates to col(...) under the hood.
            Column nameViaApply = personDf.apply("name");
            Column ageViaApply = personDf.apply("age");
            personDf.select(nameViaApply, ageViaApply).show();
            /*
            +-------+---+
            |   name|age|
            +-------+---+
            |Michael| 29|
            |   Andy| 30|
            | Justin| 19|
            +-------+---+
            */

            // functions.expr(...) builds a Column from an expression string.
            personDf.select(expr("age + 1")).show();
            /*
            +---------+
            |(age + 1)|
            +---------+
            |       30|
            |       31|
            |       20|
            +---------+
            */

            // functions.lit(...) builds a Column from a constant value.
            personDf.select(lit("abc")).show();
            /*
            +---+
            |abc|
            +---+
            |abc|
            |abc|
            |abc|
            +---+
            */

            // functions.col(...) references a column by name; here it drives a filter (age > 19).
            personDf.filter(col("age").gt(19)).show();
            /*
            +---+-------+
            |age|   name|
            +---+-------+
            | 29|Michael|
            | 30|   Andy|
            +---+-------+
            */

            // functions.column(...) is another way to reference a column by name.
            personDf.select(column("name"), column("age")).show();
            /*
            +-------+---+
            |   name|age|
            +-------+---+
            |Michael| 29|
            |   Andy| 30|
            | Justin| 19|
            +-------+---+
            */

            // Add a new column big_age whose values are taken from column age.
            personDf.withColumn("big_age", col("age")).show();
            /*
            +---+-------+-------+
            |age|   name|big_age|
            +---+-------+-------+
            | 29|Michael|     29|
            | 30|   Andy|     30|
            | 19| Justin|     19|
            +---+-------+-------+
            */

            // Rename column age to big_age.
            personDf.withColumnRenamed("age", "big_age").show();
            /*
            +-------+-------+
            |big_age|   name|
            +-------+-------+
            |     29|Michael|
            |     30|   Andy|
            |     19| Justin|
            +-------+-------+
            */

            // Drop column age.
            personDf.drop("age").show();
            /*
            +-------+
            |   name|
            +-------+
            |Michael|
            |   Andy|
            | Justin|
            +-------+
            */

            // The operations above each returned a new Dataset; personDf itself is unchanged.
            personDf.show();
            personDf.printSchema();

            // toDF renames all columns at once, by position: the number of new names
            // must equal the number of columns and they follow the existing column
            // order (age, name).
            personDf.toDF("old_age", "first_name").show();
            /*
            +-------+----------+
            |old_age|first_name|
            +-------+----------+
            |     29|   Michael|
            |     30|      Andy|
            |     19|    Justin|
            +-------+----------+
            */
        } finally {
            spark.stop();
        }
    }
}