今天尝试了下DataFrame的常用属性和方法。
// Create (or reuse) the Spark session shared by every example below.
SparkSession spark = SparkSession
    .Builder()
    .AppName("xfj_xfr")
    .GetOrCreate();

// xfr.csv: '|'-delimited, UTF-8, loaded with an explicit five-column schema.
// An alternative, wider schema is left commented out for reference:
//.Schema("id STRING,xm STRING,xb STRING,zz STRING,zzdm STRING,nl STRING,sfzhm STRING,hjdz STRING,zy STRING,csny STRING,xl STRING,mz STRING,sjhm STRING,gzdw STRING,gddh STRING,xfjbh STRING,txdz STRING,xg STRING,ah STRING,wxdj STRING,sfzdry STRING,zdryzb STRING,sfxfln STRING,xfrid STRING,yzbm STRING,xh STRING")
DataFrameReader xfrReader = spark.Read()
    .Option("Delimiter", "|")
    .Option("Encoding", "UTF-8")
    .Schema("xfjbh STRING,xm STRING,sfzhm STRING,zz STRING,xb INT");
DataFrame xfrdf = xfrReader.Csv("xfr.csv");

// xfj.csv: same delimiter and encoding, with its own five-column schema.
DataFrameReader xfjReader = spark.Read()
    .Option("Delimiter", "|")
    .Option("Encoding", "UTF-8")
    .Schema("xfjbh STRING,djsj STRING,djjgmc STRING,gkxx STRING,cfxfbz INT");
DataFrame xfjdf = xfjReader.Csv("xfj.csv");
// Print the DataFrame using the default row count and cell truncation.
xfrdf.Show();
// numRows: number of rows to print; truncate: maximum cell width before the value
// is cut off with "..."; vertical: print each row as one "column value" line per field.
xfrdf.Show(numRows: 50, truncate: 500, vertical: true);

// First() and Head() both return the first row. Every call runs a Spark job, so the
// row is fetched once here and reused by the loop below instead of per iteration.
Row firstRow = xfrdf.First();
xfrdf.Head(); // same result as First(); the returned row is not used here

// Print "column:value" for every field of the first row.
// The column list is also fetched once rather than on every loop iteration.
var xfrColumns = xfrdf.Columns();
for (int i = 0; i < xfrColumns.Count; i++)
{
    Console.WriteLine(xfrColumns[i] + ":" + firstRow[i] + "\t");
}

// Print every column name of xfjdf.
foreach (var columnName in xfjdf.Columns())
{
    Console.WriteLine(columnName);
}

// Print the execution plan (extended: true requests the extended plan output).
xfrdf.Explain(true);

// These two return booleans; print them so the result is actually visible.
Console.WriteLine("IsLocal: " + xfrdf.IsLocal());         // whether Collect/Take can run locally, without executors
Console.WriteLine("IsStreaming: " + xfrdf.IsStreaming()); // whether the DataFrame has a streaming source

// Print the schema as a tree.
xfrdf.PrintSchema();

// Mark the DataFrame to be cached at the default storage level so later actions can
// reuse it instead of re-reading the CSV. The returned DataFrame is not needed here.
xfrdf.Persist();
// Pitfall: ToDF does NOT select a subset of columns. It returns a copy of the
// DataFrame with every column renamed, by position, to the names given here.
DataFrame xfjdfFilter = xfjdf.ToDF(new string[] { "xfjbh", "djsj", "djjgmc", "tsnr", "jabz" });
xfjdfFilter.Show();

// Aggregate over the whole DataFrame: maximum value of the xfjbh column.
xfrdf.Agg(Functions.Max("xfjbh")).Show();

// Row count per xfjbh value, written with a free-standing column reference...
xfrdf.GroupBy(Functions.Col("xfjbh")).Count().Show();
// ...and the same grouping with a column resolved from the DataFrame itself.
xfrdf.GroupBy(xfrdf.Col("xfjbh")).Count().Show();

// Average of the xb column per xfjbh group.
xfrdf.GroupBy(Functions.Col("xfjbh")).Avg("xb").Show();

// Remove duplicate rows.
xfrdf.Distinct().Show();

// Return a new DataFrame without the zz column.
xfrdf.Drop("zz").Show();

// Give the DataFrame an alias; useful when its columns must be qualified,
// for example in a self-join.
xfrdf.As("xfrdf1").Show();

// DropDuplicates removes duplicate ROWS (not columns). It is a lazy transformation,
// so the result has to be used (here: shown) for anything to happen.
xfrdf.DropDuplicates().Show();

// Rows of xfrdf that do not appear in xfjdf.
// NOTE(review): the two DataFrames have different column names, so rows are presumably
// matched by column position - confirm this comparison is meaningful for the data.
xfrdf.Except(xfjdf).Show();

// Rows present in both DataFrames (same caveat as above).
xfrdf.Intersect(xfjdf).Show();
// Equi-join the two DataFrames on their shared xfjbh column.
DataFrame xfr_xfj = xfrdf.Join(xfjdf, "xfjbh");

// Keep only the rows matching the SQL condition.
xfr_xfj.Filter("xfjbh='20191210134612347'").Show();

// Sort by xfjbh.
xfr_xfj.OrderBy(Functions.Col("xfjbh")).Show();

// Limit returns a new DataFrame holding at most 2 rows.
xfr_xfj.Limit(2).Show();

// Na() by itself only returns the null-handling helper and changes nothing;
// Drop() is what removes rows containing null values.
// Use Drop("all") instead to remove only rows in which every column is null.
xfr_xfj.Na().Drop().Show();

// Rename column xfjbh to xfbm.
xfr_xfj.WithColumnRenamed("xfjbh", "xfbm").Show();

// Add a new column xfbm holding a copy of xfjbh.
xfr_xfj.WithColumn("xfbm", xfr_xfj["xfjbh"]).Show();

// Shut down the Spark session.
spark.Stop();