"""PySpark DataFrame-API cheat sheet.

Each snippet shows the DataFrame-API equivalent of a SQL statement; the
SQL is given in the comment above each call.

NOTE(review): assumes a DataFrame ``df`` with columns userid, docid,
percent, ctr, expose, click is defined elsewhere (not visible in this
chunk) -- confirm before running.
"""
from __future__ import print_function, division

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType, MapType
from pyspark.sql.types import *  # noqa: F401,F403

# -- select --
# SELECT userid, docid FROM data
df.select('userid', 'docid').show()

# -- alias --
# SELECT userid AS uid FROM data
df.selectExpr('userid as uid').show()

# -- cast --
# convert `percent` to double: SELECT CAST(percent AS double) FROM data
df.selectExpr('cast(percent as double)').printSchema()

# -- expression --
# SELECT docid, ctr * 2 AS ctr FROM data
df.selectExpr('docid', 'ctr * 2 as ctr').show()

# -- sum / agg --
# (was a bare `sum,agg` statement, which raises NameError on `agg`;
#  it was clearly intended as this section header)
# SELECT userid, docid, SUM(expose) expose, SUM(click) click
# FROM data GROUP BY userid, docid
df.groupBy("userid", "docid") \
    .agg({"expose": "sum", "click": "sum"}) \
    .withColumnRenamed("sum(expose)", "expose") \
    .withColumnRenamed("sum(click)", "click")

# -- filter --
# (was a bare `filter` statement -- a no-op builtin reference intended
#  as this section header)
# SELECT docid, ctr FROM data WHERE docid = 2 AND ctr > 0.3
# (original comment said docid="123", which did not match the code below)
df.select("docid", "ctr").filter('docid== 2 and ctr > 0.3').show()