1、初始化:
a)声明:
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import IntegerType
b)创建SQL上下文环境:
sc = SparkContext(conf = SparkConf().setAppName('local'))
sqlContext = SQLContext(sc)
2、数据输入:
a)读取json文件:
data = sqlContext.read.json("testweet.json")
b)list数据转换:
datalist = sc.parallelize([Row(name = "holden", favourite = "coffee"), Row(name = "golden", favourite = "tea")])
data = sqlContext.createDataFrame(datalist)
data的类型是DataFrame。
3、DataFrame操作:
data. printSchema():输出结构信息;
data.select(“col_name”).show():输出列col_name;
data.filter(data.col_name > num).show():输出列col_name中大于num的部分;
data.groupBy(“col_name”).count().show():输出排序后的列col_name,并给出每个值的出现次数。
4、注册为临时表,使用sql语句查询:
a)注册:
data.registerTempTable("table") #table为表名
b)sql查询:
ds = sqlContext.sql("""SELECT * FROM table""")
c)自定义函数:
sqlContext.registerFunction("strLen", lambda x: len(x), IntegerType()) #自定义函数
strLen ds = sqlContext.sql("""SELECT strLen(col) FROM table LIMIT 10""")
5、完整代码
(1)
1 sc = SparkContext(conf = SparkConf().setAppName('local')) 2 sqlContext = SQLContext(sc) 3 4 hpRdd = sc.parallelize([Row(name = "holden", favourite = "coffee"), Row(name = "golden", favourite = "tea")]) 5 hpSchemaRdd = sqlContext.createDataFrame(hpRdd) 6 hpSchemaRdd.select('name').show() 7 8 hpSchemaRdd.registerTempTable("happypeople") 9 hpselect = sqlContext.sql("""SELECT * FROM happypeople WHERE favourite = 'tea'""") 10 print hpselect.collect() 11 12 sc.stop()
(2)
sc = SparkContext(conf = SparkConf().setAppName('local')) sqlContext = SQLContext(sc) hiveContext = HiveContext(sc) #读取并查询json数据 input = sqlContext.read.json("testweet.json") input.printSchema() #输出json的结构信息:各项key及value的取值类型 input.select("retweetCount").show() #选取指定列输出 input.filter(input.retweetCount == 0).show() #筛选出retweetCount值为0的记录输出 input.registerTempTable("tweets") #将输入(SchemaRDD)注册为临时表 topTweets = sqlContext.sql("""SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10""") topTweets = sqlContext.sql("""SELECT text, retweetCount FROM tweets WHERE retweetCount > 10""") print topTweets.collect() topTweetText = topTweets.map(lambda row: row.text) print topTweetText