Spark SQL in Python

# load a CSV file with a header row
df = spark.read.csv(filename, header=True)

# register the DataFrame as a SQL temp view and query it
df.createOrReplaceTempView("schedule")
spark.sql("select * from schedule where station = 'San Jose'").show()

# three ways to inspect a table's schema
result = spark.sql("show columns from tablename")
result = spark.sql("select * from tablename limit 0")
result = spark.sql("describe tablename")
result.show()
print(result.columns)
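The same schema information is also available directly on the DataFrame, without going through SQL:

df.printSchema()    # column names and types, as a tree
print(df.dtypes)    # list of (name, type) tuples
print(df.columns)   # column names only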

# loading delimited text
df = spark.read.csv("trainsched.txt", header=True)
df.show()
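By default every CSV column is loaded as a string; a minimal sketch asking Spark to infer column types from the data instead:

df = spark.read.csv("trainsched.txt", header=True, inferSchema=True)
df.printSchema()
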
Window function SQL

OVER clause and ORDER BY clause

query = """
select train_id, station, time,
lead(time, 1) over (order by time) as time_next
from schedule
where train_id = 324 """

spark.sql(query).show()

PARTITION BY clause

select
train_id,
station,
time,
lead(time, 1) over (partition by train_id order by time) as time_next
from schedule
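A minimal sketch running this from Python, assuming the schedule view registered earlier:

spark.sql("""
select train_id, station, time,
lead(time, 1) over (partition by train_id order by time) as time_next
from schedule""").show()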

Three ways to select 2 columns

from pyspark.sql.functions import col

df.select('train_id', 'station')
df.select(df.train_id, df.station)
df.select(col('train_id'), col('station'))
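Bracket indexing gives a fourth equivalent form:

df.select(df['train_id'], df['station'])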

Two ways to rename a column

df.select('train_id', 'station').withColumnRenamed('train_id', 'train').show()

df.select(col('train_id').alias('train'), 'station')

SQL queries using dot notation

spark.sql('select train_id as train, station from schedule limit 5').show()
# equivalent to
df.select(col('train_id').alias('train'), 'station').limit(5).show()

Window function using dot notation

from pyspark.sql import Window
from pyspark.sql.functions import row_number

df.withColumn("id", row_number().over(
    Window.partitionBy('train_id').orderBy('time')))

# equivalent to
query = """
select *,
row_number() over (partition by train_id order by time) as id
from schedule
"""
spark.sql(query).show(11)

Using a WindowSpec

from pyspark.sql.functions import lead, unix_timestamp

window = Window.partitionBy('train_id').orderBy('time')
dfx = df.withColumn('next', lead('time', 1).over(window))

# minutes to the next stop: parse 'H:m' times to seconds, subtract, divide by 60
dot_df = df.withColumn('diff_min',
                       (unix_timestamp(lead('time', 1).over(window), 'H:m')
                        - unix_timestamp('time', 'H:m')) / 60)

from pyspark.sql.functions import min, max, col
expr = [min(col("time")).alias('start'), max(col("time")).alias('end')]
dot_df = df.groupBy("train_id").agg(*expr)
dot_df.show()
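For comparison, a minimal sketch of the same aggregation in SQL, assuming the schedule view:

spark.sql("""
select train_id, min(time) as start, max(time) as end
from schedule
group by train_id""").show()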

Using window function SQL for natural language processing


Loading text

# each line of the file becomes one row with a single string column named 'value'
df = spark.read.text('sherlock.txt')
print(df.first())
print(df.count())

Loading parquet

df1 = spark.read.load('sherlock.parquet')
df1.show(15, truncate=False)
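spark.read.load defaults to the parquet format, so the format-specific reader is equivalent:

df1 = spark.read.parquet('sherlock.parquet')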

Lower case operation

from pyspark.sql.functions import col, lower

df = df1.select(lower(col('value')))
print(df.first())
df.columns   # the generated column is named 'lower(value)'

Alias operation

df = df1.select(lower(col('value')).alias('v'))
df.columns

Replacing text

from pyspark.sql.functions import regexp_replace

# each statement builds its own result from df1
df = df1.select(regexp_replace('value', r'Mr\.', 'Mr').alias('v'))
df = df1.select(regexp_replace('value', r"don't", 'do not').alias('v'))
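To apply both substitutions in a single pass, nest the calls; a minimal sketch:

df = df1.select(
    regexp_replace(
        regexp_replace('value', r'Mr\.', 'Mr'),
        r"don't", 'do not').alias('v'))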

Tokenizing text

from pyspark.sql.functions import split

# df2 is the lower-cased text with its column aliased to 'v'
df2 = df1.select(lower(col('value')).alias('v'))
df = df2.select(split('v', '[ ]').alias('words'))
df.show(truncate=False)

Split characters are discarded

punctuation = "_|.\?\!\",\'\[\]\*()"
df3 = df2.select(split('v', '[%s]' % punctuation).alias('words'))
df3.show(truncate=False)

Exploding an array

from pyspark.sql.functions import explode

df4 = df3.select(explode('words').alias('word'))
df4.show()
print(df3.count())   # rows before exploding
print(df4.count())   # one row per word after exploding

Removing empty rows

from pyspark.sql.functions import length

nonblank_df = df4.where(length('word') > 0)

Adding a row id column

from pyspark.sql.functions import monotonically_increasing_id

# ids are increasing and unique, but not necessarily consecutive
df2 = nonblank_df.select('word', monotonically_increasing_id().alias('id'))

Partitioning the data

from pyspark.sql.functions import when

df2 = df2.withColumn('title', when(df2.id < 25000, 'Preface')
                             .when(df2.id < 50000, 'Chapter 1')
                             .when(df2.id < 75000, 'Chapter 2')
                             .otherwise('Chapter 3'))
df2 = df2.withColumn('part', when(df2.id < 25000, 0)
                            .when(df2.id < 50000, 1)
                            .when(df2.id < 75000, 2)
                            .otherwise(3))
df2.show()

Repartitioning on a column

df2 = df2.repartition(4, 'part')
print(df2.rdd.getNumPartitions())

Reading pre-partitioned text

$ ls sherlock_parts

# reading a directory loads every file in it into a single DataFrame
df_parts = spark.read.text('sherlock_parts')