# Load a CSV file into a DataFrame, using the first line as column headers.
# NOTE(review): inferSchema is not set, so every column defaults to string.
df = spark.read.csv(filename, header = True)
#create SQL table and query it
df.createOrReplaceTempView("schedule")
spark.sql("select * from schedule where station = 'San Jose' ").show()
# inspecting table schema -- three equivalent queries; each assignment
# overwrites `result`, so only the last ("describe") is shown below
result = spark.sql("show columns from tablename")
result = spark.sql("select * from tablename limit 0")
result = spark.sql("describe tablename")
result.show()
print(result.columns)
#loading delimited text
df = spark.read.csv("trainsched.txt", header = True)
df.show()
# Window function SQL
# OVER clause and ORDER BY clause
# lead(time, 1) looks one row ahead within the ORDER BY time ordering,
# pairing each stop's time with the time of the next stop.
query = """
select train_id, station, time,
lead(time, 1) over (order by time) as time_next
from sched
where train_id = 324 """
spark.sql(query).show()
# PARTITION BY clause (SQL example below resets the window per train_id)
select
train_id,
station,
time,
lead(time, 1) over (partition by train_id order by time) as time_next
from sched
# Three ways to select 2 columns
df.select('train_id', 'station')            # 1. by column-name strings
df.select(df.train_id, df.station)          # 2. by attribute access
from pyspark.sql.functions import col
df.select(col('train_id'), col('station'))  # 3. by col() expressions
# Two ways to rename a column
df.select('train_id', 'station').withColumnRenamed('train_id', 'train').show()  # rename after select
df.select(col('train_id').alias('train'), 'station')                            # alias during select
# SQL queries using dot notation
spark.sql('select train_id as train, station from schedule limit 5')
# equivalent to:
df.select(col('train_id').alias('train'), 'station').limit(5).show()
# Window function using dot notation
# Number rows within each train_id partition, ordered by time.
# fixed: the original `import Window,` had a trailing comma -- a SyntaxError.
from pyspark.sql import Window
from pyspark.sql.functions import row_number
df.withColumn("id", row_number()
              .over(
                  Window.partitionBy('train_id')
                  .orderBy('time')
              )
              )
# equivalent to the dot-notation version:
query = """
select *,
row_number() over(partition by train_id order by time) as id
from schedule
"""
# fixed: a statement cannot start with `.show(11)` on its own line --
# the call must be chained in the same statement.
spark.sql(query).show(11)
# Using a WindowSpec
# Reusable WindowSpec: one window per train, ordered by departure time.
from pyspark.sql.functions import lead  # lead() is not imported anywhere above
window = Window.partitionBy('train_id').orderBy('time')  # fixed typo: was 'traid_id'
dfx = df.withColumn('next', lead('time', 1).over(window))
# Minutes between a stop's time and the next stop's time for the same train.
# Times are parsed as hour:minute ('H:m') into epoch seconds, then diffed.
from pyspark.sql.functions import lead, unix_timestamp  # not imported above
window = Window.partitionBy('train_id').orderBy('time')
dot_df = df.withColumn('diff_min',
                       (unix_timestamp(lead('time', 1).over(window), 'H:m')
                        - unix_timestamp('time', 'H:m')) / 60)
# First/last departure per train. NOTE: min/max here shadow the builtins,
# which is the usual pyspark idiom in these notes.
from pyspark.sql.functions import min, max, col
expr = [min(col("time")).alias('start'), max(col("time")).alias('end')]
# fixed: agg() takes Column expressions as positional args, so unpack the list
dot_df = df.groupBy("train_id").agg(*expr)
dot_df.show()
# Using window function SQL for natural language processing
# Loading text
# Load a plain-text file: one row per line, in a single 'value' column.
df = spark.read.text('sherlock.txt')
print(df.first())
print(df.count())
# Loading parquet
# Load a parquet file (parquet is the default format for spark.read.load)
df1 = spark.read.load('sherlock.parquet')
df1.show(15, truncate = False)
# Lower case operation
# Lower-case the 'value' column; without an alias the result column is
# auto-named lower(value).
# NOTE(review): assumes lower() is imported from pyspark.sql.functions -- confirm
df = df1.select(lower(col('value')))
print(df.first())
df.columns
# Alias operation
# Same lower-casing, but alias the result column to 'v'
df = df1.select(lower(col('value')).alias('v'))
df.columns
# Replacing text
# Replace text with regexp_replace. Raw string for the pattern avoids the
# invalid-escape warning from '\.' in a normal literal (same runtime value).
# NOTE: these are two independent examples from df1 -- the second assignment
# overwrites the first.
df = df1.select(regexp_replace('value', r'Mr\.', 'Mr').alias('v'))
df = df1.select(regexp_replace('value', "don't", 'do not').alias('v'))
# Tokenizing text
# Tokenize: split the 'v' column on spaces into an array column 'words'.
# fixed: '[]' is an empty (illegal) character class in Java regex; the
# intended pattern here appears to be '[ ]' (split on a single space).
df = df2.select(split('v', '[ ]').alias('words'))
df.show(truncate=False)
# Split characters are discarded
punctuation = "_|.\?\!",\'\[\]\*()"
# Split 'v' on any single character from the punctuation set above
df3 = df2.select(split('v', '[%s]' % punctuation).alias('words'))
df3.show(truncate=False)
# Exploding an array
# Explode the array column so each word gets its own row
df4 = df3.select(explode('words').alias('word'))
df4.show()
# NOTE(review): this counts df3 (pre-explode rows); df4.count() would
# give the total number of words -- confirm which was intended
print(df3.count())
# Removing empty rows
nonblank_df = df.where(length('word') > 0)
# Adding a row id column
df2 = df.select('word', monotonically_increasing_id().alias('id'))
# Partitioning the data
# Label rows with a book part based on id ranges: a human-readable 'title'
# and a numeric 'part' used for repartitioning later.
df2 = df.withColumn('title', when(df.id < 25000, 'Preface')
                    .when(df.id < 50000, 'Chapter 1')
                    .when(df.id < 75000, 'Chapter 2')
                    .otherwise('Chapter 3'))
df2 = df2.withColumn('part', when(df2.id < 25000, 0)
                     .when(df2.id < 50000, 1)
                     .when(df2.id < 75000, 2)
                     .otherwise(3))
# fixed: .show() returns None, so it must not be part of the assignment
# (the original left df2 = None)
df2.show()
# Repartitioning on a column
# Repartition into 4 partitions keyed on the 'part' column.
# fixed: comma typo -- `df,repartition(...)` built a tuple; it must be
# a method call on df.
df2 = df.repartition(4, 'part')
print(df2.rdd.getNumPartitions())
# Reading pre-partitioned text
# shell: $ ls sherlock_parts
df_parts = spark.read.text('sherlock_parts')