![pyspark](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9zczIuYmRzdGF0aWMuY29tLzcwY0Z2blNoX1ExWW54R2twb1dLMUhGNmhoeS9pdC91PTQyMzY3NTY4MjYsNDIyNjUzNjAyMiZmbT0xMSZncD0wLmpwZw?x-oss-process=image/format,png)
## 基础操作 (Basic operations)
# --- Basic DataFrame operations on a pipe-delimited CSV ---
from pyspark.sql.types import DateType, TimestampType, IntegerType, FloatType, LongType, DoubleType
from pyspark.sql.types import StructType, StructField

# header='False': first row is data, not column names; inferschema='true'
# makes Spark take an extra pass over the file to guess column types.
df = spark.read.format('com.databricks.spark.csv').options(header='False', inferschema='true', sep='|').load('/tmp/test/mycsv/part-00001-aa0bd811-6469-4102-898c-04448e873de8-c000.csv')
df.count()                          # number of rows (triggers a job)
df.show()                           # print the first 20 rows
df.take(5)                          # first 5 rows as a list of Row objects
df.columns                          # list of column names
df.describe('_c3', '_c6').show()    # summary statistics for two columns

# Keep a subset of the auto-named columns and give them readable names.
df_1 = df.select(['_c2', '_c3', '_c4', '_c5'])
old_names = ['_c2', '_c3', '_c4', '_c5']
new_names = ['servicer_name', 'new_int_rt', 'act_endg_upb', 'loan_age']
for old, new in zip(old_names, new_names):
    df_1 = df_1.withColumnRenamed(old, new)
# NOTE(review): this writes back into the same directory the source file
# came from — confirm that is intended.
df_1.write.format('com.databricks.spark.csv').option("header", "true").save('/tmp/test/mycsv')
# BUG FIX: original line referenced an undefined name `df_lim`; use df_1.
# NOTE(review): 'mths_remng' is not among the columns renamed above —
# confirm df_1 actually carries that column before running this line.
df_1 = df_1.withColumn('loan_length', df_1['loan_age'] + df_1['mths_remng'])
# Per-servicer averages of the three loan metrics.
df_grp = df_1.groupBy('servicer_name').avg('loan_age', 'mths_remng', 'loan_length')
## SQL操作 (SQL operations)
# Run a Spark SQL / HiveQL query through a HiveContext.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
# Standalone-master URL, application name, and 1 GB of memory per executor.
conf = SparkConf().setMaster("spark://127.0.0.1:7077").setAppName("sparkTest").set("spark.executor.memory","1g")
sc = SparkContext(conf=conf)
# NOTE(review): HiveContext is deprecated in Spark 2.x in favour of
# SparkSession.builder.enableHiveSupport() — confirm the target Spark version.
sqlContext = HiveContext(sc)
df = sqlContext.sql("""select vin from test.base_data_estimation limit 10""")
## DataFrame操作 (DataFrame operations)
# Build three small same-schema DataFrames for the union examples below.
from pyspark.sql import Row
from pyspark.sql import DataFrame
from functools import reduce
# Row acts as a template: calling it with values yields a Row with these fields.
row = Row("name", "pet", "count")
df1 = sc.parallelize([row("Sue", "cat", 16), row("Kim", "dog", 1),row("Bob", "fish", 5)]).toDF()
df2 = sc.parallelize([row("Fred", "cat", 2), row("Kate", "ant", 179), row("Marc", "lizard", 5)]).toDF()
df3 = sc.parallelize([row("Sarah", "shark", 3), row("Jason", "kids", 2), row("Scott", "squirrel", 1)]).toDF()
def union_many(*dfs):
    """Union any number of DataFrames into one.

    Columns are matched by position, so every input must share the same
    schema (same column order and types). Duplicate rows are kept.

    Uses DataFrame.union — the non-deprecated Spark 2.x name for the
    behaviour `unionAll` had (unionAll is a deprecated alias of union).
    """
    return reduce(DataFrame.union, dfs)
# Stack the three frames into one (schemas must match positionally).
df_union = union_many(df1, df2, df3)
# Two frames that share only the 'name' column, for the join demos.
row1 = Row("name", "pet", "count")
row2 = Row("name", "pet2", "count2")
df1 = sc.parallelize([row1("Sue", "cat", 16), row1("Kim", "dog", 1), row1("Bob", "fish", 5), row1("Libuse", "horse", 1)]).toDF()
df2 = sc.parallelize([row2("Sue", "eagle", 2), row2("Kim", "ant", 179), row2("Bob", "lizard", 5),row2("Ferdinand", "bees", 23)]).toDF()
# Join on 'name': inner keeps matching names only, outer keeps all names
# from both sides, left keeps every row of df1.
df1.join(df2, 'name', how='inner').show()
df1.join(df2, 'name', how='outer').show()
df1.join(df2, 'name', how='left').show()
# Count rows whose _c12 value is null.
df.where( df['_c12'].isNull() ).count()
def count_nulls(df):
    """Count the null values in every non-string column of a DataFrame.

    Iterates df.dtypes (a list of (column_name, type_name) pairs),
    skips string-typed columns, and runs one count job per remaining
    column.

    Returns a list of (column_name, null_count) tuples.
    """
    return [
        (name, df.where(df[name].isNull()).count())
        for name, dtype in df.dtypes
        if dtype != 'string'
    ]
null_counts = count_nulls(df)
# Drop rows where ALL of the listed columns are null.
df_drops = df.dropna(how='all', subset=['_c4', '_c12', '_c26'])
# Keep rows with at least 2 non-null values among the listed columns.
df_drops2 = df.dropna(thresh=2, subset=['_c4', '_c12', '_c26'])
# Replace nulls in _c12 with 0, then verify none remain (should print 0).
df_fill = df.fillna(0, subset=['_c12'])
df_fill.where( df_fill['_c12'].isNull() ).count()
## Window操作 (Window functions)
# Moving average of 'price' within each (cut, clarity) partition, computed
# over a window spanning 3 rows before to 3 rows after the current row
# (rows ordered by price inside each partition).
from pyspark.sql import Window
from pyspark.sql.functions import mean
from pyspark.sql.functions import when, col
window = Window.partitionBy('cut', 'clarity').orderBy('price').rowsBetween(-3, 3)
moving_avg = mean(df['price']).over(window)
df = df.withColumn('moving_avg', moving_avg)
def replace_null(orig, x):
    """Build a Column expression equal to `orig`, with nulls replaced by `x`.

    Both arguments are pyspark Column expressions; non-null values of
    `orig` pass through unchanged.
    """
    return when(orig.isNotNull(), orig).otherwise(x)
# Impute missing prices with the windowed moving average computed above.
df_new = df.withColumn('imputed',replace_null(col('price'), col('moving_avg')))
# Inspect rows whose original price was null to confirm the imputation.
df_new.where(df['price'].isNull()).show(50)
## Pivoting操作 (Pivoting)
# Pivot examples: spread the 'hq' categories into columns, summing 'jobs'.
from pyspark.sql import Row
row = Row('state', 'industry', 'hq', 'jobs')
df = sc.parallelize([
row('MI', 'auto', 'domestic', 716), row('MI', 'auto', 'foreign', 123),
row('MI', 'auto', 'domestic', 1340), row('MI', 'retail', 'foreign', 12),
row('MI', 'retail', 'foreign', 33), row('OH', 'auto', 'domestic', 349),
row('OH', 'auto', 'foreign', 101), row('OH', 'auto', 'foreign', 77),
row('OH', 'retail', 'domestic', 45), row('OH', 'retail', 'foreign', 12)]).toDF()
# One row per state, with 'domestic'/'foreign' columns holding summed jobs.
# Passing `values` explicitly skips the extra pass Spark would otherwise
# need to discover the distinct pivot categories.
df_pivot1 = df.groupby('state').pivot('hq', values=['domestic', 'foreign']).sum('jobs')
# Finer grouping: one row per (state, industry) pair.
df_pivot = df.groupBy('state', 'industry').pivot('hq', values=['domestic', 'foreign']).sum('jobs')
# Larger pivot example: a 'firm' key and two extra hq categories.
row = Row('state', 'industry', 'hq', 'jobs', 'firm')
df = sc.parallelize([
row('MI', 'auto', 'domestic', 716, 'A'), row('MI', 'auto', 'foreign', 123, 'B'),
row('MI', 'auto', 'domestic', 1340, 'C'),row('MI', 'retail', 'foreign', 12, 'D'),
row('MI', 'retail', 'foreign', 33, 'E'), row('OH', 'retail', 'mixed', 978, 'F'),
row('OH', 'auto', 'domestic', 349, 'G'), row('OH', 'auto', 'foreign', 101, 'H'),
row('OH', 'auto', 'foreign', 77, 'I'), row('OH', 'retail', 'domestic', 45, 'J'),
row('OH', 'retail', 'foreign', 12, 'K'), row('OH', 'retail', 'mixed', 1, 'L'),
row('OH', 'auto', 'other', 120, 'M'), row('OH', 'auto', 'domestic', 96, 'A'),
row('MI', 'auto', 'foreign', 1117, 'A'), row('MI', 'auto', 'foreign', 11, 'B')]).toDF()
# One row per (firm, state, industry); combinations with no rows for a
# given hq category come out as null in that pivoted column.
df_pivot = df.groupBy('firm', 'state', 'industry').pivot('hq', values=['domestic', 'foreign', 'mixed', 'other']).sum('jobs')