![pyspark](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9zczIuYmRzdGF0aWMuY29tLzcwY0Z2blNoX1ExWW54R2twb1dLMUhGNmhoeS9pdC91PTQyMzY3NTY4MjYsNDIyNjUzNjAyMiZmbT0xMSZncD0wLmpwZw?x-oss-process=image/format,png)
## 基础操作 (Basic operations)
# --- Basic DataFrame operations on a pipe-delimited CSV ---
from pyspark.sql.types import DateType, TimestampType, IntegerType, FloatType, LongType, DoubleType
from pyspark.sql.types import StructType, StructField

# header='False': first row is data, not column names; inferschema='true'
# makes Spark take an extra pass over the file to guess column types.
df = spark.read.format('com.databricks.spark.csv').options(header='False', inferschema='true', sep='|').load('/tmp/test/mycsv/part-00001-aa0bd811-6469-4102-898c-04448e873de8-c000.csv')
df.count()                          # number of rows (triggers a job)
df.show()                           # print the first 20 rows
df.take(5)                          # first 5 rows as a list of Row objects
df.columns                          # list of column names
df.describe('_c3', '_c6').show()    # summary statistics for two columns

# Keep a subset of the auto-named columns and give them readable names.
df_1 = df.select(['_c2', '_c3', '_c4', '_c5'])
old_names = ['_c2', '_c3', '_c4', '_c5']
new_names = ['servicer_name', 'new_int_rt', 'act_endg_upb', 'loan_age']
for old, new in zip(old_names, new_names):
    df_1 = df_1.withColumnRenamed(old, new)
# NOTE(review): this writes back into the same directory the source file
# came from — confirm that is intended.
df_1.write.format('com.databricks.spark.csv').option("header", "true").save('/tmp/test/mycsv')
# BUG FIX: original line referenced an undefined name `df_lim`; use df_1.
# NOTE(review): 'mths_remng' is not among the columns renamed above —
# confirm df_1 actually carries that column before running this line.
df_1 = df_1.withColumn('loan_length', df_1['loan_age'] + df_1['mths_remng'])
# Per-servicer averages of the three loan metrics.
df_grp = df_1.groupBy('servicer_name').avg('loan_age', 'mths_remng', 'loan_length')
## SQL操作 (SQL operations)
# Run a Spark SQL / HiveQL query through a HiveContext.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
# Standalone-master URL, application name, and 1 GB of memory per executor.
conf = SparkConf().setMaster("spark://127.0.0.1:7077").setAppName("sparkTest").set("spark.executor.memory","1g")
sc = SparkContext(conf=conf)
# NOTE(review): HiveContext is deprecated in Spark 2.x in favour of
# SparkSession.builder.enableHiveSupport() — confirm the target Spark version.
sqlContext = HiveContext(sc)
df = sqlContext.sql("""select vin from test.base_data_estimation limit 10""")
## DataFrame操作 (DataFrame operations)
# Build three small same-schema DataFrames for the union examples below.
from pyspark.sql import Row
from pyspark.sql import DataFrame
from functools import reduce
# Row acts as a template: calling it with values yields a Row with these fields.
row = Row("name", "pet", "count")
df1 = sc.parallelize([row("Sue", "cat", 16), row("Kim", "dog", 1),row("Bob", "fish", 5)]).toDF()
df2 = sc.parallelize([row("Fred", "cat", 2), row("Kate", "ant", 179), row("Marc", "lizard", 5)]).toDF()
df3 = sc.parallelize([row("Sarah", "shark", 3), row("Jason", "kids", 2), row("Scott", "squirrel", 1)]).toDF()
def union_many(*dfs):
    """Union any number of DataFrames into one.

    Columns are matched by position, so every input must share the same
    schema (same column order and types). Duplicate rows are kept.

    Uses DataFrame.union — the non-deprecated Spark 2.x name for the
    behaviour `unionAll` had (unionAll is a deprecated alias of union).
    """
    return reduce(DataFrame.union, dfs)
# Stack the three frames into one (schemas must match positionally).
df_union = union_many(df1, df2, df3)
# Two frames that share only the 'name' column, for the join demos.
row1 = Row("name", "pet", "count")
row2 = Row("name", "pet2", "count2")
df1 = sc.parallelize([row1("Sue", "cat", 16), row1("Kim", "dog", 1), row1("Bob", "fish", 5), row1("Libuse", "horse", 1)]).toDF()
df2 = sc.parallelize([row2("Sue", "eagle", 2), row2("Kim", "ant", 179), row2("Bob", "lizard", 5),row2("Ferdinand", "bees", 23)]).toDF()
# Join on 'name': inner keeps matching names only, outer keeps all names
# from both sides, left keeps every row of df1.
df1.join(df2, 'name', how='inner').show()
df1.join(df2, 'name', how='outer').show()
df1.join(df2, 'name', how='left').show()
# Count rows whose _c12 value is null.
df.where( df['_c12'].isNull() ).count()
def count_nulls(df):
    """Count the null values in every non-string column of a DataFrame.

    Iterates df.dtypes (a list of (column_name, type_name) pairs),
    skips string-typed columns, and runs one count job per remaining
    column.

    Returns a list of (column_name, null_count) tuples.
    """
    return [
        (name, df.where(df[name].isNull()).count())
        for name, dtype in df.dtypes
        if dtype != 'string'
    ]
null_counts = count_nulls(df)
# Drop rows where ALL of the listed columns are null.
df_drops = df.dropna(how='all', subset=['_c4', '_c12', '_c26'])
# Keep rows with at least 2 non-null values among the listed columns.
df_drops2 = df.dropna(thresh=2, subset=['_c4', '_c12', '_c26'])
# Replace nulls in _c12 with 0, then verify none remain (should print 0).
df_fill = df.fillna(0, subset=['_c12'])
df_fill.where( df_fill['_c12'].isNull() ).count()
## Window操作 (Window functions)
# Moving average of 'price' within each (cut, clarity) partition, computed
# over a window spanning 3 rows before to 3 rows after the current row
# (rows ordered by price inside each partition).
from pyspark.sql import Window
from pyspark.sql.functions import mean
from pyspark.sql.functions import when, col
window = Window.partitionBy('cut', 'clarity').orderBy('price').rowsBetween(-3, 3)
moving_avg = mean(df['price']).over(window)
df = df.withColumn('moving_avg', moving_avg)
def replace_null(orig, x):
    """Build a Column expression equal to `orig`, with nulls replaced by `x`.

    Both arguments are pyspark Column expressions; non-null values of
    `orig` pass through unchanged.
    """
    return when(orig.isNotNull(), orig).otherwise(x)
# Impute missing prices with the windowed moving average computed above.
df_new = df.withColumn('imputed',replace_null(col('price'), col('moving_avg')))
# Inspect rows whose original price was null to confirm the imputation.
df_new.where(df['price'].isNull()).show(50)
## Pivoting操作 (Pivoting)
# Pivot examples: spread the 'hq' categories into columns, summing 'jobs'.
from pyspark.sql import Row
row = Row('state', 'industry', 'hq', 'jobs')
df = sc.parallelize([
row('MI', 'auto', 'domestic', 716), row('MI', 'auto', 'foreign', 123),
row('MI', 'auto', 'domestic', 1340), row('MI', 'retail', 'foreign', 12),
row('MI', 'retail', 'foreign', 33), row('OH', 'auto', 'domestic', 349),
row('OH', 'auto', 'foreign', 101), row('OH', 'auto', 'foreign', 77),
row('OH', 'retail', 'domestic', 45), row('OH', 'retail', 'foreign', 12)]).toDF()
# One row per state, with 'domestic'/'foreign' columns holding summed jobs.
# Passing `values` explicitly skips the extra pass Spark would otherwise
# need to discover the distinct pivot categories.
df_pivot1 = df.groupby('state').pivot('hq', values=['domestic', 'foreign']).sum('jobs')
# Finer grouping: one row per (state, industry) pair.
df_pivot = df.groupBy('state', 'industry').pivot('hq', values=['domestic', 'foreign']).sum('jobs')
# Larger pivot example: a 'firm' key and two extra hq categories.
row = Row('state', 'industry', 'hq', 'jobs', 'firm')
df = sc.parallelize([
row('MI', 'auto', 'domestic', 716, 'A'), row('MI', 'auto', 'foreign', 123, 'B'),
row('MI', 'auto', 'domestic', 1340, 'C'),row('MI', 'retail', 'foreign', 12, 'D'),
row('MI', 'retail', 'foreign', 33, 'E'), row('OH', 'retail', 'mixed', 978, 'F'),
row('OH', 'auto', 'domestic', 349, 'G'), row('OH', 'auto', 'foreign', 101, 'H'),
row('OH', 'auto', 'foreign', 77, 'I'), row('OH', 'retail', 'domestic', 45, 'J'),
row('OH', 'retail', 'foreign', 12, 'K'), row('OH', 'retail', 'mixed', 1, 'L'),
row('OH', 'auto', 'other', 120, 'M'), row('OH', 'auto', 'domestic', 96, 'A'),
row('MI', 'auto', 'foreign', 1117, 'A'), row('MI', 'auto', 'foreign', 11, 'B')]).toDF()
# One row per (firm, state, industry); combinations with no rows for a
# given hq category come out as null in that pivoted column.
df_pivot = df.groupBy('firm', 'state', 'industry').pivot('hq', values=['domestic', 'foreign', 'mixed', 'other']).sum('jobs')