一.导入
pandas(pd):
import pandas as pd
pyspark.pandas(ps,即 pandas API on Spark,下文简称 pyspark):
import pyspark.pandas as ps
spark(sp):
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
二.语法对比
1.建表
pd:
data = {'水果':['苹果', '香蕉', '草莓'],
'数量':[13,16, 25],
'价格':[10, 7, 8]}
pd_df = pd.DataFrame(data)
print(pd_df)
ps:
ps_df = ps.DataFrame(data)
print(ps_df)
sp:
data_sp = [{'水果':'苹果', '数量':13, '价格':10},
{'水果':'香蕉', '数量':16, '价格':7},
{'水果':'草莓', '数量':25, '价格':8}]
sp_df = spark.createDataFrame(data_sp)
sp_df.show()
注意:建表数据格式不一样哦
2.相互转换
pandas to spark:
sp_df=spark.createDataFrame(pd_df)
pandas to pyspark:
ps_df=ps.from_pandas(pd_df)
spark to pandas:
pd_df = sp_df.toPandas()
spark to pyspark:
ps_df = sp_df.pandas_api()
pyspark to pandas:
pd_df = ps_df.to_pandas()
pyspark to spark:
# pandas-on-Spark converts straight to a Spark DataFrame; there is no need to
# collect everything into pandas on the driver first. As with the pandas
# detour, the index is dropped unless `index_col` is passed.
sp_df = ps_df.to_spark()
3.新增一列
pd、ps:
#增加常数列
df['重量'] = 15
#对其他列值操作得新列
df['重量'] = df['数量'] * 2
spark:
#增加常数列
from pyspark.sql.functions import lit
df= df.withColumn('重量', lit(15))
#对其他列值操作得新列
df= df.withColumn('重量', df['数量'] * 2)
4.将一列变为list
pd、ps:
df['数量'].tolist()
spark:
sp_df[['数量']].collect()
#[Row(数量=13), Row(数量=16), Row(数量=25)]
[i[0] for i in sp_df[['数量']].collect()]
#[13, 16, 25]
5.读文件
ps、pd:
df = pd.read_csv()
df.to_csv()
spark:
df = spark.read.csv()
#里面有参数,如果读出列名有问题可以加一下 header=True, inferSchema='True'
df.write.csv()
6.删除列
pd:
del df['重量']
ps:
# drop() returns a new DataFrame, so the result has to be kept.
df = df.drop(columns=['重量'])
spark:
# Spark DataFrames are immutable: drop() returns a new DataFrame.
df = df.drop('重量')
7.取小数位数round
pd:
df['重量'] = round(df['重量']/2, 2)
ps:
# Parenthesise the division before calling .round(); `2.round(2)` on its own
# is a syntax error. Assign the result back, as in the pandas example.
df['重量'] = (df['重量'] / 2).round(2)
spark:
# F is the conventional alias for the Spark SQL functions module.
from pyspark.sql import functions as F
df = df.withColumn('重量', F.round(df['重量'] / 2, 2))
8.纵向合并df
pd,ps:
# List every frame to stack; extend the list with as many as needed.
df_list = [df, df1, df2]
# pandas shown here; for pandas-on-Spark frames call ps.concat(df_list).
df = pd.concat(df_list)
spark:
from functools import reduce
from pyspark.sql import DataFrame
def unionAll(*dfs):
    """Stack the given Spark DataFrames vertically and return the result.

    The frames are folded left to right with ``DataFrame.unionAll``, which
    matches columns by position rather than by name, so every frame should
    share the same column order. At least one frame must be supplied:
    ``reduce`` raises ``TypeError`` on an empty sequence with no initial value.
    """
    return reduce(DataFrame.unionAll, dfs)
# List every DataFrame to stack; extend the list with as many as needed.
df_list = [df, df1, df2]
df = unionAll(*df_list)
9.重命名列名
pd、ps:
# Rename selected columns. rename() returns a new DataFrame, so keep the result.
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df = df.rename(columns={"A": "a", "B": "c"})
# Rename every column at once by assigning the complete list of names.
df.columns = ["a", "c"]
spark:
#单个列
df = df.withColumnRenamed(old, new)
#同时命名多列
columns={"A": "a", "B": "c"}
def rename_columns(df, columns):
    """Rename several columns of a Spark DataFrame and return the result.

    ``columns`` maps each existing column name to its new name. The renames
    are applied one at a time, in the mapping's iteration order, through
    ``withColumnRenamed``; an empty mapping returns ``df`` unchanged.
    """
    for old, new in columns.items():
        # Each call yields a new DataFrame, so rebind and carry it forward.
        df = df.withColumnRenamed(old, new)
    return df
df = rename_columns(df, columns)
未完待续,随时更新...