知识点:
1、使用pyspark读取csv:spark.read.format("csv").load('/user/data.csv',header=True, inferSchema="true")
2、dataframe补充空值:fillna()
3、dataframe字段表示方式:"APP_HOBY_CASH_LOAN"或df.APP_HOBY_CASH_LOAN或data_df["APP_HOBY_CASH_LOAN"]
pyspark dataframe使用cast(astype为cast的别名)实现dataframe字段类型转换
-
# 两种读取csv文件的方式
-
# Two ways to read a CSV file into a pyspark DataFrame
# (assumes a SparkSession named `spark` already exists).
data_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load("/user/data.csv")

data_df = spark.read.format("csv").load('/user/data.csv', header=True, inferSchema="true")

# Replace null values with 0 across all columns.
data_df = data_df.fillna(0)

# Print the table structure (the first form is the fastest).
print(data_df.columns)

data_df.printSchema()

# NOTE: DataFrame.schema is a property in pyspark, not a method —
# calling it as `schema()` would raise a TypeError.
print(data_df.schema)

# Type conversion, first way: cast with a type name string.
data_df = data_df.withColumn("APP_HOBY_CASH_LOAN", data_df.APP_HOBY_CASH_LOAN.cast('float'))

# Type conversion, second way: cast with a DataType instance.
from pyspark.sql.types import IntegerType

data_df = data_df.withColumn("APP_HOBY_CASH_LOAN", data_df["APP_HOBY_CASH_LOAN"].cast(IntegerType()))
pandas dataframe使用astype实现dataframe字段类型转换
-
# -*- coding: UTF-8 -*-
# pandas: convert a DataFrame column's dtype with astype.

import pandas as pd

df = pd.DataFrame([{'col1': 'a', 'col2': '1'}, {'col1': 'b', 'col2': '2'}])

# col2 starts as object (string) dtype.
print(df.dtypes)

# str -> int
df['col2'] = df['col2'].astype('int')

print('-----------')
print(df.dtypes)

# int -> float64
df['col2'] = df['col2'].astype('float64')

print('-----------')
print(df.dtypes)
pandas和pyspark的dataframe互转
-
#!/usr/bin/python
# -*- coding: utf-8 -*-

from pyspark import SparkConf
from pyspark.sql import SparkSession
import pandas as pd

# --- pyspark DataFrame -> pandas DataFrame ---

# Build the Spark configuration: target YARN queue and application name.
conf = SparkConf()
conf.set("spark.yarn.queue", "root").set("spark.app.name", "lbs_coordinate")

# Create (or reuse) a Hive-enabled SparkSession from that configuration.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

# Pull a Hive table into a pyspark DataFrame, then materialize it locally.
pyspark_df = spark.sql("select * from tmp.test_table")
pandas_pd = pyspark_df.toPandas()

# --- pandas DataFrame -> pyspark DataFrame ---

pandas_df = pd.DataFrame([{'col1': 'a', 'col2': '1'}, {'col1': 'b', 'col2': '2'}])
pyspark_df = spark.createDataFrame(pandas_df)