Reading and writing DataFrames in PySpark
- 1. Connect to Spark
- 2. Create a DataFrame
- 2.1. From variables (explicit schema)
- 2.2. From variables (type inference)
- 2.3. Read JSON
- 2.4. Read CSV
- 2.5. Read from MySQL
- 2.6. From a pandas DataFrame
- 2.7. Read from columnar Parquet
- 2.8. Read from Hive
- 3. Save data
- 3.1. Write to CSV
- 3.2. Save to Parquet
- 3.3. Write to Hive
- 3.4. Write to HDFS
- 3.5. Write to MySQL
1. Connect to Spark
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('my_first_app_name') \
    .getOrCreate()
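The builder above relies on the defaults for everything except the application name. When experimenting locally it can help to set the master URL and a config or two explicitly; a minimal sketch, assuming a local run (the master URL and partition count are illustrative, not from the original post):

# Illustrative local setup: use all local cores and a small shuffle partition count
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .config("spark.sql.shuffle.partitions", "8") \
    .appName("my_first_app_name") \
    .getOrCreate()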
2. Create a DataFrame
2.1. From variables (explicit schema)
# Generate some comma-separated-style rows as an RDD
stringCSVRDD = spark.sparkContext.parallelize([
    (123, "Katie", 19, "brown"),
    (234, "Michael", 22, "green"),
    (345, "Simone", 23, "blue")
])

# Define the schema explicitly: StructField(name, dataType, nullable)
#   name:     the field name
#   dataType: the field's data type
#   nullable: whether the field may contain nulls
from pyspark.sql.types import StructType, StructField, LongType, StringType

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])

# Apply the schema to the RDD and create the DataFrame
swimmers = spark.createDataFrame(stringCSVRDD, schema)

# Register the DataFrame as a temporary view
# (createOrReplaceTempView replaces the deprecated registerTempTable in Spark 2.x)
swimmers.createOrReplaceTempView("swimmers")

# Count the rows in the DataFrame
swimmers.count()
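With the temporary view in place, the data can be queried either with SQL or with the DataFrame API; a short sketch (the filter condition is just an example):

# Query the temporary view with SQL
spark.sql("select id, name, age from swimmers where age >= 22").show()

# The equivalent DataFrame API call
swimmers.filter(swimmers.age >= 22).select("id", "name", "age").show()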
2.2. From variables (type inference)
# Create a DataFrame with automatic type inference
data = [(123, "Katie", 19, "brown"),
        (234, "Michael", 22, "green"),
        (345, "Simone", 23, "blue")]
df = spark.createDataFrame(data, schema=['id', 'name', 'age', 'eyeColor'])
df.show()
df.count()
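Type inference also works when the rows are pyspark.sql.Row objects instead of plain tuples; a brief sketch:

from pyspark.sql import Row

# Field names come from the Row objects, types are inferred from the values
rows = [Row(id=123, name="Katie", age=19, eyeColor="brown"),
        Row(id=234, name="Michael", age=22, eyeColor="green")]
df_rows = spark.createDataFrame(rows)
df_rows.printSchema()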
2.3. Read JSON
# Read the sample data shipped with Spark
file = r"D:\hadoop_spark\spark-2.1.0-bin-hadoop2.7\examples\src\main\resources\people.json"
df = spark.read.json(file)
df.show()
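spark.read.json expects one JSON object per line, which is what people.json contains. A sketch of two common follow-ups; the multi-line path below is a hypothetical placeholder, and the multiLine option requires Spark 2.2+:

# Inspect the schema Spark inferred from the JSON
df.printSchema()

# Hypothetical file whose records span multiple lines (Spark 2.2+)
df_multi = spark.read.option("multiLine", "true").json(r"D:\path\to\multiline.json")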
2.4. Read CSV
# First create a CSV file with pandas
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.rand(5, 5), columns=['a', 'b', 'c', 'd', 'e']) \
    .applymap(lambda x: int(x * 10))
file = r"D:\hadoop_spark\spark-2.1.0-bin-hadoop2.7\examples\src\main\resources\random.csv"
df.to_csv(file, index=False)

# Then read the CSV file back with Spark
monthlySales = spark.read.csv(file, header=True, inferSchema=True)
monthlySales.show()
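inferSchema needs an extra pass over the file; for larger data the schema can be supplied up front instead. A sketch matching the columns created above:

from pyspark.sql.types import StructType, StructField, IntegerType

# All five columns in random.csv hold small integers
csv_schema = StructType([StructField(c, IntegerType(), True)
                         for c in ['a', 'b', 'c', 'd', 'e']])
monthlySales = spark.read.csv(file, header=True, schema=csv_schema)
monthlySales.show()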
2.5. Read from MySQL
# The MySQL JDBC driver jar has to be placed under spark-2.2.0-bin-hadoop2.7\jars,
# then Spark restarted; this works on a single machine, but on a cluster the jar
# must also be available to every executor
df = spark.read.format('jdbc').options(
    url='jdbc:mysql://127.0.0.1',
    dbtable='mysql.db',
    user='root',
    password='123456'
).load()
df.show()

# A SQL statement, wrapped as a named subquery, can also be passed as dbtable
sql = "(select * from mysql.db where db='wp230') t"
df = spark.read.format('jdbc').options(
    url='jdbc:mysql://127.0.0.1',
    dbtable=sql,
    user='root',
    password='123456'
).load()
df.show()
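A plain JDBC read runs in a single task; the standard partitionColumn / lowerBound / upperBound / numPartitions options split it across tasks for large tables. A sketch in which the partition column and bounds are placeholders to adapt to the real table:

# Parallel JDBC read; 'id' and the bounds are placeholders for a numeric column of the real table
df = spark.read.format('jdbc').options(
    url='jdbc:mysql://127.0.0.1',
    dbtable='mysql.db',
    user='root',
    password='123456',
    partitionColumn='id',
    lowerBound='1',
    upperBound='100000',
    numPartitions='10'
).load()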
2.6. From a pandas DataFrame
# If no schema is given, the pandas column names are used
df = pd.DataFrame(np.random.random((4, 4)))
spark_df = spark.createDataFrame(df, schema=['a', 'b', 'c', 'd'])
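Conversion between pandas and Spark passes all the data through the driver. On Spark 2.3+ (newer than the 2.1/2.2 builds used elsewhere in this post) enabling Arrow typically makes both directions much faster; a sketch:

# Spark 2.3+ only: use Arrow to speed up pandas <-> Spark conversion
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

spark_df = spark.createDataFrame(df)   # pandas -> Spark
pandas_df = spark_df.toPandas()        # Spark -> pandas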
2.7. Read from columnar Parquet
# Read the Parquet file from the Spark examples directory
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\users.parquet"
df = spark.read.parquet(file)
df.show()
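Because Parquet is columnar, selecting only the columns you need lets Spark skip the rest on disk; a brief sketch (name and favorite_color are columns of the bundled users.parquet):

# Only the two selected columns are actually read from the Parquet file
df.select("name", "favorite_color").show()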
2.8. Read from Hive
# If Spark is already configured against Hive, Hive tables can be queried directly with SQL
spark = SparkSession \
    .builder \
    .enableHiveSupport() \
    .master("spark://172.31.100.170:7077") \
    .appName("my_first_app_name") \
    .getOrCreate()

df = spark.sql("select * from hive_tb_name")
df.show()
3. Save data
3.1. Write to CSV
# Create a DataFrame
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.random((4, 4)), columns=['a', 'b', 'c', 'd'])
spark_df = spark.createDataFrame(df)

# Write it out as CSV
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.csv"
spark_df.write.csv(path=file, header=True, sep=",", mode='overwrite')
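write.csv produces a directory of part files, one per partition; coalescing to a single partition first yields one part file, which is only sensible for small data. A sketch:

# Collapse to one partition so the output directory contains a single part file
spark_df.coalesce(1).write.csv(path=file, header=True, sep=",", mode='overwrite')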
3.2. Save to Parquet
# Create a DataFrame
import numpy as np

df = pd.DataFrame(np.random.random((4, 4)), columns=['a', 'b', 'c', 'd'])
spark_df = spark.createDataFrame(df)

# Write it out as Parquet
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.parquet"
spark_df.write.parquet(path=file, mode='overwrite')
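Parquet output can also be partitioned by column value, which later reads can prune on. A sketch; in practice the partition column should be low-cardinality (dates, categories), unlike the random floats generated above:

# One sub-directory per distinct value of column 'a'
spark_df.write.partitionBy('a').parquet(path=file, mode='overwrite')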
3.3. Write to Hive
# Enable dynamic partitioning
spark.sql("set hive.exec.dynamic.partition.mode = nonstrict")
spark.sql("set hive.exec.dynamic.partition=true")

# Insert into a partitioned table with ordinary Hive SQL
spark.sql("""
    insert overwrite table ai.da_aipurchase_dailysale_hive
    partition (saledate)
    select productid, propertyid, processcenterid, saleplatform, sku, poa, salecount, saledate
    from szy_aipurchase_tmp_szy_dailysale distribute by saledate
""")
# Or rewrite the table each run through the DataFrame API; insertInto uses the
# table's existing partition columns, so it must not be combined with partitionBy()
jdbcDF.write.mode("overwrite").insertInto("ai.da_aipurchase_dailysale_hive")
jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_hive", None, "append", partitionBy='saledate')

# Without partitioning: simply save the DataFrame as a Hive table
jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_for_ema_predict", None, "overwrite", None)
3.4. Write to HDFS
# Write the data to HDFS in CSV format
jdbcDF.write.mode("overwrite").options(header="true").csv("/home/ai/da/da_aipurchase_dailysale_for_ema_predict.csv")
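The same writer works for any supported format on HDFS; Parquet is usually a better fit than CSV when the data will be read back by Spark. A sketch with an illustrative output path:

# Write Parquet instead of CSV (the output path is illustrative)
jdbcDF.write.mode("overwrite").parquet("/home/ai/da/da_aipurchase_dailysale_for_ema_predict.parquet")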
3.5. Write to MySQL
# Columns are matched by name, so spark_df does not need to contain every column of the MySQL table
# overwrite: empty the table first, then load the data
spark_df.write.mode("overwrite").format("jdbc").options(
    url='jdbc:mysql://127.0.0.1',
    user='root',
    password='123456',
    dbtable="test.test",
    batchsize="1000",
).save()

# append: add the rows to whatever is already in the table
spark_df.write.mode("append").format("jdbc").options(
    url='jdbc:mysql://127.0.0.1',
    user='root',
    password='123456',
    dbtable="test.test",
    batchsize="1000",
).save()
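Note that mode("overwrite") over JDBC drops and recreates the target table by default, which loses indexes and column definitions on the MySQL side; since Spark 2.1 the truncate option keeps the table and only deletes its rows. A sketch:

# Keep the existing MySQL table definition and just empty it before writing (Spark 2.1+)
spark_df.write.mode("overwrite").format("jdbc").options(
    url='jdbc:mysql://127.0.0.1',
    user='root',
    password='123456',
    dbtable="test.test",
    truncate="true"
).save()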