import datetime
import os
import re
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col, count, avg, round
from pyspark.sql.types import Row, StructField, StructType, StringType, DoubleType, IntegerType, LongType
Creating an RDD (RDD operators)
conf = SparkConf().setAppName("wordcount").setMaster("yarn")
sc = SparkContext(conf=conf)
rdd1 = sc.textFile(input_path, 2)  # input_path: placeholder for the source directory; 2 = minimum number of partitions
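A minimal sketch of chaining RDD operators on rdd1 (a word count, matching the app name above; it assumes space-separated text input):
wc_rdd = rdd1.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)  # sum the per-word counts
print(wc_rdd.collect())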
# Second approach: mostly used when an RDD is converted to a DataFrame after processing
spark = SparkSession.builder\
.appName("wordcount_sparksql")\
.master("local[2]")\
.config("spark.sql.shuffle.partitions",2)\
.getOrCreate()
rdd1 = spark.sparkContext.textFile("/spark/spark_sql/u.data")
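A hedged sketch of the RDD-to-DataFrame conversion this approach is meant for; it assumes u.data is tab-separated (user id, movie id, rating, timestamp — the usual MovieLens layout):
rating_rdd = rdd1.map(lambda line: line.split("\t")) \
    .map(lambda x: (x[0], x[1], int(x[2])))  # keep user_id, movie_id, rating
rating_df = rating_rdd.toDF(["user_id", "movie_id", "rating"])
rating_df.show()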
Creating a SparkSession and a DataFrame
# Create a SparkSession
spark = SparkSession.builder.appName(app_name).master(master).config(key, value).getOrCreate()
# Read a MySQL table over JDBC
url = "jdbc:mysql://node1.itcast.cn:3306/db_company?serverTimezone=UTC&characterEncoding=utf8&useUnicode=true"
table_info = "db_company.emp"
prop = {'user': 'root', 'password': '123456', 'driver': 'com.mysql.jdbc.Driver'}
input_df = spark.read.jdbc(url=url, table=table_info, properties=prop)
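The same read can also be expressed with the generic format("jdbc") API, reusing url/table_info/prop from above:
input_df_jdbc = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", table_info) \
    .option("user", prop["user"]) \
    .option("password", prop["password"]) \
    .option("driver", prop["driver"]) \
    .load()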
# First way to read
input_df1 = spark.read.text(path)  # path: placeholder for the input directory
# Second way to read
input_df2 = spark.read.format("json").load(path)  # in yarn mode the default filesystem is HDFS
# Third way to read
input_df3 = spark.read.load(path="", format="parquet")
CSV files need special handling: the schema (header) must be specified
# First way: a programmatic StructType
my_schema = StructType([StructField("name", StringType(), True), StructField("age", IntegerType(), True)])
# Second way: a DDL-style string
my_schema = "name string, age int"
input_df4 = spark.read.csv(path="", schema=my_schema)
input_df4.printSchema()
input_df4.show()
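If the CSV file carries its own header row, Spark can infer the column names (and, with inferSchema, the types) instead of taking an explicit schema; the path here is a hypothetical placeholder:
input_df5 = spark.read.csv(path="/spark/spark_sql/people.csv", sep=",", header=True, inferSchema=True)
input_df5.printSchema()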
Reading Hive tables
spark = SparkSession \
.builder \
.appName("SparkSQLAppName") \
.master("local[2]") \
.config("spark.sql.shuffle.partitions", 2) \
.config("spark.sql.warehouse.dir",
'hdfs://node1.itcast.cn:8020/user/hive/warehouse')\
.config("hive.metastore.uris", "thrift://node1.itcast.cn:9083")\
.enableHiveSupport()\
.getOrCreate()
With Hive support enabled, Hive tables can be queried with SQL directly, no registration needed; any other DataFrame must first be registered as a temp view, as follows:
# Non-Hive DataFrames must be registered before spark.sql can see them
dataFrame1.createOrReplaceTempView("t1")
# Hive tables can be queried directly by their qualified name
dataFrame2 = spark.sql("""
    select deptno, round(avg(sal), 2) as sal_avg from db_hive.emp group by deptno order by sal_avg desc
""")
Writing with Spark (save modes)
dataFrame3.write\
.mode("overwrite")\
.format("csv")\
.option("sep", "\t")\
.save("/spark/spark_sql/output")
--append: append mode; if data already exists, new rows are appended
--overwrite: overwrite mode; if data already exists, it is replaced by the current data
--error/errorifexists: raise an error if the target already exists (the default mode)
--ignore: do nothing if the target already exists
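A minimal sketch of applying a save mode when writing back to MySQL, reusing url/prop from the JDBC read section (the target table name is hypothetical):
dataFrame2.write \
    .mode("append") \
    .jdbc(url=url, table="db_company.emp_stats", properties=prop)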