Spark reading data sources
Spark reading a HiveServer2 data source
Environment
- HiveServer2 address
- Username
- Password
Code
def main(args: Array[String]): Unit = {
  val Array(url, database, table, username, password) = args
  val sparkConf = new SparkConf().setAppName("Spark HiveServer2 Demo (Scala)")
  val spark: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
  // read the Hive table over JDBC through HiveServer2, then filter on the partition column
  val rowDataset: Dataset[Row] = spark.read
    .format("jdbc")
    .option("url", url)
    .option("dbtable", s"$database.$table")
    .option("user", username)
    .option("password", password)
    .option("driver", "org.apache.hive.jdbc.HiveDriver")
    .load()
    .filter("`table_name.day`='20210112'")
  rowDataset.show()
  spark.stop()
}
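An alternative is to push the filter into the query that HiveServer2 itself executes, by passing a subquery as the dbtable option. The sketch below is a variant of the read inside main above, not the original code: the alias t is arbitrary, and it assumes HiveServer2 accepts the wrapper query Spark generates around the subquery.
val pushedDown: Dataset[Row] = spark.read
  .format("jdbc")
  .option("url", url)
  // the filter now runs inside HiveServer2 instead of in Spark
  .option("dbtable", s"(SELECT * FROM $database.$table WHERE `table_name.day` = '20210112') t")
  .option("user", username)
  .option("password", password)
  .option("driver", "org.apache.hive.jdbc.HiveDriver")
  .load()
pushedDown.show()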
Spark reading MySQL data
Environment
- MySQL address
- Username
- Password
- Database table
Code
def main(args: Array[String]): Unit = {
  val Array(url, username, password, table) = args
  val sparkConf = new SparkConf().setAppName("Spark Mysql Demo (Scala)")
  val spark: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
  val props = new Properties()
  props.setProperty("user", username)
  props.setProperty("password", password)
  val df: DataFrame = spark.read.jdbc(url, table, props)
  val rowNumbers: Long = df.count()
  println("Total row count: " + rowNumbers)
  // the columns passed to select() are the MySQL table's columns
  df.select("id").where("id >= 3").show()
  // write data back to MySQL
  // df.write.mode(SaveMode.Append).jdbc(url, "tb_02", props)
  spark.stop()
}
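The read above uses a single JDBC connection. For larger tables the read can be split across several partitions on a numeric column. This is a sketch reusing the variables from main above; the column name id, the bounds, and the partition count are illustrative assumptions, not values from the original example.
// parallel JDBC read: 4 partitions, range-partitioned on the numeric column "id"
val parallelDf: DataFrame = spark.read.jdbc(
  url, table,
  columnName = "id", lowerBound = 1L, upperBound = 100000L, numPartitions = 4,
  connectionProperties = props)
println("Partitions: " + parallelDf.rdd.getNumPartitions)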
Spark reading Kafka
Environment
- Broker addresses
- Topics
- Batch interval (seconds)
- Consumer group ID
Code
def main(args: Array[String]): Unit = {
  val Array(brokers, topics, interval, groupId) = args
  val sparkConf = new SparkConf().setAppName("Spark Kafka Demo (Scala)")
  val ssc = new StreamingContext(sparkConf, Seconds(interval.toInt))
  // Kafka consumer parameters
  val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> brokers,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> groupId,
    "auto.offset.reset" -> "earliest",
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )
  // create the direct stream; topics may be a comma-separated list
  val messages = KafkaUtils.createDirectStream[String, String](
    ssc,
    LocationStrategies.PreferConsistent,
    ConsumerStrategies.Subscribe[String, String](topics.split(","), kafkaParams)
  )
  // word count over the message values
  val lines = messages.map(_.value)
  val words = lines.flatMap(_.split(" "))
  val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
  wordCounts.print()
  // Start the computation
  ssc.start()
  ssc.awaitTermination()
}
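Because enable.auto.commit is false, the example never commits offsets, so with auto.offset.reset set to earliest the group re-reads from the beginning on every run. A common pattern is to commit offsets back to Kafka after each batch. The sketch below would sit before ssc.start() in main and assumes HasOffsetRanges and CanCommitOffsets are imported from org.apache.spark.streaming.kafka010; it is not part of the original example.
messages.foreachRDD { rdd =>
  // capture this batch's offset ranges before any shuffling transformation
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // asynchronously commit the consumed offsets back to Kafka
  messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}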
Spark reading HDFS data
Environment
- Source path
- Number of read partitions
- Destination path
Code
def main(args: Array[String]): Unit = {
  val Array(src, partition, dest) = args
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark HDFS Demo (Scala)")
  // 1. create the session
  val session: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
  // 2. create an RDD via the SparkContext
  val sc: SparkContext = session.sparkContext
  val file: RDD[String] = sc.textFile(src, partition.toInt)
  file.saveAsTextFile(dest)
  session.stop()
}
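If the goal is to merge the copied data into fewer output files rather than keep one file per read partition, the RDD can be coalesced before the write. A small sketch replacing the saveAsTextFile call above; the target of a single output file is an illustrative choice.
// write a single output file instead of one file per partition
file.coalesce(1).saveAsTextFile(dest)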
Spark reading an HBase data source
Environment
- ZooKeeper quorum address
- HBase rootDir configured in ZooKeeper
- HBase master address
- Table name
Code
def main(args: Array[String]): Unit = {
  val Array(zookeeper, rootdir, master, table) = args
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark HBase Demo (Scala)")
  // create the SparkSession
  val spark: SparkSession = SparkSession.builder()
    .config(sparkConf)
    .getOrCreate()
  val hbaseConfig: Configuration = HBaseConfiguration.create()
  hbaseConfig.set("hbase.zookeeper.quorum", zookeeper)
  hbaseConfig.set("hbase.rootdir", rootdir)
  hbaseConfig.set("hbase.master", master)
  // set the table to scan
  hbaseConfig.set(TableInputFormat.INPUT_TABLE, table)
  val stuRDD: RDD[(ImmutableBytesWritable, Result)] = spark.sparkContext.newAPIHadoopRDD(
    hbaseConfig,
    classOf[TableInputFormat],
    classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
    classOf[org.apache.hadoop.hbase.client.Result])
  val count: Long = stuRDD.count()
  println("Students RDD Count: " + count)
  stuRDD.cache()
  // iterate over the results and print each row key
  stuRDD.foreach { case (_, result) =>
    val key: String = Bytes.toString(result.getRow)
    println("Row key: " + key)
  }
  spark.stop()
}
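The loop above only prints row keys. Individual column values can be read from each Result with getValue. The sketch below reuses stuRDD from main; the column family info and qualifier name are illustrative assumptions, not columns of the actual table.
stuRDD.foreach { case (_, result) =>
  // hypothetical column family "info" and qualifier "name"
  val name: String = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name")))
  println("info:name = " + name)
}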
Spark reading an Elasticsearch data source
Environment
- ES username
- ES password
- ES service address
- ES cluster name (cluster-level permissions are disabled on this cluster, so the name must be specified explicitly)
- ES index
Code
def main(args: Array[String]): Unit = {
  val Array(user, password, esIp, clusterName, index) = args
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark Es Demo (Scala)").setMaster("local[*]")
  sparkConf.set("cluster.name", clusterName)
  sparkConf.set("es.internal.es.cluster.name", clusterName)
  sparkConf.set("es.internal.es.version", "7.12") // avoids security_exception: action [cluster:monitor/main] is unauthorized
  sparkConf.set("es.index.auto.create", "true")
  sparkConf.set("es.nodes", esIp)
  sparkConf.set("es.port", "9200")
  sparkConf.set("es.mapping.date.rich", "false")
  sparkConf.set("es.index.read.missing.as.empty", "true")
  sparkConf.set("es.net.http.auth.user", user) // ES username
  sparkConf.set("es.net.http.auth.pass", password) // ES password
  sparkConf.set("es.nodes.wan.only", "true")
  sparkConf.set("es.index.read.allow.red.status", "true") // avoids security_exception: action [cluster:monitor/health] is unauthorized
  val sc = new SparkContext(sparkConf)
  write2Es(sc, index)
  read2Es(sc, index)
  sc.stop()
}
def write2Es(sc: SparkContext, index: String): Unit = {
  val numbers: Map[String, String] = Map(
    "jsIp" -> "11111",
    "address" -> "11111", "enterprise" -> "北京",
    "xian" -> "11111", "ip" -> "11111",
    "source" -> "11111", "sheng" -> "11111",
    "phone" -> "11111", "shi" -> "11111",
    "ipLong" -> "333", "time" -> "2022-12-27 09:56:50",
    "qsIp" -> "11111", "contacts" -> "11111",
    "email" -> "11111@163.com")
  val rdd: RDD[Map[String, Any]] = sc.makeRDD(Seq(numbers))
  EsSpark.saveToEs(rdd, s"${index}/_doc")
  println("--------------------End-----------------")
}
def read2Es(sc: SparkContext, index: String): Unit = {
  val rdd: RDD[(String, collection.Map[String, AnyRef])] = EsSpark.esRDD(sc, s"${index}/_doc")
  println("------------------rdd.count():" + rdd.count())
  rdd.foreach { case (key, value) =>
    println("------------------key:" + key)
    for ((key1, value1) <- value) {
      println("------------------key1:" + key1)
      println("------------------value1:" + value1)
    }
  }
}
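EsSpark.esRDD also accepts a query string, so the filter is evaluated on the Elasticsearch side instead of pulling the whole index into Spark first. A small sketch in the same style as read2Es; the field sheng and its value are illustrative assumptions taken from the sample document above.
def readWithQuery(sc: SparkContext, index: String): Unit = {
  // the query is pushed to Elasticsearch; only matching documents are returned
  val rdd: RDD[(String, collection.Map[String, AnyRef])] = EsSpark.esRDD(sc, s"${index}/_doc", "?q=sheng:11111")
  println("------------------filtered rdd.count():" + rdd.count())
}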
Maven dependencies
<properties>
<scala.version>2.12</scala.version>
<spark.version>3.2.1</spark.version>
<hadoop.version>3.3.1</hadoop.version>
<jackson.version>2.12.3</jackson.version>
<s3.version>1.12.77</s3.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>${s3.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-aws</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>7.10.2</version>
<exclusions>
<exclusion>
<groupId>org.apache.spark</groupId>
<artifactId>spark-*</artifactId>
</exclusion>
</exclusions>
</dependency>
<!--<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-spark-30_${scala.version}</artifactId>
<version>7.12.0</version>
<exclusions>
<exclusion>
<groupId>org.apache.spark</groupId>
<artifactId>spark-*</artifactId>
</exclusion>
</exclusions>
</dependency>-->
<dependency>
<groupId>org.apache.hbase.connectors.spark</groupId>
<artifactId>hbase-spark</artifactId>
<version>1.0.0</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.spark</groupId>
<artifactId>spark-*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.thoughtworks.paranamer</groupId>
<artifactId>paranamer</artifactId>
<version>2.8</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.2</version>
</dependency>
</dependencies>
I hope this article was helpful. Feel free to follow, comment, and bookmark. Thank you for reading.