Setting up the Spark environment
Part 1: Create the virtual machine
Set a static IP address on the Linux NIC:
vi /etc/sysconfig/network-scripts/ifcfg-ens33
BOOTPROTO=static
ONBOOT=yes
IPADDR=20.0.0.100
NETMASK=255.255.255.0
GATEWAY=20.0.0.2
DNS1=8.8.8.8
Restart the network service:
systemctl restart network
Verify connectivity:
ping www.baidu.com
Part 2: Set up the base Spark environment
Upload the Spark package downloaded from the Spark website to the /opt directory on the VM, then change into /opt and extract it:
tar -zxvf spark-3.0.3-bin-hadoop3.2.tgz
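The following cluster sync script (saved, for example, under /root/bin) reads the node list from /root/bin/works and uses rsync to copy a given file or directory to the same path on every other node, so the extracted packages and configuration files can be distributed across the cluster: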
#!/bin/bash
# 1. Get the number of arguments; exit immediately if it is 0
paramnum=$#
echo "paramnum:$paramnum"
if ((paramnum==0)); then
echo no params;
exit;
fi
# 2. Get the file name from the argument
p1=$1
file_name=`basename $p1`
echo fname=$file_name
# 3. Get the absolute path of the input
pdir=`cd -P $(dirname $p1); pwd`
echo pdir=$pdir
# 4. Get the current user name
user=`whoami`
# 5. Run rsync against every node
current=`hostname`
nodes=$(cat /root/bin/works)
for host in $nodes; do
echo ==================$host==================
if [ "$host" != "$current" ];then
rsync -rvl $pdir/$file_name $user@$host:$pdir
fi
done
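Next, the Hadoop configuration properties below are edited and then distributed to every node with the sync script above; judging by the property names they span core-site.xml, hdfs-site.xml, mapred-site.xml and yarn-site.xml: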
<property>
<name>fs.defaultFS</name>
<value>hdfs://hadoop01:8020</value>
</property>
<!-- Directory where Hadoop stores its local data -->
<property>
<name>hadoop.tmp.dir</name>
<value>/bigdata/data/hadoop</value>
</property>
<!-- Static user identity for the HDFS web UI -->
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<!-- Proxy-user settings for Hive integration -->
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
<!-- Host running the SecondaryNameNode (SNN) process -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoop02:50090</value>
</property>
<!-- Default MapReduce execution mode: yarn (cluster mode) or local -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<!-- Host running the YARN ResourceManager (cluster master role) -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop01</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Whether to enforce physical memory limits on containers -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<!-- Whether to enforce virtual memory limits on containers -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
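<!-- Enable log aggregation -->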
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- YARN log server (job history) URL -->
<property>
<name>yarn.log.server.url</name>
<value>http://hadoop02:19888/jobhistory/logs</value>
</property>
<!-- How long aggregated logs are kept: 7 days -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop02:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop02:19888</value>
</property>
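Once HDFS and YARN are running, the cluster can be verified with the bundled wordcount example (this assumes an /input directory with some text files already exists in HDFS):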
hadoop jar /bigdata/server/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.3.jar wordcount /input /output
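The Hive UDFs used for data cleaning follow. UrlHandlerUdf is a GenericUDF that normalizes a URL string: it strips everything from the first '?' onward and rewrites an https:// prefix to http://.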
package cn.wolfcode.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import java.nio.charset.StandardCharsets;
public class UrlHandlerUdf extends GenericUDF {
@Override // Initialization: validate the arguments
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws
UDFArgumentException {
// Check the number of arguments
if(objectInspectors.length != 1){
throw new UDFArgumentLengthException("Exactly one argument is expected!");
}
// Check the argument type
if(!objectInspectors[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)){
throw new UDFArgumentTypeException(0,"The argument type is incorrect!");
}
// Use a writable (Text) object inspector, matching the Text values produced by evaluate()
return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
}
@Override // Process one row and return the cleaned URL
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
if(deferredObjects[0].get() == null){
return "" ;
}
String data = deferredObjects[0].get().toString();
int index = data.indexOf("?");
if(index > 0 ){
data = data.substring(0,index);
}
if (data.startsWith("https://")){
data=data.replaceFirst("https://","http://");
}
return new Text(data.getBytes(StandardCharsets.UTF_8));
}
@Override
public String getDisplayString(String[] strings) {
return "";
}
}
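The second UDF, Ip2Loc, maps an IP address to a location string. In this simplified version it loads the city/province set from Redis once in a static block and then returns a random entry, so the Redis instance at 20.0.0.100:6379 must be reachable when the class is loaded.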
package cn.wolfcode.udf;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import redis.clients.jedis.Jedis;
import org.apache.hadoop.io.Text;
import java.nio.charset.StandardCharsets;
import java.util.*;
public class Ip2Loc extends GenericUDF {
public static List<Map<String,String>> mapList = new ArrayList<>();
static {
String host = "20.0.0.100";
int port = 6379;
Jedis jedis = new Jedis(host, port);
Set<String> areas = jedis.smembers("areas");
for (String area : areas) {
JSONObject jsonObject = JSON.parseObject(area);
Map<String,String> map = new HashMap<>();
map.put("city",jsonObject.getString("city"));
map.put("province",jsonObject.getString("province"));
mapList.add(map);
}
// Write the map data to a file (not implemented here)
jedis.close();
}
// Initialization: validate the arguments
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws
UDFArgumentException {
// Check the number of arguments
if(objectInspectors.length != 1){
throw new UDFArgumentLengthException("Exactly one argument is expected!");
}
// Check the argument type
if(!objectInspectors[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)){
throw new UDFArgumentTypeException(0,"The argument type is incorrect!");
}
// Use a writable (Text) object inspector, matching the Text values produced by evaluate()
return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
}
// Return a location string such as 广东省_广州市
@Override
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
if(deferredObjects[0].get() == null){
return "" ;
}
int index = (int) (Math.random() * mapList.size());
Text new_str = new Text((mapList.get(index).get("city")+"_"+
(mapList.get(index).get("province"))).getBytes(StandardCharsets.UTF_8));
return new_str;
}
@Override
public String getDisplayString(String[] strings) {
return "";
}
// Local smoke test for the data loaded from Redis
public static void main2(String[] args) throws Exception{
int index = (int)(Math.random() * mapList.size());
System.out.println(mapList.get(index).get("city") + "_" + mapList.get(index).get("province"));
}
}
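After packaging the two UDF classes into a jar, they can be registered as Hive functions. The sketch below is only an illustration under assumptions not fixed by the project: the jar path /opt/udf/game-udf.jar, the function names url_clean and ip2loc, and the helper object RegisterUdfs are all hypothetical. The same ADD JAR / CREATE TEMPORARY FUNCTION statements can also be issued from the Hive CLI.
package cn.wolfcode.udf
import org.apache.spark.sql.SparkSession
object RegisterUdfs {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]").enableHiveSupport().getOrCreate()
// Hypothetical jar path: point this at the jar built from the UDF module.
spark.sql("ADD JAR /opt/udf/game-udf.jar")
// Register the two GenericUDFs under illustrative names.
spark.sql("CREATE TEMPORARY FUNCTION url_clean AS 'cn.wolfcode.udf.UrlHandlerUdf'")
spark.sql("CREATE TEMPORARY FUNCTION ip2loc AS 'cn.wolfcode.udf.Ip2Loc'")
// Quick check; ip2loc loads its data from Redis in a static block, so Redis must be reachable.
spark.sql("SELECT url_clean('https://www.example.com/a?b=1') AS url, ip2loc('1.2.3.4') AS loc").show()
spark.stop()
}
}
The Spark ETL jobs below then populate the warehouse layers. MysqlToHive reads the user and game dimension tables from MySQL over JDBC, fills null values, and writes the results into the Hive tables dim.t_dim_user and dim.t_dim_game: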
package com.game.connector
import org.apache.spark.sql.SparkSession
object MysqlToHive {
def main(args: Array[String]): Unit = {
System.setProperty("hadoop.home.dir","D:\\graduate\\hadoop\\hadoop-2.7.6")
val spark: SparkSession = SparkSession.builder().master("local[*]").enableHiveSupport().getOrCreate()
// MySQL JDBC connection parameters
val jdbcUsername = "root"
val jdbcPassword = "123456"
val jdbcUrl = "jdbc:mysql://20.0.0.100:3306/demo?useUnicode=true&serverTimezone=UTC"
val jdbcTable = "user"
// Read the data from MySQL
val userOriginData = spark.read
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", jdbcTable)
.option("user", jdbcUsername)
.option("password", jdbcPassword)
.load()
userOriginData.createOrReplaceTempView("user_table")
// Clean the user data
val userTransformSql =
"""
|select
| coalesce(user_id,0) as user_id,coalesce(user_name,'') as user_name,coalesce(sex,-1) as sex
|from user_table
|""".stripMargin
val userResultData = spark.sql(userTransformSql)
userResultData.write.mode("overwrite")
.format("Hive")
.insertInto("dim.t_dim_user")
val gameOriginData = spark.read
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "game")
.option("user", jdbcUsername)
.option("password", jdbcPassword)
.load()
// Register the game data as a temp view
gameOriginData.createOrReplaceTempView("game_table")
// Clean the game data
val gameTransformSql =
"""
|select
| coalesce(game_id,0) as game_id,coalesce(game_name,'') as game_name,coalesce(event_time,'1970-01-01 00:00:00') as event_time
|from game_table
|""".stripMargin
val gameResultData = spark.sql(gameTransformSql)
gameResultData.show()
gameResultData.write.mode("overwrite")
.format("Hive")
.insertInto("dim.t_dim_game")
spark.stop()
}
}
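FileToHive ingests the raw JSON event log from the local file system, standardizes null values, derives the event date from event_time, and writes the result into the detail-layer table dwd.t_dwd_user_event: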
package com.game.connector
import org.apache.spark.sql.SparkSession
object FileToHive {
def main(args: Array[String]): Unit = {
System.setProperty("hadoop.home.dir","D:\\graduate\\hadoop\\hadoop-2.7.6")
val spark: SparkSession = SparkSession.builder().master("local[*]").enableHiveSupport().getOrCreate()
// Ingest the local JSON data file
val originData = spark.read.format("json")
.option("encoding", "UTF-8")
.load("file:///D://数仓/项目/game-spark/src/test/gamedata.json")
originData.createOrReplaceTempView("origin_table")
// Clean the data
val transformSql =
"""
|select
| date_format(coalesce(event_time,'1970-01-01 00:00:00'), 'yyyy-MM-dd') as event_date,
| coalesce(event_time,'1970-01-01 00:00:00') as event_time,coalesce(user_id,0) as user_id,
| coalesce(game_id,0) as game_id,coalesce(event_name,'') as event_name,
| coalesce(payment_amount,0) as payment_amount,coalesce(user_name,'') as user_name,coalesce(game_name,'') as game_name
|from origin_table
|""".stripMargin
val resultData = spark.sql(transformSql)
resultData.show(10)
resultData.write.mode("overwrite")
.format("Hive")
.insertInto("dwd.t_dwd_user_event")
spark.stop()
}
}
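gameBaseAds builds the application-layer table: it joins the per-game daily summary in dws.t_dws_game_base with dim.t_dim_game to attach the game creation time and writes the result to ads.t_ads_game_base: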
package com.game.process
import org.apache.spark.sql.SparkSession
object gameBaseAds {
def main(args: Array[String]): Unit = {
System.setProperty("hadoop.home.dir","D:\\graduate\\hadoop\\hadoop-2.7.6")
val spark: SparkSession = SparkSession.builder().master("local[*]").enableHiveSupport().getOrCreate()
val originData = spark.table("dws.t_dws_game_base")
val gameData = spark.table("dim.t_dim_game")
originData.createOrReplaceTempView("user_base")
gameData.createOrReplaceTempView("t_dim_game")
val caculateSql =
"""
|select
| a.event_date,a.game_id,a.game_name,b.event_time as game_create_time,
| a.dau,a.reg_num,a.pay_fee
|from user_base a
|left join t_dim_game b on a.game_id=b.game_id
|""".stripMargin
originData.show()
val resultData = spark.sql(caculateSql)
resultData.show()
resultData.write.mode("overwrite")
.format("Hive")
.insertInto("ads.t_ads_game_base")
spark.stop()
}
}
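gameBaseDws aggregates the detail-layer events into a per-game daily summary (DAU, registrations, and payment amount) and writes it to dws.t_dws_game_base; since gameBaseAds above reads this table, gameBaseDws has to run first: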
package com.game.process
import org.apache.spark.sql.SparkSession
object gameBaseDws {
def main(args: Array[String]): Unit = {
System.setProperty("hadoop.home.dir","D:\\graduate\\hadoop\\hadoop-2.7.6")
val spark: SparkSession = SparkSession.builder().master("local[*]").enableHiveSupport().getOrCreate()
val originData = spark.table("dwd.t_dwd_user_event")
originData.createOrReplaceTempView("user_event")
val caculateSql =
"""
|select
| event_date,game_id,max(game_name) as game_name,
| count(distinct case when event_name='user_login' then user_id end) as dau,
| count(distinct case when event_name='user_reg' then user_id end ) as reg_num,
| sum(case when event_name='user_orders' then paid_amount end) as pay_fee
|from user_event
|group by event_date,game_id
|""".stripMargin
originData.show()
val resultData = spark.sql(caculateSql)
resultData.show()
resultData.write.mode("overwrite")
.format("Hive")
.insertInto("dws.t_dws_game_base")
spark.stop()
}
}