1. A distributed example on Hadoop
Create the hadoop user's home directory on HDFS, then an input folder under it:
./bin/hdfs dfs -mkdir -p /user/hadoop
./bin/hdfs dfs -mkdir ./input
Upload the text files and list them:
./bin/hdfs dfs -put /home/hadoop/下载/*.txt input
./bin/hdfs dfs -ls
Merging the three tables into one
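The code below assumes the two score files are tab-separated with three columns (student id, course name, score) and that student.txt has two tab-separated columns (student id, name); a hypothetical row of result_bigdata.txt would look like "1001	bigdata	90". Adjust the field indices if your files are laid out differently.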
// Read the two score files: (student id, course name, score)
var a = sc.textFile("input/result_bigdata.txt").map{x=>val line=x.split("\t");(line(0),line(1),line(2).toInt)}
var b = sc.textFile("input/result_math.txt").map{x=>val line=x.split("\t");(line(0),line(1),line(2).toInt)}
// Union the two RDDs
var a_b = a union b
// Sum the scores per student id with reduceByKey
var total_score = a_b.map(x=>(x._1,x._3)).reduceByKey((x,y)=>x+y)
// Average over the two courses (integer division)
var average = total_score.map{x=>(x._1,x._2/2)}
// Math scores: (id, score)
var math_score = b.map{x=>(x._1,x._3)}
// Bigdata scores: (id, score)
var bigdata_score = a.map{x=>(x._1,x._3)}
// Read the student table: (id, name)
var c = sc.textFile("input/student.txt").map{x=>val line=x.split("\t");(line(0),line(1))}
// Join math and bigdata scores
var score1 = math_score.join(bigdata_score)
// Join total score and average
var score2 = total_score.join(average)
// Join the two intermediate tables
var score = score1.join(score2)
// Join the student table with the final score table
var flag = c.join(score)
// Inspect the result
flag.collect
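The chain of joins yields nested tuples of the form (id, (name, ((math, bigdata), (total, average)))). If a flat row per student is easier to read, a pattern-matching map can unpack it; this is only a sketch built on the RDDs defined above:
// Flatten the nested join result into (id, name, bigdata, math, total, average)
var result = flag.map{ case (id, (name, ((math, bigdata), (total, avg)))) => (id, name, bigdata, math, total, avg) }
result.collect.foreach(println)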
2. A Spark build that supports Hive
Problem: the Spark installation used so far was not built with Hive support, so spark-shell cannot read or write Hive tables.
Solution: install a Spark 2.1.0 build compiled with Hive support (spark-2.1.0-bin-h27hive.tgz) and configure it alongside Hadoop and Hive.
Reference: Spark入门:连接Hive读写数据(DataFrame)
Install the pre-compiled Spark build (with Hive support):
sudo tar -zxf ~/下载/spark-2.1.0-bin-h27hive.tgz -C /usr/local
Rename the directory:
sudo mv /usr/local/spark-2.1.0-bin-h27hive /usr/local/sparkwithhive
Change the ownership of the directory:
sudo chown -R hadoop:hadoop /usr/local/sparkwithhive
cd /usr/local/sparkwithhive/
cp ./conf/spark-env.sh.template ./conf/spark-env.sh
Configure Spark so it picks up Hadoop's classpath:
vim ./conf/spark-env.sh
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop-2.7.1/bin/hadoop classpath)
Start the Spark shell:
cd /usr/local/sparkwithhive
./bin/spark-shell
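A quick way to confirm that this build really was compiled with Hive support is to try importing the Hive classes inside the spark-shell just started; this one-line check is only a sketch, but a build without Hive support will fail on the import:
// Succeeds only if the Spark build includes Hive support
import org.apache.spark.sql.hive.HiveContext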
3. Installing Hive
Extract the archive:
sudo tar -zxvf ./下载/apache-hive-1.2.2-bin.tar.gz -C /usr/local
Rename the directory to hive and change its ownership:
sudo mv /usr/local/apache-hive-1.2.2-bin /usr/local/hive
sudo chown -R hadoop /usr/local/hive
Edit /usr/local/hive/bin/hive. Hive 1.2.x looks for a spark-assembly jar under $SPARK_HOME/lib, which no longer exists in Spark 2.x, so the script must point at the jars directory instead (see the reference material for details):
vim /usr/local/hive/bin/hive
Find the line that sets sparkAssemblyPath and change it to:
sparkAssemblyPath=`ls ${SPARK_HOME}/jars/*.jar`
Add Hive to the shell environment:
vim ~/.bashrc
export HIVE_HOME=/usr/local/hive
export PATH=$PATH:$HIVE_HOME/bin
Apply the change immediately:
source ~/.bashrc
cp /usr/local/hive/conf/hive-env.sh.template /usr/local/hive/conf/hive-env.sh
Configure hive-env.sh to point at Hadoop:
vim /usr/local/hive/conf/hive-env.sh
export HADOOP_HOME=/usr/local/hadoop-2.7.1
Apply it immediately:
source /usr/local/hive/conf/hive-env.sh
Rename the default configuration template:
sudo mv /usr/local/hive/conf/hive-default.xml.template /usr/local/hive/conf/hive-default.xml
Create an empty hive-site.xml:
touch /usr/local/hive/conf/hive-site.xml
Edit it:
vim /usr/local/hive/conf/hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
    <description>JDBC connect string for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>hive</value>
    <description>username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>hive</value>
    <description>password to use against metastore database</description>
  </property>
</configuration>
4. Configuring MySQL
sudo apt-get install mysql-server
The command above installs the MySQL server itself; the steps below deal with a separate component, the MySQL JDBC connector, whose jar Hive needs in its lib directory to reach the metastore database.
Extract the connector and copy it into Hive's lib directory:
cd 下载
tar -zxvf mysql-connector-java-5.1.46.tar.gz
cp mysql-connector-java-5.1.46/mysql-connector-java-5.1.46-bin.jar /usr/local/hive/lib
service mysql start
Log in to MySQL as root:
mysql -u root -p
Create the hive metastore database:
create database hive;
# Grant privileges to the account Hive uses to reach the metastore; the username and password must match ConnectionUserName/ConnectionPassword in hive-site.xml (hive/hive above), otherwise the connection will fail
grant all on *.* to hive@localhost identified by 'hive';
Reload the privilege tables:
flush privileges;
For the later coding steps, Spark's environment file also needs further configuration:
vim /usr/local/sparkwithhive/conf/spark-env.sh
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop-2.7.1/bin/hadoop classpath)
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_162
export CLASSPATH=$CLASSPATH:/usr/local/hive/lib
export SCALA_HOME=/usr/local/scala-2.11.8
export HADOOP_CONF_DIR=/usr/local/hadoop-2.7.1/etc/hadoop
export HIVE_CONF_DIR=/usr/local/hive/conf
export SPARK_CLASSPATH=$SPARK_CLASSPATH:/usr/local/hive/lib/mysql-connector-java-5.1.46-bin.jar
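Note: SPARK_CLASSPATH has been deprecated since Spark 1.0, so the shell may warn about the last line; the same MySQL connector jar can instead be supplied via the spark.driver.extraClassPath property in conf/spark-defaults.conf or with spark-shell's --jars option.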
Copy Hive's hive-site.xml into Spark's conf directory so Spark can find the metastore:
cp /usr/local/hive/conf/hive-site.xml /usr/local/sparkwithhive/conf/
Start the Hadoop cluster:
/usr/local/hadoop-2.7.1/sbin/start-all.sh
Enter the Hive CLI (since hive is on the PATH, this can be run from any directory):
hive
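With everything in place, Hive tables can be read and written from the spark-shell of /usr/local/sparkwithhive. The database and table names below (sparktest, student) and the sample rows are made up purely for illustration; this is a minimal sketch, not part of the original walkthrough:
// Run inside ./bin/spark-shell of /usr/local/sparkwithhive
spark.sql("show databases").show()                          // lists the databases kept in the MySQL-backed metastore
spark.sql("create database if not exists sparktest")        // hypothetical test database
import spark.implicits._
val df = Seq((1, "Xueqian"), (2, "Weiliang")).toDF("id", "name")   // hypothetical sample rows
df.write.mode("overwrite").saveAsTable("sparktest.student")        // write the DataFrame as a Hive table
spark.sql("select * from sparktest.student").show()                // read it back through Spark SQL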