Big Data Environment Build Script

#!/usr/bin/env bash

# Build the big data environment

# Log file for the progress messages written with `tee -a $LOG_FILE` below
# (the path is arbitrary; adjust as needed)
LOG_FILE=/home/ht/build.log

sudo chown -R ht /home/ht
sudo chgrp -R ht /home/ht

#
# Update & install dependencies
#
sudo apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" upgrade
sudo apt-get install -y zip unzip curl bzip2 python-dev build-essential git libssl1.0.0 libssl-dev \
    software-properties-common debconf-utils apt-transport-https

#
# Uncomment below to install Oracle Java8 (No longer available from ppa)
#

# sudo add-apt-repository -y ppa:webupd8team/java
# sudo apt-get update
# echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | sudo debconf-set-selections
# sudo apt-get install -y oracle-java8-installer oracle-java8-set-default
# cd /var/lib/dpkg/info
# sudo sed -i 's|JAVA_VERSION=8u151|JAVA_VERSION=8u162|' oracle-java8-installer.*
# sudo sed -i 's|PARTNER_URL=http://download.oracle.com/otn-pub/java/jdk/8u151-b12/e758a0de34e24606bca991d704f6dcbf/|PARTNER_URL=http://download.oracle.com/otn-pub/java/jdk/8u162-b12/0da788060d494f5095bf8624735fa2f1/|' oracle-java8-installer.*
# sudo sed -i 's|SHA256SUM_TGZ="c78200ce409367b296ec39be4427f020e2c585470c4eed01021feada576f027f"|SHA256SUM_TGZ="68ec82d47fd9c2b8eb84225b6db398a72008285fafc98631b1ff8d2229680257"|' oracle-java8-installer.*
# sudo sed -i 's|J_DIR=jdk1.8.0_151|J_DIR=jdk1.8.0_162|' oracle-java8-installer.*
# echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | sudo debconf-set-selections
# sudo apt-get install -y oracle-java8-installer oracle-java8-set-default

sudo add-apt-repository -y ppa:openjdk-r/ppa
sudo apt-get update
sudo apt-get install -y openjdk-8-jdk

export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64" | sudo tee -a /home/ht/.bash_profile
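
# Optional sanity check: confirm the OpenJDK 8 install is on the PATH
java -version 2>&1 | tee -a $LOG_FILE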

#
# Install Miniconda
#
curl -Lko /tmp/Miniconda3-latest-Linux-x86_64.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x /tmp/Miniconda3-latest-Linux-x86_64.sh
/tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /home/ht/anaconda

export PATH=/home/ht/anaconda/bin:$PATH
echo 'export PATH=/home/ht/anaconda/bin:$PATH' | sudo tee -a /home/ht/.bash_profile

sudo chown -R ht /home/ht/anaconda
sudo chgrp -R ht /home/ht/anaconda

#
# Clone the repo and install Python dependencies
#
# cd /home/ht
# git clone https://github.com/rjurney/Agile_Data_Code_2
cd /home/ht/Agile_Data_Code_2
export PROJECT_HOME=/home/ht/Agile_Data_Code_2
echo "export PROJECT_HOME=/home/ht/Agile_Data_Code_2" | sudo tee -a /home/ht/.bash_profile

conda install -y python=3.5
conda install -y iso8601 numpy scipy scikit-learn matplotlib ipython jupyter
pip install bs4 Flask beautifulsoup4 frozendict geopy kafka-python py4j pymongo pyelasticsearch requests selenium tabulate tldextract wikipedia findspark imongo-kernel

sudo chown -R ht /home/ht/Agile_Data_Code_2
sudo chgrp -R ht /home/ht/Agile_Data_Code_2
cd /home/ht

# Install commons-httpclient
# curl -Lko /home/ht/Agile_Data_Code_2/lib/commons-httpclient-3.1.jar http://central.maven.org/maven2/commons-httpclient/commons-httpclient/3.1/commons-httpclient-3.1.jar

#
# Install Hadoop
#
# curl -Lko /tmp/hadoop-3.0.1.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-3.0.1/hadoop-3.0.1.tar.gz
mkdir -p /home/ht/hadoop
cd /home/ht/
tar -xvf /tmp/hadoop-3.0.1.tar.gz -C hadoop --strip-components=1

echo "" >> /home/ht/.bash_profile
export HADOOP_HOME=/home/ht/hadoop
echo 'export HADOOP_HOME=/home/ht/hadoop' | sudo tee -a /home/ht/.bash_profile
export PATH=$PATH:$HADOOP_HOME/bin
echo 'export PATH=$PATH:$HADOOP_HOME/bin' | sudo tee -a /home/ht/.bash_profile
export HADOOP_CLASSPATH=$(hadoop classpath)
echo 'export HADOOP_CLASSPATH=$(hadoop classpath)' | sudo tee -a /home/ht/.bash_profile
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
echo 'export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop' | sudo tee -a /home/ht/.bash_profile

# Give Hadoop to user ht
echo "Giving Hadoop to user ht ..." | tee -a $LOG_FILE
sudo chown -R ht /home/ht/hadoop
sudo chgrp -R ht /home/ht/hadoop
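
# Optional sanity check: the hadoop CLI should now resolve via the PATH entry added above
hadoop version | tee -a $LOG_FILE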

#
# Install Spark
#
echo "" | tee -a $LOG_FILE
echo "Downloading and installing Spark 2.2.1 ..." | tee -a $LOG_FILE
# curl -Lko /tmp/spark-2.2.1-bin-without-hadoop.tgz https://archive.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-without-hadoop.tgz
mkdir -p /home/ht/spark
cd /home/ht
tar -xvf /tmp/spark-2.2.1-bin-without-hadoop.tgz -C spark --strip-components=1

echo "" >> /home/ht/.bash_profile
echo "# Spark environment setup" | sudo tee -a /home/ht/.bash_profile
export SPARK_HOME=/home/ht/spark
echo 'export SPARK_HOME=/home/ht/spark' | sudo tee -a /home/ht/.bash_profile
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop/
echo 'export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop/' | sudo tee -a /home/ht/.bash_profile
export SPARK_DIST_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath`
echo 'export SPARK_DIST_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath`' | sudo tee -a /home/ht/.bash_profile
export PATH=$PATH:$SPARK_HOME/bin
echo 'export PATH=$PATH:$SPARK_HOME/bin' | sudo tee -a /home/ht/.bash_profile

# Have to set spark.io.compression.codec in Spark local mode
cp /home/ht/spark/conf/spark-defaults.conf.template /home/ht/spark/conf/spark-defaults.conf
echo 'spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec' | sudo tee -a /home/ht/spark/conf/spark-defaults.conf

# Configure Spark: give the driver 8 GB of RAM and use Python 3
echo "spark.driver.memory 8g" | sudo tee -a $SPARK_HOME/conf/spark-defaults.conf
echo "spark.executor.cores 2" | sudo tee -a $SPARK_HOME/conf/spark-defaults.conf
echo "PYSPARK_PYTHON=python3" | sudo tee -a $SPARK_HOME/conf/spark-env.sh
echo "PYSPARK_DRIVER_PYTHON=python3" | sudo tee -a $SPARK_HOME/conf/spark-env.sh

# Setup log4j config to reduce logging output
cp $SPARK_HOME/conf/log4j.properties.template $SPARK_HOME/conf/log4j.properties
sed -i 's/INFO/ERROR/g' $SPARK_HOME/conf/log4j.properties

# Give Spark to user ht
sudo chown -R ht /home/ht/spark
sudo chgrp -R ht /home/ht/spark
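
# Optional sanity check (left commented out): with SPARK_DIST_CLASSPATH pointing at the
# Hadoop jars, the "without hadoop" build should start and run the bundled Pi example
# $SPARK_HOME/bin/spark-submit --version
# $SPARK_HOME/bin/run-example SparkPi 10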

#
# Install MongoDB and dependencies
#
sudo apt-get install -y mongodb
sudo mkdir -p /data/db
sudo chown -R mongodb /data/db
sudo chgrp -R mongodb /data/db

# run MongoDB as daemon
sudo systemctl start mongodb
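
# Optional check (left commented out): confirm mongod is accepting connections
# mongo --eval 'db.runCommand({ ping: 1 })'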

# Get the MongoDB Java Driver
# echo "curl -sLko /home/ht/Agile_Data_Code_2/lib/mongo-java-driver-3.6.1.jar https://oss.sonatype.org/content/repositories/releases/org/mongodb/mongo-java-driver/3.6.1/mongo-java-driver-3.6.1.jar"
# curl -sLko /home/ht/Agile_Data_Code_2/lib/mongo-java-driver-3.6.1.jar https://oss.sonatype.org/content/repositories/releases/org/mongodb/mongo-java-driver/3.6.1/mongo-java-driver-3.6.1.jar
# copy the jar over separately instead

# Install the mongo-hadoop project in the mongo-hadoop directory in the root of our project.
# curl -Lko /tmp/mongo-hadoop-r2.0.2.tar.gz https://github.com/mongodb/mongo-hadoop/archive/r2.0.2.tar.gz
mkdir /home/ht/mongo-hadoop
cd /home/ht
tar -xvzf /tmp/mongo-hadoop-r2.0.2.tar.gz -C mongo-hadoop --strip-components=1
rm -rf /tmp/mongo-hadoop-r2.0.2.tar.gz

# Now build the mongo-hadoop-spark jars
cd /home/ht/mongo-hadoop
./gradlew jar
cp /home/ht/mongo-hadoop/spark/build/libs/mongo-hadoop-spark-*.jar /home/ht/Agile_Data_Code_2/lib/
cp /home/ht/mongo-hadoop/build/libs/mongo-hadoop-*.jar /home/ht/Agile_Data_Code_2/lib/
cd /home/ht

# Now build the pymongo_spark package
cd /home/ht/mongo-hadoop/spark/src/main/python
python setup.py install
cp /home/ht/mongo-hadoop/spark/src/main/python/pymongo_spark.py /home/ht/Agile_Data_Code_2/lib/
export PYTHONPATH=$PYTHONPATH:$PROJECT_HOME/lib
echo "" | sudo tee -a /home/ht/.bash_profile
echo 'export PYTHONPATH=$PYTHONPATH:$PROJECT_HOME/lib' | sudo tee -a /home/ht/.bash_profile
cd /home/ht

rm -rf /home/ht/mongo-hadoop
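
# Example usage of pymongo_spark from pyspark (a sketch only, left commented out; the
# database and collection names are hypothetical, and it relies on the mongo-hadoop jars
# added to spark.jars further below)
# $SPARK_HOME/bin/pyspark <<'EOF'
# import pymongo_spark
# pymongo_spark.activate()
# rdd = sc.mongoRDD('mongodb://localhost:27017/agile_data_science.example_collection')
# print(rdd.count())
# EOF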

#
# Install ElasticSearch in the elasticsearch directory in the root of our project, and the Elasticsearch for Hadoop package
#
echo "curl -sLko /tmp/elasticsearch-5.6.0.tar.gz https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.6.0.tar.gz"
# curl -sLko /tmp/elasticsearch-5.6.0.tar.gz https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.6.0.tar.gz
mkdir /home/ht/elasticsearch
cd /home/ht
tar -xvzf /tmp/elasticsearch-5.6.0.tar.gz -C elasticsearch --strip-components=1
sudo chown -R ht /home/ht/elasticsearch
sudo chgrp -R ht /home/ht/elasticsearch
sudo mkdir -p /home/ht/elasticsearch/logs
sudo chown -R ht /home/ht/elasticsearch/logs
sudo chgrp -R ht /home/ht/elasticsearch/logs

# Run elasticsearch
sudo -u ht /home/ht/elasticsearch/bin/elasticsearch -d # re-run if you shut down your machine

# Run a query to test - it will error but should return json
echo "Testing Elasticsearch with a query ..." | tee -a $LOG_FILE
curl 'localhost:9200/agile_data_science/on_time_performance/_search?q=Origin:ATL&pretty'

# Install Elasticsearch for Hadoop
echo "curl -sLko /tmp/elasticsearch-hadoop-6.1.2.zip http://download.elastic.co/hadoop/elasticsearch-hadoop-6.1.2.zip"
# curl -sLko /tmp/elasticsearch-hadoop-6.1.2.zip http://download.elastic.co/hadoop/elasticsearch-hadoop-6.1.2.zip
unzip /tmp/elasticsearch-hadoop-6.1.2.zip
mv /home/ht/elasticsearch-hadoop-6.1.2 /home/ht/elasticsearch-hadoop
cp /home/ht/elasticsearch-hadoop/dist/elasticsearch-hadoop-6.1.2.jar /home/ht/Agile_Data_Code_2/lib/
cp /home/ht/elasticsearch-hadoop/dist/elasticsearch-spark-20_2.11-6.1.2.jar /home/ht/Agile_Data_Code_2/lib/
echo "spark.speculation false" | sudo tee -a /home/ht/spark/conf/spark-defaults.conf
rm -f /tmp/elasticsearch-hadoop-6.1.2.zip
rm -rf /home/ht/elasticsearch-hadoop/conf/spark-defaults.conf
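
# Example of reading an Elasticsearch index from pyspark with the connector jar copied
# above (a sketch only, left commented out; the index matches the earlier test query)
# $SPARK_HOME/bin/pyspark <<'EOF'
# df = spark.read.format("org.elasticsearch.spark.sql").option("es.nodes", "localhost:9200").load("agile_data_science/on_time_performance")
# df.show(5)
# EOF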

#
# Spark jar setup
#

# Install and add snappy-java and lzo-java to our classpath below via spark.jars
echo "" | tee -a $LOG_FILE
echo "Installing snappy-java and lzo-java and adding them to our classpath ..." | tee -a $LOG_FILE
cd /home/ht/Agile_Data_Code_2
curl -sLko lib/snappy-java-1.1.7.1.jar https://repo1.maven.org/maven2/org/xerial/snappy/snappy-java/1.1.7.1/snappy-java-1.1.7.1.jar
curl -sLko lib/lzo-hadoop-1.0.5.jar https://repo1.maven.org/maven2/org/anarres/lzo/lzo-hadoop/1.0.5/lzo-hadoop-1.0.5.jar
cd /home/ht

# Set the spark.jars path
echo "spark.jars /home/ht/Agile_Data_Code_2/lib/mongo-hadoop-spark-2.0.2.jar,/home/ht/Agile_Data_Code_2/lib/mongo-java-driver-3.6.1.jar,/home/ht/Agile_Data_Code_2/lib/mongo-hadoop-2.0.2.jar,/home/ht/Agile_Data_Code_2/lib/elasticsearch-spark-20_2.11-6.1.2.jar,/home/ht/Agile_Data_Code_2/lib/snappy-java-1.1.7.1.jar,/home/ht/Agile_Data_Code_2/lib/lzo-hadoop-1.0.5.jar,/home/ht/Agile_Data_Code_2/lib/commons-httpclient-3.1.jar" | sudo tee -a /home/ht/spark/conf/spark-defaults.conf

#
# Kafka install and setup
#
echo "" | tee -a $LOG_FILE
echo "" | tee -a $LOG_FILE
echo "Downloading and installing Kafka version 2.1.1 for Scala 2.11 ..." | tee -a $LOG_FILE
# curl -Lko /tmp/kafka_2.11-2.1.1.tgz https://www-us.apache.org/dist/kafka/2.1.1/kafka_2.11-2.1.1.tgz
mkdir -p /home/ht/kafka
cd /home/ht/
tar -xvzf /tmp/kafka_2.11-2.1.1.tgz -C kafka --strip-components=1 && rm -f /tmp/kafka_2.11-2.1.1.tgz

# Give Kafka to user ht
echo "Giving Kafka to user ht ..." | tee -a $LOG_FILE
sudo chown -R ht /home/ht/kafka
sudo chgrp -R ht /home/ht/kafka

# Set the log dir to kafka/logs
echo "Configuring logging for kafka to go into kafka/logs directory ..." | tee -a $LOG_FILE
sed -i '/log.dirs=\/tmp\/kafka-logs/c\log.dirs=logs' /home/ht/kafka/config/server.properties

# Run zookeeper (which kafka depends on), then Kafka
echo "Running Zookeeper as a daemon ..." | tee -a $LOG_FILE
sudo -H -u ht /home/ht/kafka/bin/zookeeper-server-start.sh -daemon /home/ht/kafka/config/zookeeper.properties
echo "Running Kafka Server as a daemon ..." | tee -a $LOG_FILE
sudo -H -u ht /home/ht/kafka/bin/kafka-server-start.sh -daemon /home/ht/kafka/config/server.properties
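
# Optional smoke test (left commented out): once the broker is up, create a throwaway
# topic and list topics; the topic name is arbitrary
# /home/ht/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic smoke_test
# /home/ht/kafka/bin/kafka-topics.sh --list --zookeeper localhost:2181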

#
# Install and setup Airflow
#
echo "export SLUGIFY_USES_TEXT_UNIDECODE=yes"
export SLUGIFY_USES_TEXT_UNIDECODE=yes
pip install 'apache-airflow[hive]'
mkdir /home/ht/airflow
mkdir /home/ht/airflow/dags
mkdir /home/ht/airflow/logs
mkdir /home/ht/airflow/plugins

sudo chown -R ht /home/ht/airflow
sudo chgrp -R ht /home/ht/airflow

airflow initdb
airflow webserver -D &
airflow scheduler -D &
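
# Drop in a minimal example DAG so the scheduler has something to pick up
# (the DAG id and task are arbitrary; this is just a sketch)
cat <<'EOF' > /home/ht/airflow/dags/example_bash_dag.py
from datetime import datetime
from airflow import DAG
from airflow.operators.bash_operator import BashOperator

dag = DAG('example_bash_dag', start_date=datetime(2019, 1, 1), schedule_interval=None)
print_date = BashOperator(task_id='print_date', bash_command='date', dag=dag)
EOF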

# Install Apache Zeppelin
echo "curl -sLko /tmp/zeppelin-0.7.3-bin-all.tgz https://archive.apache.org/dist/zeppelin/zeppelin-0.7.3/zeppelin-0.7.3-bin-all.tgz"
# curl -sLko /tmp/zeppelin-0.7.3-bin-all.tgz https://archive.apache.org/dist/zeppelin/zeppelin-0.7.3/zeppelin-0.7.3-bin-all.tgz
mkdir zeppelin
tar -xvzf /tmp/zeppelin-0.7.3-bin-all.tgz -C zeppelin --strip-components=1

# Configure Zeppelin
cp zeppelin/conf/zeppelin-env.sh.template zeppelin/conf/zeppelin-env.sh
echo "export SPARK_HOME=/home/ht/spark" >> zeppelin/conf/zeppelin-env.sh
echo "export SPARK_MASTER=local" >> zeppelin/conf/zeppelin-env.sh
echo "export SPARK_CLASSPATH=" >> zeppelin/conf/zeppelin-env.sh
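
# Start Zeppelin as a daemon (left commented out; zeppelin-daemon.sh ships with the
# binary distribution)
# zeppelin/bin/zeppelin-daemon.sh start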

# Jupyter server setup
jupyter notebook --generate-config
mkdir /root/.jupyter/
cp /home/ht/Agile_Data_Code_2/jupyter_notebook_config.py /root/.jupyter/
mkdir /root/certs
sudo openssl req -x509 -nodes -days 365 -newkey rsa:1024 -subj "/C=US" -keyout /root/certs/mycert.pem -out /root/certs/mycert.pem

cd /home/ht/Agile_Data_Code_2
jupyter notebook --ip=0.0.0.0 --NotebookApp.token= --allow-root --no-browser &
cd /home/ht

# Re-own the Airflow directory now that initdb and the daemons have created files in it
sudo chown -R ht /home/ht/airflow
sudo chgrp -R ht /home/ht/airflow

echo "sudo chown -R ht /home/ht/airflow" | sudo tee -a /home/ht/.bash_profile
echo "sudo chgrp -R ht /home/ht/airflow" | sudo tee -a /home/ht/.bash_profile

# Install Ant to build Cassandra
sudo apt-get install -y ant

# Install Cassandra - must build from source as the latest 3.11.1 build is broken...
git clone https://github.com/apache/cassandra
cd cassandra
git checkout cassandra-3.11
ant
bin/cassandra
export PATH=$PATH:/home/ht/cassandra/bin
echo 'export PATH=$PATH:/home/ht/cassandra/bin' | sudo tee -a /home/ht/.bash_profile
cd ..
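
# Optional check (left commented out): once Cassandra has finished starting, cqlsh
# should connect and list the system keyspaces
# /home/ht/cassandra/bin/cqlsh -e "DESCRIBE KEYSPACES"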

# Install and set up JanusGraph (the download is unavailable, so this stays commented out)
# cd /home/ht
# curl -Lko /tmp/janusgraph-0.2.0-hadoop2.zip \
#   https://github.com/JanusGraph/janusgraph/releases/download/v0.2.0/janusgraph-0.2.0-hadoop2.zip
# unzip -d . /tmp/janusgraph-0.2.0-hadoop2.zip
# mv janusgraph-0.2.0-hadoop2 janusgraph
# rm /tmp/janusgraph-0.2.0-hadoop2.zip

# Download data
cd /home/ht/Agile_Data_Code_2
./download.sh

# Install phantomjs
/home/ht/Agile_Data_Code_2/install/phantomjs.sh

# make sure we own /home/ht/.bash_profile after all the 'sudo tee'
sudo chgrp ht /home/ht/.bash_profile
sudo chown ht /home/ht/.bash_profile

#
# Cleanup
#
# sudo apt-get clean
# sudo rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

echo "DONE!"

Building a flight big data warehouse is a complex process that involves collecting, cleaning, storing, and analyzing data. The typical steps are:

1. Requirements analysis: clarify the goals of the warehouse, decide which data to collect, and pin down the analysis and query needs it must serve.
2. Data collection: gather data on airlines, airports, flight schedules, and actual flight performance. These data may come from multiple sources and formats and need to be integrated and converted.
3. Data cleaning and transformation: remove duplicate, missing, or erroneous records, then apply format conversions and field mappings that match the warehouse's model design.
4. Data storage: choose a suitable storage technology, such as a relational database, a distributed file system (e.g. Hadoop HDFS), or a column-oriented store (e.g. HBase).
5. Data modeling: design a data model to organize and manage the flight data, for example dimensional modeling (star or snowflake schemas) or entity-relationship modeling.
6. Data loading: load the cleaned data into the warehouse, either with ETL (Extract, Transform, Load) tools that automate the process or with custom import scripts (a small example is sketched after this list).
7. Indexing and optimization: index and tune the warehouse for the expected queries to improve query efficiency and response times.
8. Analysis and querying: query and analyze the flight data with appropriate tools (SQL, OLAP analysis, and so on), and present the results with visualization tools as charts and dashboards.
9. Maintenance and updates: maintain and refresh the data regularly to keep it accurate and complete.

In short, building a flight data warehouse covers requirements analysis, data collection, cleaning and transformation, storage, modeling, loading, indexing and optimization, analysis and querying, and ongoing maintenance. A build process like this makes flight data manageable and analyzable and gives the aviation industry a basis for decision support and business improvement.
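The loading step, for example, can be a very small Spark job. The sketch below is only an illustration under the environment set up by the script above; the input file, column name, and output path are hypothetical:

$SPARK_HOME/bin/pyspark <<'EOF'
# Read a raw flight CSV, drop duplicate and incomplete rows, and write Parquet
raw = spark.read.csv('data/flights.csv', header=True, inferSchema=True)
cleaned = raw.dropDuplicates().na.drop(subset=['FlightDate'])
cleaned.write.mode('overwrite').parquet('data/flights.parquet')
EOF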
