收集数据到Hadoop hdfs
使用ETL(MapReduce)进行数据清洗
(更新元数据 target)
Hive 关联外部表
创建工程
添加MapReduce 依赖
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the xzdream-hive module (jar packaging by default). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates: the built artifact is xzdream-hive-1.0.jar. -->
  <groupId>com.xzdream.hive</groupId>
  <artifactId>xzdream-hive</artifactId>
  <version>1.0</version>

  <name>xzdream-hive</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <!-- Hadoop version, referenced below as ${hadoop.version} -->
    <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
  </properties>

  <!-- Cloudera repository, added so the CDH-versioned artifacts can be resolved -->
  <repositories>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <dependencies>
    <!-- Hadoop client dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <!-- JUnit, test scope only -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
    <pluginManagement>
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>

        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>

        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
程序编写,进行数据清洗,保存到hdfs
准备好清洗的数据
rinse.txt
127.0.0.1 http://www.localhost.com a
192.168.1.1 http://www.xzdream.cn
192.168.2.3 http://blog.xzdream.cn
将文件put到hadoop
hadoop$ ./hadoop fs -mkdir -p /hive/rinse
hadoop$ ./hadoop fs -put /Users/hadoop/data/rinse.txt /hive/rinse
hadoop 提交jar
hadoop$ ./hadoop jar /Users/hadoop/libs/xzdream-hive-1.0.jar com.xzdream.hive.mapreduce.driver.LogETLDriver /hive/rinse /hive/rinse/day=20200606
查看清洗完成的数据
hadoop$ ./hadoop fs -cat /hive/rinse/day=20200606/part-r-00000
20/06/06 17:33:38 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
192.168.1.1 http://www.xzdream.cn
127.0.0.1 http://www.baidu.com
修改数据库字符集(以下语句在 MySQL 中执行,而非 Hive)
alter database hive_db character set latin1;
FLUSH PRIVILEGES;
使用hive进行统计
1:创建外部表
-- Variant 1: external table partitioned by `day`.
-- This is the definition that the later `ALTER TABLE rinse ADD ... PARTITION(day=...)`
-- step requires, since only a partitioned table accepts ADD PARTITION.
CREATE EXTERNAL TABLE rinse (
  ip     STRING,
  domain STRING
)
PARTITIONED BY (day STRING)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
LOCATION '/hive/rinse/access/clear';

-- Variant 2: same columns and LOCATION, without partitioning.
-- NOTE(review): this statement reuses the table name `rinse`, so it presumably is an
-- alternative to variant 1 rather than a second step; running both in the same
-- database would fail on the second CREATE because the table already exists.
CREATE EXTERNAL TABLE rinse (
  ip     STRING,
  domain STRING
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
LOCATION '/hive/rinse/access/clear';
移动数据
hadoop$ ./hadoop fs -mkdir -p /hive/rinse/access/clear/day=20200606/
hadoop$ ./hadoop fs -mv /hive/rinse/day=20200606/part-r-00000 /hive/rinse/access/clear/day=20200606/
将数据刷进hive
-- Register the day=20200606 partition in the metastore so queries on `rinse` see its files.
-- The partition value must be closed with an ASCII single quote; a typographic quote (’)
-- leaves the string literal unterminated and the statement fails to parse.
alter table rinse add if not exists partition(day='20200606');
hive (default)> select * from rinse;
OK
192.168.2.3 http://blog.xzdream.cn 20200606
192.168.1.1 http://www.xzdream.cn 20200606
Time taken: 0.115 seconds, Fetched: 2 row(s)
hive (default)>
hive (default)> select count(*),domain from rinse group by domain;
1 http://blog.xzdream.cn
1 http://www.xzdream.cn