Big Data Learning Notes - Table of Contents
2021.01.28
1. Passwordless SSH
1. Generate the SSH key pair: ssh-keygen -t rsa
2. Send every node's public key to node1: ssh-copy-id node1
3. From node1, copy the collected key file to every other machine: scp /root/.ssh/<public key file> node02:/root/.ssh
2. Clock synchronization
Install the NTP service:
1. yum install ntp -y
2. Open the cron editor: crontab -e
3. Add the entry: */1 * * * * /usr/sbin/ntpdate ntp4.aliyun.com
3. Install the MySQL service
Reference: [https://www.jianshu.com/p/276d59cbc529](https://www.jianshu.com/p/276d59cbc529)
4. ZooKeeper cluster setup
1. Create a zookeeper directory under /root: mkdir /root/zookeeper
2. Enter the directory: cd zookeeper
3. Download ZooKeeper: wget http://archive.apache.org/dist/zookeeper/zookeeper-3.4.9/zookeeper-3.4.9.tar.gz
Download page: http://archive.apache.org/dist/zookeeper/zookeeper-3.4.9/
4. Extract the archive: tar -zxvf zookeeper-3.4.9.tar.gz
5. Switch to the conf directory: cd zookeeper-3.4.9/conf
6. cp zoo_sample.cfg zoo.cfg
7. Edit zoo.cfg:
# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
# do not use /tmp for storage, /tmp here is just
# example sakes.
dataDir=/root/zookeeper/zookeeper-3.4.9/zkdatas
# the port at which the clients will connect
clientPort=2181
# the maximum number of client connections.
# increase this if you need to handle more clients
#maxClientCnxns=60
#
# Be sure to read the maintenance section of the
# administrator guide before turning on autopurge.
#
# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable auto purge feature
autopurge.purgeInterval=1
server.1=node1:2888:3888
server.2=node2:2888:3888
server.3=node3:2888:3888
8. Create the zkdatas data directory (mkdir /root/zookeeper/zookeeper-3.4.9/zkdatas), switch into it and create the file myid (vim myid) containing 1.
9. Configure environment variables:
vim /etc/profile
#set zookeeper environment
export ZK_HOME=/root/zookeeper/zookeeper-3.4.9
export PATH=$PATH:$ZK_HOME/bin
Then make the environment variables take effect with:
source /etc/profile
10. Start command:
zkServer.sh start
Stop command:
zkServer.sh stop
Restart command:
zkServer.sh restart
Check cluster node status:
zkServer.sh status
11. Copy the installation to the other servers (use -r because it is a directory):
scp -r /root/zookeeper/zookeeper-3.4.9/ node2:/root/zookeeper/
scp -r /root/zookeeper/zookeeper-3.4.9/ node3:/root/zookeeper/
12. Change the myid file on each server (2 on node2, 3 on node3),
then start the service on every node.
2021.02.01
5. Using ZooKeeper commands
Enter the command-line client: bin/zkCli.sh -server node1:2181
Create a znode: create <path> <data>
-e ephemeral node
-s sequential (persistent) node
Delete: rmr <path>
Get node data: get <path>
Update a node: set <path> <data>
Watch mechanism:
get <path> watch
Using the ZooKeeper API
Maven configuration
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>bigdata</groupId>
<artifactId>bigdata</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.curator</groupId>
<artifactId>curator-framework</artifactId>
<version>2.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
<version>2.12.0</version>
</dependency>
<dependency>
<groupId>com.google.collections</groupId>
<artifactId>google-collections</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>1.7.25</version>
</dependency>
</dependencies>
</project>
API code
package zookeeper;
import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.framework.recipes.cache.ChildData;
import org.apache.curator.framework.recipes.cache.TreeCache;
import org.apache.curator.framework.recipes.cache.TreeCacheEvent;
import org.apache.curator.framework.recipes.cache.TreeCacheListener;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.data.Stat;
import org.junit.Test;
public class znodecreate {
@Test
public void createznode() throws Exception {
System.out.print("111");
// 重新连接时间,次数
RetryPolicy retryPolicy=new ExponentialBackoffRetry(1000,1);
String ip="192.168.112.128:2181,192.168.112.129:2181,192.168.112.130:2181";
String ips="8.135.44.130:2181";
CuratorFramework curatorFramework = CuratorFrameworkFactory.newClient(ip, 8000, 8000, retryPolicy);
curatorFramework.start();
// curatorFramework.create().creatingParentsIfNeeded().withMode(CreateMode.EPHEMERAL).forPath("/hello/a","word".getBytes());
curatorFramework.setData().forPath("/hello","hello".getBytes());
byte[] stat = curatorFramework.getData().forPath("/hello");
System.out.print(new String(stat));
curatorFramework.close();
}
@Test
public void watchznode() throws Exception {
RetryPolicy retryPolicy=new ExponentialBackoffRetry(1000,1);
String ip="192.168.112.128:2181,192.168.112.129:2181,192.168.112.130:2181";
String ips="8.135.44.130:2181";
CuratorFramework curatorFramework = CuratorFrameworkFactory.newClient(ip, 8000, 8000, retryPolicy);
curatorFramework.start();
final TreeCache treeCache=new TreeCache(curatorFramework,"/hello");
// 自定义监听器
treeCache.getListenable().addListener(new TreeCacheListener() {
public void childEvent(CuratorFramework curatorFramework, TreeCacheEvent treeCacheEvent) throws Exception {
ChildData data=treeCacheEvent.getData();
switch (treeCacheEvent.getType()){
case NODE_ADDED:
System.out.println("监听到有新节点!");
break;
case NODE_REMOVED:
System.out.println("监听到有节点移除!");
break;
case NODE_UPDATED:
System.out.println("监听到有节点更新!");
break;
default:
break;
}
}
});
treeCache.start();
Thread.sleep(1000000);
}
}
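The -e and -s flags from the CLI section above map to CreateMode values in the Curator API. Below is a minimal sketch, assuming it is added as another @Test method inside the znodecreate class above (the /demo paths are made-up examples):

```java
@Test
public void createModesSketch() throws Exception {
    RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 1);
    CuratorFramework client = CuratorFrameworkFactory.newClient(
            "192.168.112.128:2181,192.168.112.129:2181,192.168.112.130:2181", 8000, 8000, retryPolicy);
    client.start();
    // -e  -> EPHEMERAL: the node disappears when this session closes
    client.create().creatingParentsIfNeeded()
            .withMode(CreateMode.EPHEMERAL).forPath("/demo/ephemeral", "e".getBytes());
    // -s  -> PERSISTENT_SEQUENTIAL: ZooKeeper appends an increasing suffix, e.g. /demo/seq-0000000001
    client.create().creatingParentsIfNeeded()
            .withMode(CreateMode.PERSISTENT_SEQUENTIAL).forPath("/demo/seq-", "s".getBytes());
    client.close();
}
```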
2021.02.02
HDFS API connection setup
1. Configure the Hadoop runtime environment on Windows
Step 1: Copy the hadoop-2.7.5 folder to a path that contains no Chinese characters and no spaces.
Step 2: On Windows, configure the HADOOP_HOME environment variable and add %HADOOP_HOME%\bin to PATH.
Step 3: Copy the hadoop.dll file from the bin directory of the hadoop-2.7.5 folder into C:\Windows\System32.
Step 4: Restart Windows.
Import the dependencies in IDEA
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.itcast</groupId>
<artifactId>day04_hdfs_api_demo</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.7.5</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
<!-- <verbal>true</verbal>-->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<minimizeJar>true</minimizeJar>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
API code
package hdfs.url.wjxt;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.Test;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class File {
FileSystem fileSystem=FileSystem.get(new URI("hdfs://node1:8020"),new Configuration());
public File() throws IOException, URISyntaxException {
}
@Test
public void getFileSystem1() throws IOException {
Configuration configuration = new Configuration();
configuration.set("fs.defaultFS","hdfs://8.135.44.130:9000");
// configuration.set("fs.defaultFS","hdfs://192.168.112.128:8020");
FileSystem fileSystem = FileSystem.get(configuration);
System.out.print(fileSystem);
}
@Test
public void listFile() throws IOException, URISyntaxException {
RemoteIterator<LocatedFileStatus> iterator=fileSystem.listFiles(new Path("/"),true);
while (iterator.hasNext()){
LocatedFileStatus fileStatus=iterator.next();
System.out.println(fileStatus.getPath()+"----"+fileStatus.getPath().getName());
BlockLocation[] blockLocations=fileStatus.getBlockLocations();
System.out.println(blockLocations.length);
System.out.println(blockLocations.hashCode());
}
fileSystem.close();
}
@Test
public void mkdieFile() throws IOException {
boolean mkdirs = fileSystem.mkdirs(new Path("/mp4"));
// if (mkdirs){
// fileSystem.create(new Path("/aaa/aa.txt"));
// }
System.out.println(mkdirs);
fileSystem.close();
}
@Test
public void dowloadFile() throws IOException {
FSDataInputStream inputStream = fileSystem.open(new Path("/aaa/aa.txt"));
FileOutputStream fileOutputStream = new FileOutputStream("F://a.txt");
IOUtils.copy(inputStream,fileOutputStream);
IOUtils.closeQuietly(fileOutputStream);
IOUtils.closeQuietly(inputStream);
fileOutputStream.close();
}
@Test
public void dowloadFile1() throws IOException {
fileSystem.copyToLocalFile(new Path("/aaa/aa.txt"),new Path("F://b.txt"));
fileSystem.close();
}
@Test
public void uploadFile() throws IOException {
fileSystem.copyFromLocalFile(new Path("C:\\Users\\asus\\Music\\囚鸟.mp3"),new Path("/mp4/"));
fileSystem.close();
}
//Merge small files and then upload (see the sketch after this class)
@Test
public void biguploadFile(){
}
}
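The empty biguploadFile method above is meant to merge many small local files into one HDFS file before upload. Below is a minimal sketch of one way to do it, assuming a made-up local directory F:/smallfiles and target path /big/merged.txt; it reuses the fileSystem field and the commons-io IOUtils already imported in the class:

```java
@Test
public void biguploadFileSketch() throws IOException {
    // Target HDFS file that will hold the concatenated contents of all small files
    FSDataOutputStream out = fileSystem.create(new Path("/big/merged.txt"));
    // Local file system handle used to list and read the small files
    LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
    FileStatus[] smallFiles = localFs.listStatus(new Path("file:///F:/smallfiles"));
    for (FileStatus status : smallFiles) {
        FSDataInputStream in = localFs.open(status.getPath());
        IOUtils.copy(in, out);          // append this small file to the merged output
        IOUtils.closeQuietly(in);
    }
    IOUtils.closeQuietly(out);
    fileSystem.close();
}
```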
1 - HDFS high availability mechanism
2021.02.04
WordCount example code
jobmain class
package wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class jobmain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception{
Job job= Job.getInstance(super.getConf(),"wordcount");
job.setJarByClass(jobmain.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node1:8020/wordcount"));
// TextInputFormat.addInputPath(job, new Path("file:///E:\\mapreduce\\input"));
//第二步:指定Map阶段的处理方式和数据类型
job.setMapperClass(map.class);
//设置Map阶段K2的类型
job.setMapOutputKeyClass(Text.class);
//设置Map阶段V2的类型
job.setMapOutputValueClass(LongWritable.class);
//第三,四,五,六 采用默认的方式
//第七步:指定Reduce阶段的处理方式和数据类型
job.setReducerClass(reduce.class);
//设置K3的类型
job.setOutputKeyClass(Text.class);
//设置V3的类型
job.setOutputValueClass(LongWritable.class);
//第八步: 设置输出类型
job.setOutputFormatClass(TextOutputFormat.class);
//设置输出的路径
Path path = new Path("hdfs://node1:8020/wordcount_out");
// TextOutputFormat.setOutputPath(job, new Path("file:///E:\\mapreduce\\output"));
TextOutputFormat.setOutputPath(job,path);
FileSystem fileSystem = FileSystem.get(new URI("hdfs://node1:8020"), new Configuration());
// 判断目录是否存在
boolean bl2 = fileSystem.exists(path);
if(bl2){
//删除目标目录
fileSystem.delete(path, true);
}
boolean bl = job.waitForCompletion(true);
return bl ? 0:1;
}
public static void main(String[] args)throws Exception{
Configuration configuration = new Configuration();
//启动job任务
int run = ToolRunner.run(configuration, new jobmain(), args);
System.exit(run);
}
}
map class
package wordcount;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class map extends Mapper<LongWritable,Text,Text,LongWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
Text text = new Text();
LongWritable longWritable = new LongWritable();
//1:将一行的文本数据进行拆分
String[] split = value.toString().split(",");
//2:遍历数组,组装 K2 和 V2
for (String word : split) {
//3:将K2和V2写入上下文
text.set(word);
longWritable.set(1);
if (text.toString().equals("word")){
context.write(text, longWritable);
}
}
}
}
reduce class
package wordcount;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class reduce extends Reducer<Text,LongWritable,Text,LongWritable>{
long a=0;
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long count = 0;
//1:遍历集合,将集合中的数字相加,得到 V3
if(key.toString().equals("word")){
for (LongWritable value : values) {
count += value.get();
a+=1;
}
}
//2:将K3和V3写入上下文中
System.out.println(key.toString().equals("word"));
context.write(key, new LongWritable(count));
}
}
2021.02.05
Shuffle partitioner code
map class
package pratitioner;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class map extends Mapper<LongWritable,Text,Text,NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
context.write(value, NullWritable.get());
}
}
reduce class
package pratitioner;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class reduce extends Reducer<Text,NullWritable,Text,NullWritable> {
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key,NullWritable.get());
}
}
partitioner class
package pratitioner;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class mypratitoner extends Partitioner<Text,NullWritable> {
@Override
public int getPartition(Text text, NullWritable nullWritable, int i) {
String[] split = text.toString().split(",");
String s=split[2];
System.out.println(s);
if(Integer.valueOf(s)>10){
return 1;
}else {
return 0;
}
}
}
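The value returned by getPartition must fall in the range [0, numReduceTasks); the job below sets two reduce tasks to match the two possible return values. A tiny made-up check of how one record is routed (the field index 2 and the threshold 10 come from the code above):

```java
// Made-up sample line "a,b,15,x": field[2] = 15 > 10, so the record goes to partition 1 (file part-r-00001)
int partition = new mypratitoner().getPartition(new Text("a,b,15,x"), NullWritable.get(), 2);
System.out.println(partition);   // prints 1
```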
job class
package pratitioner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class job extends Configured implements Tool{
@Override
public int run(String[] strings) throws Exception {
Job job= Job.getInstance(super.getConf(),"partitioner");
job.setJarByClass(job.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node1:8020/input"));
job.setMapperClass(map.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setPartitionerClass(mypratitoner.class);
job.setReducerClass(reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(2);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("hdfs://node1:8020/out/partition"));
FileSystem fileSystem= FileSystem.get(new URI("hdfs://node1:8020"),new Configuration());
boolean file = fileSystem.exists(new Path("hdfs://node1:8020/out/partition"));
if (file){
fileSystem.delete(new Path("hdfs://node1:8020/out/partition"),true);
}
boolean b = job.waitForCompletion(true);
return b ? 0:1;
}
public static void main(String[] args) throws Exception {
Configuration configuration=new Configuration();
int run = ToolRunner.run(configuration, new job(), args);
System.exit(run);
}
}
Configure log4j logging
# Configure logging for testing: optionally with log file
#log4j.rootLogger=debug,appender
log4j.rootLogger=info,appender
#log4j.rootLogger=error,appender
# Output to the console
log4j.appender.appender=org.apache.log4j.ConsoleAppender
# Layout: TTCCLayout
log4j.appender.appender.layout=org.apache.log4j.TTCCLayout
Example
20210210
Traffic flow statistics and sorting
map code
package fllow;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class folwmapp extends Mapper<LongWritable,Text,Text,FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
FlowBean flowBean = new FlowBean();
String[] split = value.toString().split("\t");
String phonenumber=split[1];
flowBean.setUpFlow(Integer.valueOf(split[6]));
flowBean.setDownFlow(Integer.valueOf(split[7]));
flowBean.setUpcountFlow(Integer.valueOf(split[8]));
flowBean.setDownCountFlow(Integer.valueOf(split[9]));
context.write(new Text(phonenumber),flowBean);
}
}
reduce code
package fllow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class flowreduce extends Reducer<Text,FlowBean,Text,FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
Integer upFlow=0;
Integer downFlow=0;
Integer upcountFlow=0;
Integer downCountFlow=0;
for(FlowBean i:values){
upFlow+=i.getUpFlow();
downFlow+=i.getDownFlow();
upcountFlow+=i.getUpcountFlow();
downCountFlow+=i.getDownCountFlow();
}
FlowBean flowBean=new FlowBean();
flowBean.setUpFlow(upFlow);
flowBean.setDownFlow(downFlow);
flowBean.setUpcountFlow(upcountFlow);
flowBean.setDownCountFlow(downCountFlow);
context.write(key,flowBean);
}
}
Partitioner code
package fllow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class flowpartition extends Partitioner<Text,FlowBean>{
@Override
public int getPartition(Text text, FlowBean flowBean, int i) {
String phonenumber = text.toString();
if (phonenumber.startsWith("135")){
return 0;
}else if (phonenumber.startsWith("136")){
return 1;
}else {
return 2;
}
}
}
Sorting (FlowBean) code
package fllow;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
public class FlowBean implements WritableComparable<FlowBean>{
private Integer upFlow;
private Integer downFlow;
private Integer upcountFlow;
private Integer downCountFlow;
public Integer getUpFlow() {
return upFlow;
}
public void setUpFlow(Integer upFlow) {
this.upFlow = upFlow;
}
public Integer getDownFlow() {
return downFlow;
}
public void setDownFlow(Integer downFlow) {
this.downFlow = downFlow;
}
public Integer getUpcountFlow() {
return upcountFlow;
}
public void setUpcountFlow(Integer upcountFlow) {
this.upcountFlow = upcountFlow;
}
public Integer getDownCountFlow() {
return downCountFlow;
}
public void setDownCountFlow(Integer downCountFlow) {
this.downCountFlow = downCountFlow;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(upFlow);
dataOutput.writeInt(downFlow);
dataOutput.writeInt(upcountFlow);
dataOutput.writeInt(downCountFlow);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.upFlow=dataInput.readInt();
this.downFlow=dataInput.readInt();
this.upcountFlow=dataInput.readInt();
this.downCountFlow=dataInput.readInt();
}
@Override
public String toString() {
return " " +
" " + upFlow +
" " + downFlow +
" " + upcountFlow +
" " + downCountFlow ;
}
@Override
public int compareTo(FlowBean o) {
System.out.println("------------------------------"+o);
return this.upFlow - o.upFlow;
}
}
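Note that compareTo only influences sorting when FlowBean is used as the map output key; in the job below the bean is the value, so the job sums and partitions the flows but does not sort by them. A minimal sketch of the change needed for a descending sort, assuming a second pass where the bean is emitted as K2:

```java
// In a sorting pass the mapper would emit the bean as the key:
//   context.write(flowBean, new Text(phonenumber));
// and compareTo would be flipped so larger upFlow values sort first:
@Override
public int compareTo(FlowBean o) {
    return o.upFlow - this.upFlow;   // descending by upFlow
}
```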
job code
package fllow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class flowjob extends Configured implements Tool{
@Override
public int run(String[] strings) throws Exception {
Job job=Job.getInstance(super.getConf(),"flowjob");
job.setJarByClass(flowjob.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node1:8020/flowinput"));
job.setMapperClass(folwmapp.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setPartitionerClass(flowpartition.class);
job.setReducerClass(flowreduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
job.setNumReduceTasks(3);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("hdfs://node1:8020/flowoutput/par"));
FileSystem fileSystem=FileSystem.get(new URI("hdfs://node1:8020"),new Configuration());
if(fileSystem.exists(new Path("hdfs://node1:8020/flowoutput"))){
fileSystem.delete(new Path("hdfs://node1:8020/flowoutput"),true);
}
boolean b=job.waitForCompletion(true);
return b ? 0:1;
}
public static void main(String[] args) throws Exception {
Configuration configuration=new Configuration();
int run = ToolRunner.run(configuration, new flowjob(), args);
System.exit(run);
}
}
20210215
Join of two files (reduce-side join)
map code (the mapper class itself was not recorded here; a hedged sketch follows below)
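The joinmap class referenced by the job below was not captured in these notes (the block above repeated the reduce class). Below is a minimal sketch of what it likely looked like, assuming comma-separated input where product records start with "p" (product id in field 0) and order records carry the product id in field 2; the field indexes are assumptions:

```java
package join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class joinmap extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] split = line.split(",");
        // Use the product id as the join key so matching product and order rows meet in one reduce call
        String joinKey = line.startsWith("p") ? split[0] : split[2];
        context.write(new Text(joinKey), new Text(line));
    }
}
```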
reduce code
package join;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class joinduce extends Reducer<Text,Text,Text,Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String coun1="";
String coun2="";
for (Text i:values){
if(i.toString().startsWith("p")){
coun1=i.toString();
}else {
coun2=i.toString();
}
}
System.out.println(coun1);
System.out.println(coun2);
context.write(key,new Text(coun1+coun2));
}
}
job code
package join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class joinjob extends Configured implements Tool {
@Override
public int run(String[] strings) throws Exception {
Job job= Job.getInstance(super.getConf(),"partitioner");
job.setJarByClass(joinjob.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node1:8020/joininput"));
job.setMapperClass(joinmap.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(joinduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("hdfs://node1:8020/joinouput"));
FileSystem fileSystem= FileSystem.get(new URI("hdfs://node1:8020"),new Configuration());
boolean file = fileSystem.exists(new Path("hdfs://node1:8020/joinouput"));
if (file){
fileSystem.delete(new Path("hdfs://node1:8020/joinouput"),true);
}
boolean b = job.waitForCompletion(true);
return b ? 0:1;
}
public static void main(String[] args) throws Exception {
Configuration configuration=new Configuration();
int run = ToolRunner.run(configuration, new joinjob(), args);
System.exit(run);
}
}
20210216
Counting common friends
Two MapReduce jobs are used in total.
Step 1: produce the intermediate file. For example, an input line A:B,C,D (user A's friends are B, C and D) is inverted by the map into (B, A), (C, A), (D, A); the reduce then concatenates, for each friend, all users who have that friend, producing lines like A-E-F-	B.
map code
package commonfriend;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class map extends Mapper<LongWritable,Text,Text,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split(":");
String[] split1 = split[1].toString().split(",");
for (String i:split1){
context.write(new Text(i),new Text(split[0]));
}
}
}
reduce code
package commonfriend;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class reduce extends Reducer<Text,Text,Text,Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String coun="";
for (Text i:values){
coun+=i.toString()+"-";
}
context.write(new Text(coun),key);
}
}
job code
package commonfriend;
import join.joinduce;
import join.joinjob;
import join.joinmap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class job extends Configured implements Tool {
@Override
public int run(String[] strings) throws Exception {
Job job= Job.getInstance(super.getConf(),"partitioner");
job.setJarByClass(job.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node1:8020/common/input1"));
job.setMapperClass(map.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("hdfs://node1:8020/common/ouput1"));
FileSystem fileSystem= FileSystem.get(new URI("hdfs://node1:8020"),new Configuration());
boolean file = fileSystem.exists(new Path("hdfs://node1:8020/common/ouput1"));
if (file){
fileSystem.delete(new Path("hdfs://node1:8020/common/ouput1"),true);
}
boolean b = job.waitForCompletion(true);
return b ? 0:1;
}
public static void main(String[] args) throws Exception {
Configuration configuration=new Configuration();
int run = ToolRunner.run(configuration, new job(), args);
System.exit(run);
}
}
Step 2
Process the intermediate file: the map splits each user list into user pairs and emits (pair, shared friend); the reduce then concatenates all shared friends for each pair.
map code
package commonfriend;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.lang.reflect.Array;
import java.util.Arrays;
public class map2 extends Mapper<LongWritable,Text,Text,Text> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
String[] split1 = split[0].toString().split("-");
Arrays.sort(split1);
System.out.println(split1[0]);
for(int i=0;i<split1.length-1;i++){
for (int j=i+1;j<split1.length;j++){ // start at i+1 so each pair is generated exactly once
if (!split1[i].equals(split1[j])){
context.write(new Text(split1[i]+"-"+split1[j]),new Text(split[1]));
}
}
}
}
}
reduce code
package commonfriend;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class reduce2 extends Reducer<Text,Text,Text,Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String coun="";
for (Text i:values){
coun+=i.toString()+"-";
}
String s=coun.substring(0,coun.length()-1);
context.write(key,new Text(s));
}
}
job code
package commonfriend;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class job2 extends Configured implements Tool {
@Override
public int run(String[] strings) throws Exception {
Job job= Job.getInstance(super.getConf(),"partitioner");
job.setJarByClass(job2.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node1:8020/common/ouput1"));
job.setMapperClass(map2.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(reduce2.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("hdfs://node1:8020/common/ouput2"));
FileSystem fileSystem= FileSystem.get(new URI("hdfs://node1:8020"),new Configuration());
boolean file = fileSystem.exists(new Path("hdfs://node1:8020/common/ouput2"));
if (file){
fileSystem.delete(new Path("hdfs://node1:8020/common/ouput2"),true);
}
boolean b = job.waitForCompletion(true);
return b ? 0:1;
}
public static void main(String[] args) throws Exception {
Configuration configuration=new Configuration();
int run = ToolRunner.run(configuration, new job2(), args);
System.exit(run);
}
}
Small-file merging
Custom input (InputFormat) class
package format;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
public class MyINputFormat extends FileInputFormat<NullWritable,BytesWritable> {
@Override
public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
MyRecordReader myRecordReader=new MyRecordReader();
myRecordReader.initialize(inputSplit,taskAttemptContext);
return myRecordReader;
}
@Override
protected boolean isSplitable(JobContext context, Path filename) {
return false;
}
}
package format;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class MyRecordReader extends RecordReader<NullWritable,BytesWritable> {
Configuration configuration=null;
FileSplit fileSplit=null;
boolean processec=false;
BytesWritable byteWritable=new BytesWritable();
FileSystem fileSystem=null;
FSDataInputStream fsDataInputStream=null;
// 用于初始化
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
configuration=taskAttemptContext.getConfiguration();
fileSplit= (FileSplit) inputSplit;
}
//用于获取k1,v1
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!processec){
fileSystem=FileSystem.get(configuration);
System.out.println(fileSplit.getPath());
fsDataInputStream=fileSystem.open(fileSplit.getPath());
byte[] bytes=new byte[(int) fileSplit.getLength()];
IOUtils.readFully(fsDataInputStream,bytes,0, (int) fileSplit.getLength());
byteWritable.set(bytes,0, (int) fileSplit.getLength());
processec=true;
return true;
}
return false;
}
//返回k1
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
return NullWritable.get();
}
//返回v1
@Override
public BytesWritable getCurrentValue() throws IOException, InterruptedException {
return byteWritable;
}
// 获取文件读取进度
@Override
public float getProgress() throws IOException, InterruptedException {
return 0;
}
@Override
public void close() throws IOException {
fileSystem.close();
fsDataInputStream.close();
}
}
map code
package format;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class map extends Mapper<NullWritable,BytesWritable,Text,BytesWritable>{
@Override
protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
FileSplit inputSplit = (FileSplit) context.getInputSplit();
String name = inputSplit.getPath().getName();
context.write(new Text(name),value);
}
}
job code
package format;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class job extends Configured implements Tool {
@Override
public int run(String[] strings) throws Exception {
Job job=Job.getInstance(super.getConf(),"format");
job.setInputFormatClass(MyINputFormat.class);
MyINputFormat.addInputPath(job,new Path("file:///C:\\Users\\asus\\Desktop\\jar"));
job.setMapperClass(map.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BytesWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
Path path=new Path("hdfs://node1:8020/format/output");
SequenceFileOutputFormat.setOutputPath(job,path);
FileSystem fileSystem=FileSystem.get(new URI("hdfs://node1:8020"),new Configuration());
if(fileSystem.exists(path)){
fileSystem.delete(path,true);
}
boolean b = job.waitForCompletion(true);
return b ? 0:1;
}
public static void main(String[] args) throws Exception {
Configuration configuration=new Configuration();
int run = ToolRunner.run(configuration, new job(), args);
System.out.println(run);
}
}
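To check that the merge worked, the SequenceFile produced by the job above can be read back. A minimal sketch; the part file name part-r-00000 is an assumption about the default single-reducer output:

```java
package format;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ReadMergedSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path seqPath = new Path("hdfs://node1:8020/format/output/part-r-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(seqPath));
        Text key = new Text();                     // original file name written by the map class
        BytesWritable value = new BytesWritable(); // full file contents
        while (reader.next(key, value)) {
            System.out.println(key + " -> " + value.getLength() + " bytes");
        }
        reader.close();
    }
}
```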
20210217
Custom file output: OutputFormat
map class
package outputFormat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class map extends Mapper<LongWritable,Text,Text,NullWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
context.write(value,NullWritable.get());
}
}
MyOutputFormat class
package outputFormat;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyOutputFormat extends FileOutputFormat<Text,NullWritable> {
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());
FSDataOutputStream fsDataOutputStream = fileSystem.create(new Path("hdfs://node1:8020/output1/output"));
FSDataOutputStream fsDataOutputStream1 = fileSystem.create(new Path("hdfs://node1:8020/output1/output1"));
MyRecordWriter myRecordWriter = new MyRecordWriter(fsDataOutputStream, fsDataOutputStream1);
return myRecordWriter;
}
}
MyRecordWriter class
package outputFormat;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
public class MyRecordWriter extends RecordWriter<Text,NullWritable> {
FSDataOutputStream fsDataOutputStream ;
FSDataOutputStream fsDataOutputStream1;
public MyRecordWriter(FSDataOutputStream fsDataOutputStream, FSDataOutputStream fsDataOutputStream1) {
this.fsDataOutputStream = fsDataOutputStream;
this.fsDataOutputStream1 = fsDataOutputStream1;
}
@Override
public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
String[] split = text.toString().split("\t");
int jiu=Integer.valueOf(split[9]);
if (jiu<=1){
fsDataOutputStream.write(text.toString().getBytes());
fsDataOutputStream.write("\r\n".getBytes());
}else {
fsDataOutputStream1.write(text.toString().getBytes());
fsDataOutputStream1.write("\r\n".getBytes());
}
}
@Override
public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
// fsDataOutputStream.close();
// fsDataOutputStream1.close();
IOUtils.closeStream(fsDataOutputStream);
IOUtils.closeStream(fsDataOutputStream1);
}
}
job class
package outputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class job extends Configured implements Tool {
@Override
public int run(String[] strings) throws Exception {
Job job= Job.getInstance(super.getConf(),"partitioner");
job.setJarByClass(job.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node1:8020/output/input"));
job.setMapperClass(outputFormat.map.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputFormatClass(MyOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("hdfs://node1:8020/output1"));
FileSystem fileSystem= FileSystem.get(new URI("hdfs://node1:8020"),new Configuration());
boolean file = fileSystem.exists(new Path("hdfs://node1:8020/output1"));
if (file){
fileSystem.delete(new Path("hdfs://node1:8020/output1"),true);
}
boolean b = job.waitForCompletion(true);
return b ? 0:1;
}
public static void main(String[] args)throws Exception{
Configuration configuration=new Configuration();
// configuration.addResource(new Path("/root/hadoop/hadoop-2.7.5/etc/hadoop/core-site.xml"));
// configuration.set("fs.defaultFS", "hdfs://node1:8020");
int run = ToolRunner.run(configuration, new job(), args);
System.exit(run);
}
}
20210218
Hive installation
cd /export/softwares/
tar -zxvf apache-hive-2.1.1-bin.tar.gz -C ../servers/
Edit hive-env.sh:
cd /export/servers/apache-hive-2.1.1-bin/conf
cp hive-env.sh.template hive-env.sh
HADOOP_HOME=/export/servers/hadoop-2.7.5
export HIVE_CONF_DIR=/export/servers/apache-hive-2.1.1-bin/conf
Edit hive-site.xml:
cd /export/servers/apache-hive-2.1.1-bin/conf
vim hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://node03:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<property>
<name>datanucleus.schema.autoCreateAll</name>
<value>true</value>
</property>
<property>
<name>hive.server2.thrift.bind.host</name>
<value>node03</value>
</property>
</configuration>
Hive stores its metadata in MySQL, so it needs to connect to the MySQL database; add a MySQL JDBC driver to the Hive installation and Hive is then ready to start: upload the prepared mysql-connector-java-5.1.38.jar into /export/servers/apache-hive-2.1.1-bin/lib.
At this point the Hive installation and deployment is complete; next we look at Hive's three interaction modes.
Step 5: configure Hive's environment variables. Run the following on the node03 server:
sudo vim /etc/profile
export HIVE_HOME=/export/servers/apache-hive-2.1.1-bin
export PATH=:$HIVE_HOME/bin:$PATH
2.6. Hive interaction modes
First interaction mode: bin/hive
20210220
Hive operations practice
HQL statements
20210227
Installation
Upload and extract the HBase installation package:
tar -xvzf hbase-2.1.0.tar.gz -C ../server/
Edit the HBase configuration files
hbase-env.sh
cd /export/server/hbase-2.1.0/conf
vim hbase-env.sh
# around line 28
export JAVA_HOME=/export/server/jdk1.8.0_241/
export HBASE_MANAGES_ZK=false
2.1.2.2 hbase-site.xml
vim hbase-site.xml
------------------------------
<configuration>
<!-- HBase数据在HDFS中的存放的路径 -->
<property>
<name>hbase.rootdir</name>
<value>hdfs://node1.itcast.cn:8020/hbase</value>
</property>
<!-- Hbase的运行模式。false是单机模式,true是分布式模式。若为false,Hbase和Zookeeper会运行在同一个JVM里面 -->
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<!-- ZooKeeper的地址 -->
<property>
<name>hbase.zookeeper.quorum</name>
<value>node1.itcast.cn,node2.itcast.cn,node3.itcast.cn</value>
</property>
<!-- ZooKeeper快照的存储位置 -->
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/export/server/apache-zookeeper-3.6.0-bin/data</value>
</property>
<!-- V2.1版本,在分布式情况下, 设置为false -->
<property>
<name>hbase.unsafe.stream.capability.enforce</name>
<value>false</value>
</property>
</configuration>
Configure environment variables
# Configure HBase environment variables
vim /etc/profile
export HBASE_HOME=/export/server/hbase-2.1.0
export PATH=$PATH:${HBASE_HOME}/bin:${HBASE_HOME}/sbin
# Load the environment variables
source /etc/profile
2.1.4 Copy the jar into lib
cp $HBASE_HOME/lib/client-facing-thirdparty/htrace-core-3.1.0-incubating.jar $HBASE_HOME/lib/
Edit the regionservers file
vim regionservers
node1.itcast.cn
node2.itcast.cn
node3.itcast.cn
Distribute the installation package and configuration files
cd /export/server
scp -r hbase-2.1.0/ node2.itcast.cn:$PWD
scp -r hbase-2.1.0/ node3.itcast.cn:$PWD
scp -r /etc/profile node2.itcast.cn:/etc
scp -r /etc/profile node3.itcast.cn:/etc
Load the environment variables on node2.itcast.cn and node3.itcast.cn:
source /etc/profile
Start HBase
cd /export/onekey
# Start ZooKeeper
./start-zk.sh
# Start Hadoop
start-dfs.sh
# Start HBase
start-hbase.sh
2.1.8 Verify that HBase started successfully
# Start the hbase shell client
hbase shell
# Type status
[root@node1 onekey]# hbase shell
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/export/server/hadoop-2.7.5/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/export/server/hbase-2.1.0/lib/client-facing-thirdparty/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
HBase Shell
Use "help" to get list of supported commands.
Use "exit" to quit this interactive shell.
Version 2.1.0, re1673bb0bbfea21d6e5dba73e013b09b8b49b89b, Tue Jul 10 17:26:48 CST 2018
Took 0.0034 seconds
Ignoring executable-hooks-1.6.0 because its extensions are not built. Try: gem pristine executable-hooks --version 1.6.0
Ignoring gem-wrappers-1.4.0 because its extensions are not built. Try: gem pristine gem-wrappers --version 1.4.0
2.4.1 :001 > status
1 active master, 0 backup masters, 3 servers, 0 dead, 0.6667 average load
Took 0.4562 seconds
2.4.1 :002 >
WebUI
http://node1.itcast.cn:16010/master-status
20210301
Java API operations
1. Import dependencies
<repositories><!-- repositories -->
<repository>
<id>aliyun</id>
<url>http://maven.aliyun.com/nexus/content/groups/public/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
<updatePolicy>never</updatePolicy>
</snapshots>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.1.0</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.14.3</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<target>1.8</target>
<source>1.8</source>
</configuration>
</plugin>
</plugins>
</build>
2. Copy the HBase and Hadoop configuration files
Copy the following three configuration files into the resources directory:
hbase-site.xml
Download from Linux with: sz /export/server/hbase-2.1.0/conf/hbase-site.xml
core-site.xml
Download from Linux with: sz /export/server/hadoop-2.7.5/etc/hadoop/core-site.xml
log4j.properties
Creating a table
package api_test;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.testng.annotations.AfterTest;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import java.io.IOException;
public class TableminTest {
private Connection connection;
private Admin admin;
@BeforeTest
public void beforeTest() throws IOException {
Configuration configuration= HBaseConfiguration.create();
connection= ConnectionFactory.createConnection(configuration);
admin=connection.getAdmin();
}
@Test
public void createtable() throws IOException {
TableName tableName = TableName.valueOf("WATER_BILL");
// 1. 判断表是否存在
if(admin.tableExists(tableName)) {
// a) 存在,则退出
return;
}
// 构建表
// 2. 使用TableDescriptorBuilder.newBuilder构建表描述构建器
// TableDescriptor: 表描述器,描述这个表有几个列蔟、其他的属性都是在这里可以配置
TableDescriptorBuilder tableDescriptorBuilder = TableDescriptorBuilder.newBuilder(tableName);
// 3. 使用ColumnFamilyDescriptorBuilder.newBuilder构建列蔟描述构建器
// 创建列蔟也需要有列蔟的描述器,需要用一个构建起来构建ColumnFamilyDescriptor
// 经常会使用到一个工具类:Bytes(hbase包下的Bytes工具类)
// 这个工具类可以将字符串、long、double类型转换成byte[]数组
// 也可以将byte[]数组转换为指定类型
ColumnFamilyDescriptorBuilder columnFamilyDescriptorBuilder = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("C1"));
// 4. 构建列蔟描述,构建表描述
ColumnFamilyDescriptor cfDes = columnFamilyDescriptorBuilder.build();
// 建立表和列蔟的关联
tableDescriptorBuilder.setColumnFamily(cfDes);
TableDescriptor tableDescriptor = tableDescriptorBuilder.build();
// 5. 创建表
admin.createTable(tableDescriptor);
}
@Test
public void droptableTest() throws IOException {
TableName tableName=TableName.valueOf("WATER_BILL");
if (admin.tableExists(tableName)){
admin.disableTable(tableName);
admin.deleteTable(tableName);
}
}
@AfterTest
public void afterTest() throws IOException {
admin.close();
connection.close();
}
}
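To double-check that createtable worked, the Admin API can list the tables in the cluster. A small sketch that would slot into the same test class, reusing the admin field initialized in beforeTest:

```java
@Test
public void listTablesSketch() throws IOException {
    // WATER_BILL should show up here after createtable has run
    for (TableName name : admin.listTableNames()) {
        System.out.println(name.getNameAsString());
    }
}
```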
Operating on table data
package data_text;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.SingleColumnValueExcludeFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.testng.annotations.AfterTest;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
public class Datatest {
private Connection connection;
@BeforeTest
public void beforeTest() throws IOException {
Configuration configuration= HBaseConfiguration.create();
connection= ConnectionFactory.createConnection(configuration);
}
@Test
public void putTest() throws IOException {
Table tablename = connection.getTable(TableName.valueOf("mai"));
ArrayList<String> strings = toArrayByFileReader1();
for (String i : strings){
try {
String[] split = i.split(" ");
String rowKey=split[3];
String colunName="C1";
// String name="NAME";
Put put = new Put(Bytes.toBytes(rowKey));
System.out.println(split.length);
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("one"),Bytes.toBytes(split[0]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("two"),Bytes.toBytes(split[1]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("three"),Bytes.toBytes(split[2]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("four"),Bytes.toBytes(split[3]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("five"),Bytes.toBytes(split[4]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("six"),Bytes.toBytes(split[5]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("seven"),Bytes.toBytes(split[6]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("eight"),Bytes.toBytes(split[7]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("nine"),Bytes.toBytes(split[8]));
put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes("ten"),Bytes.toBytes(split[9]));
tablename.put(put);
}catch (Exception e){
}
}
// close the table once, after all rows have been written (closing it inside the loop breaks later puts)
tablename.close();
// put.addColumn(Bytes.toBytes(colunName),Bytes.toBytes(name),Bytes.toBytes("朱国滔"));
}
// 读取数据
@Test
public void getdata() throws IOException {
Table tablename = connection.getTable(TableName.valueOf("WATER_BILL"));
Get get = new Get(Bytes.toBytes("00001"));
Result resul=tablename.get(get);
List<Cell> cellList=resul.listCells();
byte[] rowkey=resul.getRow();
System.out.println(Bytes.toString(rowkey));
System.out.println(rowkey);
for (Cell cell:cellList){
String s = Bytes.toString(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength());
String s1 = Bytes.toString(cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
String s2 = Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
System.out.println(s+":"+s1+"->"+s2);
}
tablename.close();
}
@Test
public void dropdata() throws IOException {
Table tablename = connection.getTable(TableName.valueOf("WATER_BILL"));
Delete delete=new Delete(Bytes.toBytes("00001"));
tablename.delete(delete);
tablename.close();
}
@Test
public void scandata() throws IOException {
Table tablename = connection.getTable(TableName.valueOf("WATER_BILL"));
Scan scan = new Scan();
SingleColumnValueExcludeFilter startsingleColumnValueExcludeFilter = new SingleColumnValueExcludeFilter(Bytes.toBytes("C1")
, Bytes.toBytes("RECORD_DATE")
, CompareOperator.GREATER_OR_EQUAL,
new BinaryComparator(Bytes.toBytes("2020-06-01")));
SingleColumnValueExcludeFilter stopsingleColumnValueExcludeFilter = new SingleColumnValueExcludeFilter(Bytes.toBytes("C1")
, Bytes.toBytes("RECORD_DATE")
, CompareOperator.LESS_OR_EQUAL,
new BinaryComparator(Bytes.toBytes("2020-06-30")));
FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL, startsingleColumnValueExcludeFilter, stopsingleColumnValueExcludeFilter);
scan.setFilter(filterList);
ResultScanner results=tablename.getScanner(scan);
Iterator<Result> iterator = results.iterator();
while (iterator.hasNext()){
Result resul = iterator.next();
List<Cell> cellList=resul.listCells();
byte[] rowkey=resul.getRow();
System.out.println(Bytes.toString(rowkey));
System.out.println(rowkey);
String s2="";
for (Cell cell:cellList){
String s = Bytes.toString(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength());
String s1 = Bytes.toString(cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
if (s1.equals("NUM_CURRENT")||s1.equals("NUM_PREVIOUS")||s1.equals("NUM_USAGE")||s1.equals("TOTAL_MONEY")){
s2 = Bytes.toDouble(cell.getValueArray())+"";
}else {
s2 = Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
}
System.out.println(s+":"+s1+"->"+s2);
}
}
results.close();
tablename.close();
}
@Test
public ArrayList<String> toArrayByFileReader1() {
// 使用ArrayList来存储每行读取到的字符串
ArrayList<String> arrayList = new ArrayList<>();
try {
FileReader fr = new FileReader("C:\\Users\\asus\\Desktop\\data\\shuju.txt");
BufferedReader bf = new BufferedReader(fr);
String str;
// 按行读取字符串
while ((str = bf.readLine()) != null) {
arrayList.add(str);
}
bf.close();
fr.close();
} catch (IOException e) {
e.printStackTrace();
}
// 对ArrayList中存储的字符串进行处理
for (String i : arrayList){
String[] split = i.split(" ");
System.out.println(split.length);
System.out.println(split[0]);
}
// 返回数组
return arrayList;
}
@AfterTest
public void afterTest() throws IOException {
connection.close();
}
}
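If only specific columns are needed, the Get used in getdata above can be narrowed with addColumn. A minimal sketch; the C1:NAME column is an assumption taken from the commented-out line in putTest:

```java
Get get = new Get(Bytes.toBytes("00001"));
get.addColumn(Bytes.toBytes("C1"), Bytes.toBytes("NAME"));   // fetch only C1:NAME
Table table = connection.getTable(TableName.valueOf("WATER_BILL"));
Result result = table.get(get);
System.out.println(Bytes.toString(result.getValue(Bytes.toBytes("C1"), Bytes.toBytes("NAME"))));
table.close();
```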
20210303
MoMo chat data import / query service
package momo_chat.service.impl;
import momo_chat.entity.MSG;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
public class HBaseNativeceMessage implements ChatMessageService {
private Connection connection;
private SimpleDateFormat simpleDateFormat;
public HBaseNativeceMessage() throws IOException {
Configuration configuration = HBaseConfiguration.create();
connection= ConnectionFactory.createConnection(configuration);
}
@Override
public List<MSG> getMessage(String date, String sender, String receiver) throws Exception {
Scan scan = new Scan();
String startDateStr=date+" 00:00:00";
String endDateStr=date+" 23:59:59";
SingleColumnValueFilter startsingleColumnValueFilter = new SingleColumnValueFilter(Bytes.toBytes("C1")
, Bytes.toBytes("msg_time")
, CompareOperator.GREATER_OR_EQUAL
, new BinaryComparator(Bytes.toBytes(startDateStr)));
SingleColumnValueFilter endsingleColumnValueFilter = new SingleColumnValueFilter(Bytes.toBytes("C1")
, Bytes.toBytes("msg_time")
, CompareOperator.LESS_OR_EQUAL
, new BinaryComparator(Bytes.toBytes(endDateStr)));
SingleColumnValueFilter sendsingleColumnValueFilter = new SingleColumnValueFilter(Bytes.toBytes("C1")
, Bytes.toBytes("sender_account")
, CompareOperator.EQUAL
, new BinaryComparator(Bytes.toBytes(sender)));
SingleColumnValueFilter receiversingleColumnValueFilter = new SingleColumnValueFilter(Bytes.toBytes("C1")
, Bytes.toBytes("receiver_account")
, CompareOperator.EQUAL
, new BinaryComparator(Bytes.toBytes(receiver)));
FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL
, startsingleColumnValueFilter
, endsingleColumnValueFilter
, sendsingleColumnValueFilter
, receiversingleColumnValueFilter);
scan.setFilter(filterList);
Table table = connection.getTable(TableName.valueOf("MOMO_CHAT:MSG"));
ResultScanner scanner = table.getScanner(scan);
Iterator<Result> iterable=scanner.iterator();
ArrayList<MSG> objects = new ArrayList<>();
while (iterable.hasNext()){
Result result=iterable.next();
MSG msg=new MSG();
String s = Bytes.toString(result.getRow());
List<Cell> cellList=result.listCells();
for (Cell cell:cellList){
String colimname=Bytes.toString(cell.getQualifierArray(),cell.getQualifierOffset(),cell.getQualifierLength());
if (colimname.equals("msg_time")){
msg.setMsg_time(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("sender_nickyname")){
msg.setSender_nickyname(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("sender_account")){
msg.setSender_account(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("sender_sex")){
msg.setSender_sex(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("sender_ip")){
msg.setSender_ip(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("sender_os")){
msg.setSender_os(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("sender_phone_type")){
msg.setSender_phone_type(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("sender_network")){
msg.setSender_network(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("sender_gps")){
msg.setSender_gps(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("receiver_nickyname")){
msg.setReceiver_nickyname(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("receiver_ip")){
msg.setReceiver_ip(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("receiver_account")){
msg.setReceiver_account(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("receiver_os")){
msg.setReceiver_os(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("receiver_phone_type")){
msg.setReceiver_phone_type(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("receiver_network")){
msg.setReceiver_network(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("receiver_gps")){
msg.setReceiver_gps(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("receiver_sex")){
msg.setReceiver_sex(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("msg_type")){
msg.setMsg_type(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("distance")){
msg.setDistance(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
if(colimname.equals("message")){
msg.setMessage(Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()));
}
}
// add the fully populated MSG once per row, after all of its cells have been read
objects.add(msg);
}
// close the scanner and table only after the whole scan has been consumed
scanner.close();
table.close();
return objects;
}
@Override
public void close() throws IOException {
connection.close();
}
}
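A minimal usage sketch for the service above, assuming the MOMO_CHAT:MSG table is already populated and hbase-site.xml is on the classpath; the demo class name is illustrative, and the date and accounts reuse the values from the index query example later in these notes.
package momo_chat.service.impl;
import momo_chat.entity.MSG;
import java.util.List;
// Hypothetical demo class; relies on ChatMessageService and HBaseNativeceMessage from above
public class HBaseNativeMessageDemo {
    public static void main(String[] args) throws Exception {
        ChatMessageService service = new HBaseNativeceMessage();
        // query one day of chat between two accounts (placeholder values)
        List<MSG> messages = service.getMessage("2020-08-29", "13504113666", "18182767005");
        for (MSG msg : messages) {
            System.out.println(msg);
        }
        service.close();
    }
}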
20210304
6.2 Installing Phoenix
6.2.1 Download
Download the Phoenix release that matches your HBase version from the official site. For HBase 2.1, use version "5.0.0-HBase-2.0".
http://phoenix.apache.org/download.html
You can also use the package provided in the course materials.
6.2.2 Installation
1. Upload the package to the Linux system and extract it
cd /export/software
tar -xvzf apache-phoenix-5.0.0-HBase-2.0-bin.tar.gz -C ../server/
2. Copy all of the Phoenix jar files into the HBase lib directory on every RegionServer and Master node
# Copy the jar files into the hbase lib directory
cp /export/server/apache-phoenix-5.0.0-HBase-2.0-bin/phoenix-*.jar /export/server/hbase-2.1.0/lib/
# Change into the hbase lib directory
cd /export/server/hbase-2.1.0/lib/
# Distribute the jar files to every HBase node
scp phoenix-*.jar node2.itcast.cn:$PWD
scp phoenix-*.jar node3.itcast.cn:$PWD
3. Edit the configuration file
cd /export/server/hbase-2.1.0/conf/
vim hbase-site.xml
------
# 1. Append the following properties to hbase-site.xml
<!-- Enable HBase namespace mapping -->
<property>
<name>phoenix.schema.isNamespaceMappingEnabled</name>
<value>true</value>
</property>
<!-- Enable indexed WAL encoding (needed for Phoenix secondary indexes) -->
<property>
<name>hbase.regionserver.wal.codec</name>
<value>org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec</value>
</property>
# 2. Distribute hbase-site.xml to every node
scp hbase-site.xml node2.itcast.cn:$PWD
scp hbase-site.xml node3.itcast.cn:$PWD
4. Copy the updated hbase-site.xml into the Phoenix bin directory
cp /export/server/hbase-2.1.0/conf/hbase-site.xml /export/server/apache-phoenix-5.0.0-HBase-2.0-bin/bin/
5. Restart HBase
stop-hbase.sh
start-hbase.sh
6. Start the Phoenix client and connect to the Phoenix server
Note: the first time Phoenix connects to HBase it will be a little slow.
cd /export/server/apache-phoenix-5.0.0-HBase-2.0-bin/
bin/sqlline.py node1.itcast.cn:2181
# Enter !table to list the tables in Phoenix
20210306
Create table statement:
create table if not exists ORDER_DTL(
ID varchar primary key,
C1.STATUS varchar,
C1.MONEY float,
C1.PAY_WAY integer,
C1.USER_ID varchar,
C1.OPERATION_TIME varchar,
C1.CATEGORY varchar
);
6.4 View table information
!desc ORDER_DTL
6.4.1 Drop table syntax
drop table if exists ORDER_DTL;
6.4.2 Case sensitivity
If column family and column names are not wrapped in double quotes, Phoenix automatically converts them to upper case.
For example:
create table if not exists ORDER_DTL(
id varchar primary key,
C1.status varchar,
C1.money double,
C1.pay_way integer,
C1.user_id varchar,
C1.operation_time varchar,
C1.category varchar
);
Insert data
upsert into table_name(column_family.column_name, xxxx, ) VALUES(XXX, XXX, XXX)
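For a concrete example, here is a hedged sketch that upserts one row into the ORDER_DTL table defined above through the Phoenix JDBC driver (the same jdbc:phoenix:node1:2181 URL used later in these notes); the class name and all row values are illustrative placeholders.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
public class OrderDtlUpsertDemo {
    public static void main(String[] args) throws Exception {
        // load the Phoenix JDBC driver and connect through ZooKeeper
        Class.forName("org.apache.phoenix.jdbc.PhoenixDriver");
        Connection conn = DriverManager.getConnection("jdbc:phoenix:node1:2181");
        // UPSERT inserts a new row, or updates the existing row with the same primary key
        String sql = "UPSERT INTO ORDER_DTL(ID, C1.STATUS, C1.MONEY, C1.PAY_WAY, C1.USER_ID, C1.OPERATION_TIME, C1.CATEGORY) "
                + "VALUES(?, ?, ?, ?, ?, ?, ?)";
        try (PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setString(1, "000001");              // placeholder order id
            ps.setString(2, "PAID");                // placeholder status
            ps.setFloat(3, 99.9f);                  // placeholder amount
            ps.setInt(4, 1);                        // placeholder pay way
            ps.setString(5, "8237476");             // user id reused from a later query example
            ps.setString(6, "2021-03-06 10:00:00"); // placeholder operation time
            ps.setString(7, "PHONE");               // placeholder category
            ps.executeUpdate();
        }
        // Phoenix connections do not auto-commit by default, so commit explicitly
        conn.commit();
        conn.close();
    }
}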
List all tables
!table
6.4.4.3 Query data by ID
SELECT * FROM ORDER_DTL WHERE "id" = '000001';
6.4.5 Delete data by ID
DELETE FROM ORDER_DTL WHERE "id" = '000001';
6.4.7 Paged queries
limit and offset make pagination straightforward.
limit is the number of records per page, and offset is the record to start from.
-- Page 1
select * from ORDER_DTL limit 10 offset 0;
-- Page 2
-- offset starts at 10
select * from ORDER_DTL limit 10 offset 10;
-- Page 3
select * from ORDER_DTL limit 10 offset 20;
6.5.1 ROWKEY pre-splitting
Pre-split the table by user ID into 4 regions (the three split points below produce four regions), and set the data compression format to GZ.
drop table if exists ORDER_DTL;
create table if not exists ORDER_DTL(
"id" varchar primary key,
C1."status" varchar,
C1."money" float,
C1."pay_way" integer,
C1."user_id" varchar,
C1."operation_time" varchar,
C1."category" varchar
)
COMPRESSION='GZ'
SPLIT ON ('3','5','7');
6.5.2 Salted table with a specified number of buckets
drop table if exists ORDER_DTL;
create table if not exists ORDER_DTL(
"id" varchar primary key,
C1."status" varchar,
C1."money" float,
C1."pay_way" integer,
C1."user_id" varchar,
C1."operation_time" varchar,
C1."category" varchar
)
COMPRESSION='GZ', SALT_BUCKETS=10;
Mapping a table in HBase
CREATE VIEW "my_hbase_table"
( k VARCHAR primary key, "v" UNSIGNED_LONG) default_column_family='a';
-- Map a table in Phoenix
CREATE VIEW my_view ( new_col SMALLINT )
AS SELECT * FROM my_table WHERE k = 100;
-- Map onto a SQL query
CREATE VIEW my_view_on_view
AS SELECT * FROM my_view WHERE new_col > 70;
Reference create statement:
-- Create the MOMO_CHAT:MSG view
create view if not exists "MOMO_CHAT"."MSG" (
"pk" varchar primary key, -- map the ROWKEY to the primary key
"C1"."msg_time" varchar,
"C1"."sender_nickyname" varchar,
"C1"."sender_account" varchar,
"C1"."sender_sex" varchar,
"C1"."sender_ip" varchar,
"C1"."sender_os" varchar,
"C1"."sender_phone_type" varchar,
"C1"."sender_network" varchar,
"C1"."sender_gps" varchar,
"C1"."receiver_nickyname" varchar,
"C1"."receiver_ip" varchar,
"C1"."receiver_account" varchar,
"C1"."receiver_os" varchar,
"C1"."receiver_phone_type" varchar,
"C1"."receiver_network" varchar,
"C1"."receiver_gps" varchar,
"C1"."receiver_sex" varchar,
"C1"."msg_type" varchar,
"C1"."distance" varchar,
"C1"."message" varchar
);
20210308
Phoenix JDBC connection
package momo_chat.service.impl;
import momo_chat.entity.MSG;
import java.io.IOException;
import java.sql.*;
import java.util.ArrayList;
import java.util.List;
public class PhoenixChatMessageService implements ChatMessageService{
private Connection connection;
public PhoenixChatMessageService() throws ClassNotFoundException, SQLException {
// load the Phoenix JDBC driver
Class.forName("org.apache.phoenix.jdbc.PhoenixDriver");
connection= DriverManager.getConnection("jdbc:phoenix:node1:2181");
}
@Override
public List<MSG> getMessage(String date, String sender, String receiver) throws Exception {
String sql="select * from MOMO_CHAT.MSG limit ? offset ?";
PreparedStatement preparedStatement = connection.prepareStatement(sql);
preparedStatement.setInt(1,3);
preparedStatement.setInt(2,5);
ResultSet resultSet=preparedStatement.executeQuery();
List<MSG> objects = new ArrayList<>();
while (resultSet.next()){
MSG msg=new MSG();
msg.setMsg_time(resultSet.getString("msg_time"));
msg.setSender_nickyname(resultSet.getString("sender_nickyname"));
msg.setSender_account(resultSet.getString("sender_account"));
msg.setSender_sex(resultSet.getString("sender_sex"));
msg.setSender_ip(resultSet.getString("sender_ip"));
msg.setSender_os(resultSet.getString("sender_os"));
msg.setSender_phone_type(resultSet.getString("sender_phone_type"));
msg.setSender_network(resultSet.getString("sender_network"));
msg.setSender_gps(resultSet.getString("sender_gps"));
msg.setReceiver_nickyname(resultSet.getString("receiver_nickyname"));
msg.setReceiver_ip(resultSet.getString("receiver_ip"));
msg.setReceiver_account(resultSet.getString("receiver_account"));
msg.setReceiver_os(resultSet.getString("receiver_os"));
msg.setReceiver_phone_type(resultSet.getString("receiver_phone_type"));
msg.setReceiver_network(resultSet.getString("receiver_network"));
msg.setReceiver_gps(resultSet.getString("receiver_gps"));
msg.setReceiver_sex(resultSet.getString("receiver_sex"));
msg.setMsg_type(resultSet.getString("msg_type"));
msg.setDistance(resultSet.getString("distance"));
msg.setMessage(resultSet.getString("message"));
objects.add(msg);
}
resultSet.close();
preparedStatement.close();
return objects;
}
@Override
public void close() throws IOException, SQLException {
connection.close();
}
}
20210309
Global index
Creation syntax:
CREATE INDEX index_name ON table_name (col1, col2, col3...)
Local index
CREATE LOCAL INDEX index_name ON table_name (col1, col2, col3...)
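For a concrete example, here is a hedged sketch that creates the ORDER_DTL indexes referenced by the drop and hint statements later in this section, executing the DDL through the Phoenix JDBC driver; the indexed columns are assumptions chosen for illustration.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
public class CreateOrderDtlIndexes {
    public static void main(String[] args) throws Exception {
        // load the Phoenix JDBC driver and connect through ZooKeeper
        Class.forName("org.apache.phoenix.jdbc.PhoenixDriver");
        Connection conn = DriverManager.getConnection("jdbc:phoenix:node1:2181");
        try (Statement stmt = conn.createStatement()) {
            // global index on the user id column (assumed indexed column)
            stmt.execute("CREATE INDEX GBL_IDX_ORDER_DTL ON ORDER_DTL(\"user_id\")");
            // global index on the operation time column (assumed indexed column)
            stmt.execute("CREATE INDEX IDX_ORDER_DTL_DATE ON ORDER_DTL(\"operation_time\")");
            // local index on the user id column
            stmt.execute("CREATE LOCAL INDEX LOCAL_IDX_ORDER_DTL ON ORDER_DTL(\"user_id\")");
        }
        conn.close();
    }
}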
Covered index
CREATE INDEX my_index ON my_table (v1,v2) INCLUDE(v3)
Functional index
CREATE INDEX UPPER_NAME_IDX ON EMP (UPPER(FIRST_NAME||' '||LAST_NAME))
-- The following query will use the index
SELECT EMP_ID FROM EMP WHERE UPPER(FIRST_NAME||' '||LAST_NAME)='JOHN DOE'
Drop a global index
drop index IDX_ORDER_DTL_DATE on ORDER_DTL;
Drop a local index
drop index LOCAL_IDX_ORDER_DTL on ORDER_DTL;
Trace a query (view its execution plan)
explain select * from ORDER_DTL where "user_id" = '8237476';
Force the use of an index
explain select /*+ INDEX(ORDER_DTL GBL_IDX_ORDER_DTL) */ * from ORDER_DTL where "user_id" = '8237476';
7.3.4 Efficient queries using Phoenix secondary indexes
7.3.4.1 Create a local functional index
CREATE LOCAL INDEX LOCAL_IDX_MOMO_MSG ON MOMO_CHAT.MSG(substr("msg_time", 0, 10), "sender_account", "receiver_account");
7.3.4.2 Run the query
SELECT * FROM "MOMO_CHAT"."MSG" T
WHERE substr("msg_time", 0, 10) = '2020-08-29'
AND T."sender_account" = '13504113666'
AND T."receiver_account" = '18182767005' LIMIT 100;
3.24
Install Scala
Simply extract the archive
tar -zxvf scala-2.12.5.tgz
Set the environment variables
export SCALA_HOME=/root/scala/scala-2.12.5
export PATH=/root/scala/scala-2.12.5/bin:$PATH
Install Spark
Extract the archive
tar -zxvf spark-2.4.7-bin-hadoop2.7.tar.gz
Go into the conf directory
cd conf
Edit the slaves configuration file and list the worker nodes
vim slaves
Edit spark-env.sh
vim spark-env.sh
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64/jre
export SCALA_HOME=/root/scala/scala-2.12.5
export HADOOP_HOME=/root/hadoop/hadoop-2.7.5
export HADOOP_CONF_DIR=/root/hadoop/hadoop-2.7.5/etc/hadoop
export SPARK_MASTER_IP=node1
Edit log4j.properties
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
Edit spark-defaults.conf
spark.eventLog.enabled true
spark.eventLog.dir hdfs://node1:8020/spark/eventlogs/
spark.history.fs.logDirectory hdfs://node1:8020/spark/eventlogs
spark.eventLog.compress true
Create the event log directory on HDFS first, e.g. hdfs dfs -mkdir -p /spark/eventlogs
scp the Spark directory to the worker nodes
Start the cluster on the master node with start-all.sh
3.27
Connecting to Spark from code (Scala API)
Put core-site.xml, hdfs-site.xml, and log4j.properties under the resources directory
Code
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object spark {
def main(args: Array[String]): Unit = {
val sparkConf:SparkConf=new SparkConf().setAppName("spark").setMaster("local[2]")
val sc:SparkContext=new SparkContext(sparkConf)
val inputRDD: RDD[String] =sc.textFile(path = "/2019electives/input/2019electives.csv")
val resultRDD=inputRDD.flatMap(line =>line.split(",")).map(word =>(word,1)).reduceByKey(_ + _)
// resultRDD.foreach(tuple => println(tuple))
// resultRDD.saveAsTextFile(path = "/datas/ideaspark")
resultRDD.map(tuple =>tuple.swap).sortByKey(ascending = false).take(num = 3).foreach(tuple =>println(tuple))
resultRDD.sortBy(tuple => tuple._2, ascending = false)
.take(3)
.foreach(tuple =>println(tuple))
sc.stop()
}
}
2021.04.11
Practice case: Sogou search log analysis
Bean code
package bean
case class SogouRecord (
queryTime:String,
userId:String,
queryWords:String,
resultRank:Int,
clickRank:Int,
clickUrl:String
)
Main class code
package service
import bean.SogouRecord
import com.hankcs.hanlp.HanLP
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}
object SougouMap {
def main(args: Array[String]): Unit = {
val sc:SparkContext={
val sparkConf:SparkConf=new SparkConf()
.setAppName(this.getClass.getSimpleName.stripSuffix("$"))
.setMaster("local[2]")
SparkContext.getOrCreate(sparkConf)
}
val sogouRDD= sc.textFile(path = "data/sogou/SogouQ.txt")
// println(sogouRDD.count())
// sogouRDD.foreach(word=>println(word))
val eltRDD=sogouRDD
.filter{line=>null !=line && line.trim.split("\\s+").length==6
}
.mapPartitions{iter=>
iter.map{line=>
val array=line.trim.split("\\s+")
SogouRecord(
array(0),
array(1),
array(2).replace("[","").replace("]",""),
array(3).toInt,
array(4).toInt,
array(5)
)
}
}
eltRDD.persist(StorageLevel.MEMORY_AND_DISK)
// eltRDD.foreach(line=>println(line))
// println(eltRDD.first())
// Search keyword statistics
val recordRdd=eltRDD
.filter(record=> null != record.queryWords && record.queryWords.trim.length>0)
.flatMap{record=>
val queryWords=record.queryWords
val segment = HanLP.segment(queryWords)
import scala.collection.JavaConverters._
segment.asScala.map{term=>(term.word,1)}
}
.reduceByKey(_+_)
// recordRdd.foreach{word=>println(word)}
recordRdd.sortBy(tuple=>tuple._2,ascending = false)
.take(1)
.foreach(println)
// println(recordRdd.count())
// Per-user search count statistics
val preuserRDD=eltRDD.mapPartitions{iter=>
iter.map{record=>
val userId=record.userId
val querword=record.queryWords
((userId,querword),1)
}
}
.reduceByKey(_+_)
val restRDD = preuserRDD.map(tuple=>tuple._2)
restRDD.take(10).foreach(println)
println(restRDD.max())
// Search counts by time of day, aggregated by hour
eltRDD.map{record=>
val hourstr=record.queryTime.substring(0,2)
(hourstr,1)
}
.reduceByKey(_+_)
// .sortBy(tuple=>tuple._2,ascending = false)
.top(24)(Ordering.by(tuple=>tuple._2))
.foreach(println)
eltRDD.unpersist()
sc.stop()
}
}
POM dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>spark</artifactId>
<groupId>spark</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>demo4</artifactId>
<repositories>
<repository>
<id>aliyun</id>
<url>http://maven.aliyun.com/nexus/content/groups/public/</url>
</repository>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
<repository>
<id>jboss</id>
<url>http://repository.jboss.com/nexus/content/groups/public</url>
</repository>
</repositories>
<properties>
<scala.version>2.11.12</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<spark.version>2.4.5</spark.version>
<hadoop.version>2.6.0-cdh5.16.2</hadoop.version>
<hbase.version>1.2.0-cdh5.16.2</hbase.version>
<mysql.version>8.0.19</mysql.version>
</properties>
<dependencies>
<!-- Scala language dependency -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Spark Core dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark SQL dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Hadoop client dependency -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- HBase client dependencies -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-hadoop2-compat</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
</dependency>
<!-- MySQL client dependency -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.hankcs/hanlp -->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.7</version>
</dependency>
</dependencies>
<build>
<outputDirectory>target/classes</outputDirectory>
<testOutputDirectory>target/test-classes</testOutputDirectory>
<resources>
<resource>
<directory>${project.basedir}/src/main/resources</directory>
</resource>
</resources>
<!-- Maven compiler plugins -->
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>