Goal: read from one HBase table, modify the data, and write it into a new table, 2_lib for short (Ctrl+F it; in the code the table is 2_library_token).
Each pass reads 10,001 rowkeys and spins up 500 threads, each thread inserting 20 rowkeys (500 × 20 = 10,000; the extra rowkey becomes the next start row). Each rowkey in turn holds some number of versioned cells (a single row can carry 200,000 cells).
I used 4 machines. HBase keeps rowkeys sorted, and mine are MD5 hashes, so the keyspace splits evenly into 16 large batches by the first hex character, 4 batches per machine. That is 2,000 threads in total, and the insert rate reached about 3 million per second: 100 billion records, done in 4 days on the 4 machines.
The last rowkey of each pass then serves as the start row of the next one.
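To make the 16-batch split concrete: MD5 rowkeys are uniformly distributed over the hex characters 0-f, so splitting on the first character yields 16 roughly equal ranges, and each machine takes 4 consecutive ones. Below is a minimal sketch of that assignment (my illustration; the machine index as a CLI argument is an assumption, and in the posted code the range is simply hard-coded, as the rowStart = "0" / stop-at-prefix-"c" checks in the main class show):

// Sketch only: derive a machine's scan range from its index (hypothetical args[0]).
public class PrefixSplit {
    private static final char[] HEX = "0123456789abcdef".toCharArray();

    public static void main(String[] args) {
        int machine = Integer.parseInt(args[0]); // 0..3, hypothetical CLI argument
        char startPrefix = HEX[machine * 4];
        // the stop prefix is the first prefix of the next machine;
        // the last machine simply scans to the end of the table
        String stop = machine < 3 ? String.valueOf(HEX[(machine + 1) * 4]) : "(end of table)";
        System.out.println("machine " + machine + " scans rowkeys in [" + startPrefix + ", " + stop + ")");
    }
}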
The pom below pulls in everything I happened to have on hand, needed or not; trim it as you see fit. pom:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>insertApi</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <name>insertHbaseKu-v1.0</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>2.6.5</hadoop.version>
        <hbase.version>1.7.0</hbase.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- org.apache.hadoop.hbase.mapreduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>1.6.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.24</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongo-java-driver</artifactId>
            <version>3.4.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <dependency>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.4</version>
            <classifier>jdk15</classifier>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.10</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-dbcp2</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-pool2</artifactId>
            <version>2.4.1</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>org.tukaani</groupId>
            <artifactId>xz</artifactId>
            <version>1.5</version>
        </dependency>
        <dependency>
            <groupId>net.sf.sevenzipjbinding</groupId>
            <artifactId>sevenzipjbinding</artifactId>
            <version>9.20-2.00beta</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.maven</groupId>
            <artifactId>maven-artifact</artifactId>
            <version>3.6.3</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.ant</groupId>
            <artifactId>ant</artifactId>
            <version>1.10.5</version>
        </dependency>
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.1.1</version>
        </dependency>
        <dependency>
            <groupId>com.github.zafarkhaja</groupId>
            <artifactId>java-semver</artifactId>
            <version>0.9.0</version>
        </dependency>
        <!-- component-parsing dependencies -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.30</version>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <version>5.7.0-M1</version>
        </dependency>
        <dependency>
            <groupId>com.moandjiezana.toml</groupId>
            <artifactId>toml4j</artifactId>
            <version>0.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.yaml</groupId>
            <artifactId>snakeyaml</artifactId>
            <version>1.26</version>
        </dependency>
    </dependencies>
</project>
Directory structure: three classes, Worker, LinuxKernelKu (the main class), and DeleteRow.
Worker:
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;

public class Worker implements Runnable {
    private CountDownLatch downLatch;
    private Integer name;
    private Connection hbaseConnection;
    private ArrayList<String> rows; // the rows this worker is responsible for

    public Worker(CountDownLatch downLatch, Integer name, Connection hbaseConnection, ArrayList<String> rows) {
        this.downLatch = downLatch;
        this.name = name;
        this.hbaseConnection = hbaseConnection;
        this.rows = rows;
    }

    @Override
    public void run() {
        threadInsert(hbaseConnection, rows);
        // System.out.println(this.name + " is done!");
        this.downLatch.countDown();
        long remaining = downLatch.getCount();
        if (remaining % 100 == 0 && remaining > 0) { // progress log at 400/300/200/100
            System.out.println(remaining + " workers still running");
        }
    }

    public static void threadInsert(Connection hbaseConnection, ArrayList<String> rowKeys) {
        try {
            Table tableR = hbaseConnection.getTable(TableName.valueOf("0_library_token"));
            Table tableW = hbaseConnection.getTable(TableName.valueOf("2_library_token"));
            for (String row : rowKeys) {
                // fetch every version of column F:F for this row
                Get get = new Get(Bytes.toBytes(row)).setMaxVersions(1111111)
                        .addColumn(Bytes.toBytes("F"), Bytes.toBytes("F"));
                Cell[] cells = tableR.get(get).rawCells();
                for (Cell cell : cells) {
                    String jsonstr = Bytes.toString(CellUtil.cloneValue(cell));
                    JSONObject jsonObject = JSONObject.parseObject(jsonstr);
                    String[] hbasePathStrs = jsonObject.getString("path").split("/");
                    // keep only the project and version segments of the path
                    String projectVersion = hbasePathStrs[1] + "/" + hbasePathStrs[3];
                    Put put = new Put(row.getBytes()); // same rowkey in the new table
                    put.addColumn("F".getBytes(), "F".getBytes(), projectVersion.getBytes());
                    tableW.put(put);
                }
            }
            tableR.close();
            tableW.close();
        } catch (IOException e) {
            // errors here are recovered by re-running the batch after DeleteRow (see below)
            System.out.println("Worker insert failed: " + e.getMessage());
        }
    }
}
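One detail worth noting: threadInsert issues a separate put() RPC per cell. A variant that buffers the Puts and flushes them with Table.put(List<Put>) (part of the HBase 1.x client API) would cut round trips sharply. This is my sketch of that optimization, not the original code:

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import java.io.IOException;
import java.util.List;

// Sketch: inside threadInsert, collect the Puts built for each row into a
// List and write them in one call instead of one RPC per cell.
public class BatchedPuts {
    static void flush(Table tableW, List<Put> puts) throws IOException {
        if (!puts.isEmpty()) {
            tableW.put(puts); // the client groups these into multi-put RPCs per region server
            puts.clear();
        }
    }
}

In threadInsert this amounts to puts.add(put) in the inner loop and flush(tableW, puts) after it.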
LinuxKernelKu (the main class):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class LinuxKernelKu {
    static Connection hbaseConnection;
    static String rowStart = "0"; // scan start row; advances after every batch

    public static void main(String[] args) {
        // initialize HBase and create a connection pool with 600 threads
        Configuration HBASE_CONF = HBaseConfiguration.create();
        HBASE_CONF.set("hbase.zookeeper.property.clientPort", "2181");
        HBASE_CONF.set("hbase.zookeeper.quorum", "192.168.xx.xx");
        HBASE_CONF.set("hbase.master", "192.168.xx.xx:60000");
        HBASE_CONF.set("zookeeper.znode.parent", "/hbase");
        HBASE_CONF.setInt("hbase.hconnection.threads.max", 600);
        HBASE_CONF.setInt("hbase.hconnection.threads.core", 600);
        HBASE_CONF.setLong("hbase.hconnection.threads.keepalivetime", 1000);
        try {
            hbaseConnection = ConnectionFactory.createConnection(HBASE_CONF); // initialize the connection pool
            // each pass copies one batch of 10,000 rows and advances rowStart
            while (true) {
                boolean ok = cpNewLibraryToken();
                if (!ok) {
                    System.out.println("Exception: please delete the 10,000 rows after this rowStart, row = " + rowStart);
                    break;
                }
                if (rowStart.substring(0, 1).equals("c")) { // this machine's range ends at prefix "c"
                    System.out.println("All done; final rowStart = " + rowStart);
                    break;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Multithreaded method: fills the new HBase table (the small-file library).
    // Uses rowStart and the HBase connection; scans 10,001 rowkeys, splits them
    // into 500 groups of 20, and hands each group to a Worker.
    // TODO a later version of this program lives in the insertApi folder on my E: drive
    public static boolean cpNewLibraryToken() {
        Instant inst1 = Instant.now();
        ExecutorService executor = Executors.newCachedThreadPool();
        System.out.println("Connecting to HBase");
        System.out.println("Reading the 10,000 rowkeys after " + rowStart);
        Scan scan = new Scan().withStartRow(rowStart.getBytes())
                .setMaxVersions(99999999)
                .addColumn(Bytes.toBytes("F"), Bytes.toBytes("F"))
                .setLimit(10001);
        Table tableRead = null;
        try {
            tableRead = hbaseConnection.getTable(TableName.valueOf("0_library_token"));
            ResultScanner scanner = tableRead.getScanner(scan);
            ArrayList<String> rows = new ArrayList<>();
            // collect the rowkeys
            for (Result rs : scanner) {
                rows.add(Bytes.toString(rs.getRow()));
            }
            // the last rowkey is the startRow of the next batch, so drop it here and do not process it
            String endRow = rows.get(rows.size() - 1);
            rows.remove(rows.size() - 1);
            System.out.println("rows now holds " + rows.size() + " rowkeys");
            System.out.println("Last row of this batch: " + endRow);
            // 10,000 rowkeys, 500 threads, 20 rowkeys per thread:
            // split the rowkeys into groups of 20; the map key is the thread name (1, 2, 3, ...)
            HashMap<Integer, ArrayList<String>> threadMap = new HashMap<>();
            int mapKey = 1; // thread key
            int i = 1;      // rowkey counter
            ArrayList<String> rows1 = new ArrayList<>(); // the group currently being filled
            for (String row : rows) {
                rows1.add(row);
                if (i % 20 == 0) { // groups of 20
                    threadMap.put(mapKey, rows1);
                    rows1 = new ArrayList<>();
                    mapKey++;
                }
                i++;
            }
            // size the latch to the actual group count; the final batch may yield fewer than 500 groups
            CountDownLatch dLatch = new CountDownLatch(threadMap.size());
            // dispatch the groups to worker threads
            for (int workerName : threadMap.keySet()) {
                executor.execute(new Worker(dLatch, workerName, hbaseConnection, threadMap.get(workerName)));
            }
            dLatch.await(); // wait until every worker has finished
            executor.shutdown();
            scanner.close();
            tableRead.close();
            rowStart = endRow;
            Instant inst2 = Instant.now();
            System.out.println("****** batch took " + Duration.between(inst1, inst2).getSeconds() + " s");
            return true;
        } catch (Exception e) {
            // e.printStackTrace();
            System.out.println("rowKey at the time of the exception: " + rowStart);
            return false;
        }
    }
}
If an error occurs mid-run, scan from the printed rowKey and delete the corresponding 10,000 rows.
The delete method, DeleteRow:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

public class DeleteRow {
    static Connection hbaseConnection;
    // the rowkey to scan from (take it from the insert program's printed output above);
    // if you need to delete a range in the middle, add a rowEnd as well (not done here)
    static String rowStart = "269eba6e48e4c05afd370b59dbd94ddc";

    public static void main(String[] args) {
        Configuration HBASE_CONF = HBaseConfiguration.create();
        HBASE_CONF.set("hbase.zookeeper.property.clientPort", "2181");
        HBASE_CONF.set("hbase.zookeeper.quorum", "192.168.31.71");
        HBASE_CONF.set("hbase.master", "192.168.31.71:60000");
        HBASE_CONF.set("zookeeper.znode.parent", "/hbase");
        HBASE_CONF.setInt("hbase.hconnection.threads.max", 600);
        HBASE_CONF.setInt("hbase.hconnection.threads.core", 600);
        HBASE_CONF.setLong("hbase.hconnection.threads.keepalivetime", 1000);
        try {
            hbaseConnection = ConnectionFactory.createConnection(HBASE_CONF); // initialize the connection pool
            System.out.println("Connecting to HBase");
            System.out.println("Reading the 10,000 rowkeys after " + rowStart);
            // scan from rowStart and delete every row found
            Scan scan = new Scan().withStartRow(rowStart.getBytes())
                    .setMaxVersions(99999999)
                    .addColumn(Bytes.toBytes("F"), Bytes.toBytes("F"))
                    .setLimit(10001);
            Table tableRead = hbaseConnection.getTable(TableName.valueOf("2_library_token"));
            ResultScanner scanner = tableRead.getScanner(scan);
            HashSet<String> rowKeySet = new HashSet<>();
            for (Result rs : scanner) {
                String row = Bytes.toString(rs.getRow());
                if (!row.isEmpty()) {
                    rowKeySet.add(row);
                }
            }
            System.out.println(rowKeySet.size());
            List<Delete> deletes = new ArrayList<>();
            for (String rowkey : rowKeySet) {
                deletes.add(new Delete(Bytes.toBytes(rowkey)));
            }
            tableRead.delete(deletes); // batch delete
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Update, 2021/12/27
Addendum: with multiple machines, Redis now tracks the state of each machine. If any thread on any machine hits an error, one designated machine (the master, whose code differs slightly) performs the delete while the other machines poll Redis on a timer; once the delete is finished, the master flips the Redis flag and the run resumes. That is the newly updated coordination logic.
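A minimal sketch of that coordination loop, assuming the Jedis client and hypothetical key names (machine:<id>:status, cleanup:done); the actual updated code is not shown in this post:

import redis.clients.jedis.Jedis;

// Sketch only: a worker machine flags its error in Redis, then polls until the
// master machine finishes the delete and flips the flag. Key names and the
// machine id argument are my assumptions, not the original code.
public class RedisCoordinator {
    public static void main(String[] args) throws InterruptedException {
        String machineId = args[0]; // hypothetical CLI argument
        try (Jedis jedis = new Jedis("192.168.xx.xx", 6379)) {
            // on any thread error, record this machine's state for the master to see
            jedis.set("machine:" + machineId + ":status", "error");
            // non-master machines poll on a timer until the master has deleted
            // the bad batch and set the flag
            while (!"done".equals(jedis.get("cleanup:done"))) {
                Thread.sleep(5000); // the periodic polling described above
            }
            // mark this machine as running again and re-run from the recorded rowStart
            jedis.set("machine:" + machineId + ":status", "running");
        }
    }
}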