<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.hbase</groupId>
<artifactId>HbaseData</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.encoding>UTF-8</maven.compiler.encoding>
<jdk.version>1.8</jdk.version>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.11.8</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<spark.version>2.3.2</spark.version>
<commons-io.version>2.6</commons-io.version>
<commons-codec.version>1.13</commons-codec.version>
<hanlp.version>portable-1.6.8</hanlp.version>
<junit.version>4.12</junit.version>
<hbase.version>1.3.1</hbase.version>
<hadoop.version>3.1.1</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons-io.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.powermock</groupId>
<artifactId>powermock-module-junit4</artifactId>
<version>2.0.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.powermock</groupId>
<artifactId>powermock-api-mockito2</artifactId>
<version>2.0.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>2.28.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-protocol</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-app</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>${commons-codec.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>${jdk.version}</source>
<target>${jdk.version}</target>
</configuration>
</plugin>
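<!-- Build a fat jar (jar-with-dependencies) at the package phase so the Spark jobs can be submitted as a single artifact. -->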
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.1.1</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
package com.hbase;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
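/**
* Helpers for decoding HBase Result values into strings, plus small Spark routines
* for batched Get and Put operations against a table.
*/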
public class HbaseValuesUtils {
private static byte[] getValueByte(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
return value;
}
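// URL-encode the raw bytes (e.g. a serialized protobuf) via an ISO-8859-1 round trip
// so binary values can be carried safely in a tab-separated text line.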
private static String getValueProto(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
String res = "";
if (value == null) {
return res;
}
try {
res = URLEncoder.encode(new String(value, StandardCharsets.ISO_8859_1), "UTF-8");
} catch (UnsupportedEncodingException e) {
// UTF-8 is always supported, so this branch is unreachable.
}
return res;
}
private static String getTimestamp(Result result, String family, String qualifier) {
Cell cell = result.getColumnLatestCell(Bytes.toBytes(family), Bytes.toBytes(qualifier));
return cell == null ? "" : Long.toString(cell.getTimestamp());
}
private static String getValueStr(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
String valueStr = value == null ? "" : Bytes.toString(value);
valueStr = valueStr.replaceAll("\t", " ");
return valueStr;
}
private static String getValueStrEncoded(Result result, String family, String qualifier) {
String valueStr = getValueStr(result, family, qualifier);
String res = "";
try {
res = URLEncoder.encode(valueStr, "UTF-8");
} catch (UnsupportedEncodingException e) {
// UTF-8 is always supported, so this branch is unreachable.
}
return res;
}
private static String getValueBoolean(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
return value == null ? "" : String.valueOf(Bytes.toBoolean(value));
}
private static String getValueShort(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
return value == null ? "" : String.valueOf(Bytes.toShort(value));
}
private static String getValueInt(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
return value == null ? "" : String.valueOf(Bytes.toInt(value));
}
private static String getValueLong(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
return value == null ? "" : String.valueOf(Bytes.toLong(value));
}
private static String getValueFloat(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
return value == null ? "" : String.valueOf(Bytes.toFloat(value));
}
private static String getValueDouble(Result result, String family, String qualifier) {
byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
return value == null ? "" : String.valueOf(Bytes.toDouble(value));
}
private static String getValueIntOrLong(Result result, String family, String qualifier) {
byte[] numLongOrInt = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
String temp;
if (numLongOrInt == null) {
return "";
}
try {
// Bytes.toBytes serializes an int as 4 bytes and a long as 8 bytes.
if (numLongOrInt.length == 4) {
temp = String.valueOf(Bytes.toInt(numLongOrInt));
} else if (numLongOrInt.length == 8) {
temp = String.valueOf(Bytes.toLong(numLongOrInt));
} else {
temp = "";
}
} catch (Exception e) {
temp = "";
}
return temp;
}
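// Decode a single cell according to a case-insensitive data type name; unknown types return "".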
public static String getValue(Result result, String family, String qualifier, String dataType) {
String value = "";
switch (dataType.toLowerCase()) {
case "string":
value = getValueStr(result, family, qualifier);
break;
case "boolean":
value = getValueBoolean(result, family, qualifier);
break;
case "short":
value = getValueShort(result, family, qualifier);
break;
case "int":
value = getValueInt(result, family, qualifier);
break;
case "long":
value = getValueLong(result, family, qualifier);
break;
case "float":
value = getValueFloat(result, family, qualifier);
break;
case "double":
value = getValueDouble(result, family, qualifier);
break;
case "intorlong":
value = getValueIntOrLong(result, family, qualifier);
break;
case "timestamp":
value = getTimestamp(result, family, qualifier);
break;
case "stringencoded":
value = getValueStrEncoded(result, family, qualifier);
break;
case "protobuf":
value = getValueProto(result, family, qualifier);
break;
default:
// Unknown data type: leave value as the empty string.
break;
}
return value;
}
// For each input line, use its MD5 hex digest as the rowkey, fetch the "c:url" column in
// batches of 100 Gets, and save "rowkey \t url" lines to outputPath.
private void getList(String urlPath, String tablename, String outputPath) {
SparkConf sparkConf = new SparkConf().setAppName("get-list");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
JavaRDD<String> rowkeyRdd = jsc.textFile(urlPath)
.map(x -> DigestUtils.md5Hex(x)).sortBy(x -> x, true, 1000)
.persist(StorageLevel.MEMORY_AND_DISK_SER());
rowkeyRdd.mapPartitions(eachPartition -> {
List<String> pageList = new ArrayList<>(1000);
Configuration hbaseConfig = HBaseConfiguration.create();
Connection conn = ConnectionFactory.createConnection(hbaseConfig);
Table table = conn.getTable(TableName.valueOf(tablename));
List<Get> gets = new ArrayList<>(100);
try {
while (eachPartition.hasNext()) {
String rowkey = eachPartition.next();
Get get = new Get(Bytes.toBytes(rowkey));
get.addColumn(Bytes.toBytes("c"), Bytes.toBytes("url"));
gets.add(get);
// Issue the Gets in batches of 100 to limit round trips to the region servers.
if (gets.size() == 100 || !eachPartition.hasNext()) {
Result[] results = table.get(gets);
gets.clear();
for (Result result : results) {
if (result == null || result.isEmpty()) {
continue;
}
byte[] page = result.getValue(Bytes.toBytes("c"), Bytes.toBytes("url"));
if (page == null || page.length == 0) {
continue;
}
pageList.add(Bytes.toString(result.getRow()) + "\t" + Bytes.toString(page));
}
}
}
} catch (Exception e) {
System.out.println("Exception:" + e.getMessage());
} finally {
try {
if (table != null) {
table.close();
}
conn.close();
} catch (IOException e) {
System.out.println("Exception:" + e.getMessage());
}
}
return pageList.iterator();
}).saveAsTextFile(outputPath);
}
// Bulk-write "rowkey \t data" lines from putText into the "c:data" column of the given table.
public void putList(String putText, String tablename) {
SparkConf sparkConf = new SparkConf().setAppName("put-list");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
jsc.textFile(putText).foreachPartition(eachPartition -> {
Configuration hbaseConfig = HBaseConfiguration.create();
Connection conn = ConnectionFactory.createConnection(hbaseConfig);
Table table = conn.getTable(TableName.valueOf(tablename));
List<Put> putlist = new ArrayList<>();
try {
while (eachPartition.hasNext()) {
String inputStr = eachPartition.next();
String[] fields = inputStr.split("\t");
if (fields.length < 2) {
continue;
}
Put put = new Put(Bytes.toBytes(fields[0]));
put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("data"), Bytes.toBytes(fields[1]));
putlist.add(put);
// Flush every 10 puts to bound the client-side buffer.
if (putlist.size() == 10) {
table.put(putlist);
putlist.clear();
}
}
// Flush any remaining puts.
if (!putlist.isEmpty()) {
table.put(putlist);
putlist.clear();
}
} finally {
try {
if (table != null) {
table.close();
}
conn.close();
} catch (IOException e) {
System.out.println(e.getMessage());
}
}
});
}
}
package com.hbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
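/**
* Spark job that scans an HBase table between a start and stop row and writes the selected
* columns as tab-separated lines, e.g.:
* spark-submit --class com.hbase.HbaseScan HbaseData-1.0-SNAPSHOT-jar-with-dependencies.jar \
* <tableName> <outputPath> <startRow> <stopRow>
*/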
public class HbaseScan {
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("Hbase-scan");
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.driver.maxResultSize", "6g");
JavaSparkContext jsc = new JavaSparkContext(conf);
jsc.setLogLevel("ERROR");
// Scan arguments: table name, output path, start row, stop row.
String tableName = args[0];
String output = args[1];
String startRow = args[2];
String stopRow = args[3];
JavaRDD<String> inRDD = getHbaseScanData(jsc, tableName, startRow, stopRow);
inRDD.persist(StorageLevel.MEMORY_AND_DISK());
inRDD.repartition(3000).saveAsTextFile(output);
jsc.stop();
}
// Entry point for scanning HBase.
public static JavaRDD<String> getHbaseScanData(JavaSparkContext jsc, String tableName, String startRow, String stopRow) throws IOException {
// Long maxResult = Long.valueOf(maxResultStr);
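// Columns to export, as {family, qualifier, data type} triples.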
final String[][] hbaseTableColumns = {
{"c", "a", "String"},
{"c", "b", "String"},
{"c", "c", "String"},
{"c", "d", "String"}
};
final Configuration hbConf = HBaseConfiguration.create(jsc.hadoopConfiguration());
hbConf.addResource(new Path("core-site.xml"));
hbConf.addResource(new Path("hdfs-site.xml"));
hbConf.addResource(new Path("hbase-site.xml"));
hbConf.set("hbase.regionserver.lease.period", "3600000");
hbConf.set("hbase.client.scanner.timeout.period", "3600000");
hbConf.set("hbase.rpc.timeout", "3600000");
hbConf.set("hbase.client.retries.number", "360");
hbConf.set(TableInputFormat.INPUT_TABLE, tableName);
hbConf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scanFilter(startRow, stopRow, hbaseTableColumns)));
JavaPairRDD<ImmutableBytesWritable, Result> rdd = jsc.newAPIHadoopRDD(hbConf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
rdd.persist(StorageLevel.MEMORY_AND_DISK());
JavaRDD<String> data = rdd.repartition(3000).map((Function<Tuple2<ImmutableBytesWritable, Result>, String>) t -> {
String rowkey = Bytes.toString(t._2.getRow());
Result result = t._2;
List<String> line = new ArrayList<>(1000);
String[] rowkeyArr = rowkey.split("_");
if (rowkey.length() > 32 && rowkeyArr.length == 2) {
String rowkeyWdb = rowkeyArr[1];
line.add(rowkeyWdb);
}
line.add(rowkey);
for (String[] col : hbaseTableColumns) {
if (col.length > 2) {
String family = col[0];
String qualifier = col[1];
String dataType = col[2];
line.add(HbaseValuesUtils.getValue(result, family, qualifier, dataType));
}
}
return String.join("\t", line);
});
return data;
}
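// Build a Scan limited to the requested row range and columns, keeping only rows whose
// "c:url" is non-null and whose "c:time" contains the substring "3".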
private static Scan scanFilter(String startRow, String stopRow, String[][] hbaseTableColumns) {
Scan scan = new Scan();
scan.setCaching(80);
scan.setCacheBlocks(false);
scan.setStartRow(Bytes.toBytes(startRow));
scan.setStopRow(Bytes.toBytes(stopRow));
for (String[] col : hbaseTableColumns) {
if (col.length > 2) {
String family = col[0];
String qualifier = col[1];
scan.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
}
}
// Every filter in the list must pass (logical AND).
FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
SingleColumnValueFilter nullFilter = new SingleColumnValueFilter(
Bytes.toBytes("c"), Bytes.toBytes("url"), CompareFilter.CompareOp.NOT_EQUAL, new NullComparator());
nullFilter.setFilterIfMissing(true);
filterList.addFilter(nullFilter);
// SingleColumnValueFilter langFilter = new SingleColumnValueFilter(
// Bytes.toBytes("c"), Bytes.toBytes("loc"), CompareFilter.CompareOp.EQUAL, new SubstringComparator(location));
// langFilter.setFilterIfMissing(true);
// filterList.addFilter(langFilter);
SingleColumnValueFilter layerFilter = new SingleColumnValueFilter(
Bytes.toBytes("c"), Bytes.toBytes("time"), CompareFilter.CompareOp.EQUAL, new SubstringComparator("3"));
layerFilter.setFilterIfMissing(true);
filterList.addFilter(layerFilter);
// filterList.addFilter(new RandomRowFilter(0.1f));
// filterList.addFilter(new PageFilter(maxResult));
scan.setFilter(filterList);
return scan;
}
}
package com.hbase;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
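/**
* Spark job that fetches individual rows with Gets: each input line's first field (a URL)
* is hashed to form the rowkey, and the requested columns are written as tab-separated lines.
*/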
public class HbaseGet {
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("Hbase-get");
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.driver.maxResultSize", "6g");
JavaSparkContext jsc = new JavaSparkContext(conf);
jsc.setLogLevel("ERROR");
// Get arguments: table name, input path, output path.
final String tableName = args[0];
final String input = args[1];
final String output = args[2];
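// Columns to fetch, as {family, qualifier, data type} triples.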
final String[][] hbaseTableColumns = {
{"c", "url", "String"},
{"c", "location", "String"}
};
jsc.textFile(input).repartition(3000).mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
@Override
public Iterator<String> call(Iterator<String> stringIterator) throws Exception {
return getHbase(stringIterator, tableName, hbaseTableColumns).iterator();
}
}).coalesce(1).saveAsTextFile(output);
jsc.stop();
}
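// Look up each URL's row with a Get and format the requested columns as one tab-separated line.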
public static List<String> getHbase(Iterator<String> urls, String tableName, String[][] hbaseTableColumns) throws IOException {
List<String> res = new ArrayList<>(1000);
Configuration hbConf = HBaseConfiguration.create();
hbConf.addResource(new Path("core-site.xml"));
hbConf.addResource(new Path("hdfs-site.xml"));
hbConf.addResource(new Path("hbase-site.xml"));
Connection connection = ConnectionFactory.createConnection(hbConf);
Table ht = connection.getTable(TableName.valueOf(tableName));
try {
while (urls.hasNext()) {
List<String> line = new ArrayList<>();
String next = urls.next();
String url = next.split("\t", -1)[0];
String rowKey = "";
rowKey = DigestUtils.md2Hex(url);
Get get;
if (rowKey.length() > 0) {
get = new Get(rowKey.getBytes());
} else {
continue;
}
for (String[] col : hbaseTableColumns) {
if (col.length > 2) {
String family = col[0];
String qualifier = col[1];
get.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
}
}
Result result = ht.get(get);
if (result.isEmpty()) {
continue;
}
line.add(url);
for (String[] col : hbaseTableColumns) {
if (col.length > 2) {
String family = col[0];
String qualifier = col[1];
String dataType = col[2];
line.add(HbaseValuesUtils.getValue(result, family, qualifier, dataType));
}
}
res.add(String.join("\t", line));
}
} finally {
ht.close();
connection.close();
}
return res;
}
}