Overview

This post walks through reading an HBase table from Spark in Java. The overall flow: build an HBase Scan that selects the wanted column family and columns, serialize it into the job configuration, load the table as an RDD of (row key, Result) pairs via TableInputFormat, map each Result into a plain Java object, and finally convert that RDD into a Dataset<Row> so the data can be queried with Spark SQL. The listing targets the HBase 1.x-style client API (where TableInputFormat and ProtobufUtil sit in the unshaded packages imported below); note that in HBase 2.x the MapReduce integration moved to the hbase-mapreduce artifact. The full, annotated listing follows.
package cn.itcast.hbase.zx;
import java.util.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
//import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog;
import scala.Tuple2;
public class zzbhbase {
public static void main(String[] args) throws Exception {
// Create the Spark configuration
SparkConf sparkConf = new SparkConf().setAppName("Java Spark HBase Reader").setMaster("local[*]");
// Create the SparkContext and SparkSession
JavaSparkContext sc = new JavaSparkContext(sparkConf);
SparkSession spark = SparkSession.builder().getOrCreate();
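// getOrCreate() reuses the SparkContext started above rather than creating a second one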
String tableName = "users";
String FAMILY = "personal";
String COLUM_ID = "id";
String COLUM_NAME = "name";
String COLUM_PHONE = "phone";
// Create the HBase configuration
Configuration hbaseConf = HBaseConfiguration.create();
hbaseConf.set("hbase.zookeeper.quorum", "localhost");
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181");
hbaseConf.set("hbase.master", "localhost:60000");
hbaseConf.set("hbase.rootdir", "hdfs://localhost:9000/hbase");
hbaseConf.set(TableInputFormat.INPUT_TABLE, tableName);
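// For a read-only client the ZooKeeper quorum and port are usually all HBase needs;
// hbase.master and hbase.rootdir are resolved through ZooKeeper and can typically be omitted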
Scan scan = new Scan();
scan.setCaching(100);       // rows fetched per RPC; larger values mean fewer round trips
scan.setCacheBlocks(false); // avoid polluting the region server block cache during a full scan
// Filters could be added here, e.g.:
// scan.setFilter(...);
scan.addFamily(Bytes.toBytes(FAMILY));
scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_ID));
scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_NAME));
scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_PHONE));
// Serialize the Scan and attach it to the configuration for TableInputFormat
ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
String scanToString = Base64.getEncoder().encodeToString(proto.toByteArray());
hbaseConf.set(TableInputFormat.SCAN, scanToString);
// The same scan could also be run directly through the client API instead of Spark;
// try-with-resources makes sure connection, table, and scanner are closed:
// try (Connection connection = ConnectionFactory.createConnection(hbaseConf);
//      Table table = connection.getTable(TableName.valueOf(tableName));
//      ResultScanner scanner = table.getScanner(scan)) {
//     for (Result r : scanner) { /* process each Result */ }
// }
// Read the HBase table into an RDD of (row key, Result) pairs
JavaPairRDD<ImmutableBytesWritable, Result> hbaseRDD =
        sc.newAPIHadoopRDD(hbaseConf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
hbaseRDD.cache(); // cache it, since the RDD is consumed more than once below
long count = hbaseRDD.count();
System.out.println("Total number of rows: " + count);
// Convert each Result into a plain Java object
JavaRDD<YourHBaseRowClass> convertedRDD = hbaseRDD.map(tuple2 -> {
    // Parse the Result according to the table schema and the columns you need
    Result result = tuple2._2();
    String rowKey = Bytes.toString(result.getRow());
    String columnValue = Bytes.toString(result.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_NAME)));
    // ...parse any other columns here
    return new YourHBaseRowClass(rowKey, columnValue); // YourHBaseRowClass is defined below
});
// The same conversion with an explicit Function, producing tab-separated text lines
JavaRDD<String> maprdd = hbaseRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, String>() {
    @Override
    public String call(Tuple2<ImmutableBytesWritable, Result> tuple2) throws Exception {
        Result result = tuple2._2();
        String rowKey = Bytes.toString(result.getRow());
        String id = Bytes.toString(result.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_ID)));
        String name = Bytes.toString(result.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_NAME)));
        return rowKey + "\t" + id + "\t" + name;
    }
});
maprdd.saveAsTextFile("hdfs://********:8020/tmp/test");
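// Note: like any Hadoop output, saveAsTextFile fails if the target directory already exists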
hbaseRDD.unpersist();
// Convert the POJO RDD into a Dataset<Row> to use Spark SQL features (if needed)
Dataset<Row> dataFrame = spark.createDataFrame(convertedRDD, YourHBaseRowClass.class);
// Show the DataFrame contents
dataFrame.show();
// Alternatively, a DataFrame-capable connector can read HBase directly.
// The exact format string and options depend on the connector and its version:
// Dataset<Row> hbaseDF = spark.read()
//         .format("org.apache.spark.sql.execution.datasources.hbase") // connector format
//         .option("hbase.table", "your_table_name")                   // HBase table name
//         .option("hbase.zookeeper.quorum", "your_zookeeper_quorum")  // optional if set in hbaseConf
//         .option("hbase.zookeeper.property.clientPort", "2181")      // optional if set in hbaseConf
//         .load();
// hbaseDF.show();
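// If the SHC connector behind the commented HBaseTableCatalog import above is on the
// classpath, the mapping can instead be declared as a JSON catalog. This is only a
// sketch following SHC's documented catalog format; the column mapping below reuses
// this post's "users" table and "personal" family:
// String catalog = "{"
//         + "\"table\":{\"namespace\":\"default\", \"name\":\"users\"},"
//         + "\"rowkey\":\"key\","
//         + "\"columns\":{"
//         + "\"rowKey\":{\"cf\":\"rowkey\", \"col\":\"key\", \"type\":\"string\"},"
//         + "\"id\":{\"cf\":\"personal\", \"col\":\"id\", \"type\":\"string\"},"
//         + "\"name\":{\"cf\":\"personal\", \"col\":\"name\", \"type\":\"string\"},"
//         + "\"phone\":{\"cf\":\"personal\", \"col\":\"phone\", \"type\":\"string\"}"
//         + "}}";
// Dataset<Row> shcDF = spark.read()
//         .option(HBaseTableCatalog.tableCatalog(), catalog)
//         .format("org.apache.spark.sql.execution.datasources.hbase")
//         .load();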
// Stop the SparkContext and SparkSession
sc.close();
spark.stop();
}

// A second, self-contained variant (not invoked by main()): scan only a row-key range,
// here rows "0r" to "9r" of a table named "KITTI". Kept in its own method so its
// configuration and variables do not clash with main().
private static void readRowRange() throws Exception {
String sparkMaster = "local";
String zkQuorum = "master";
String zkClientPort = "2181";
String tableName = "KITTI";
byte[] startRowKey = Bytes.toBytes("0r");
byte[] endRowKey = Bytes.toBytes("9r");
SparkSession sparkSession = SparkSession.builder().master(sparkMaster).getOrCreate();
Configuration hconf = HBaseConfiguration.create();
hconf.set("hbase.zookeeper.property.clientPort", zkClientPort);
hconf.set("hbase.zookeeper.quorum", zkQuorum);
// Name of the HBase table to read
hconf.set(TableInputFormat.INPUT_TABLE, tableName);
// Restrict the scan to the row-key range [startRowKey, endRowKey]
Scan scan = new Scan();
scan.withStartRow(startRowKey, true);
scan.withStopRow(endRowKey, true);
hconf.set(TableInputFormat.SCAN,
        Base64.getEncoder().encodeToString(ProtobufUtil.toScan(scan).toByteArray()));
JavaRDD<Tuple2<ImmutableBytesWritable, Result>> hbaseRDD = sparkSession.sparkContext()
        .newAPIHadoopRDD(hconf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class)
        .toJavaRDD();
System.out.println("Rows in range: " + hbaseRDD.count());
sparkSession.stop();
}
// A plain Java bean representing one row read from HBase
public static class YourHBaseRowClass {
private String rowKey;
private String columnValue;
public YourHBaseRowClass(String rowKey, String columnValue) {
this.rowKey = rowKey;
this.columnValue = columnValue;
}
// Getters and setters
public String getRowKey() {
return rowKey;
}
public void setRowKey(String rowKey) {
this.rowKey = rowKey;
}
public String getColumnValue() {
return columnValue;
}
public void setColumnValue(String columnValue) {
this.columnValue = columnValue;
}
// Override toString for readable output
@Override
public String toString() {
return "YourHBaseRowClass{" +
"rowKey='" + rowKey + '\'' +
", columnValue='" + columnValue + '\'' +
'}';
}
}
}
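To run SQL over the converted rows, the Dataset built above can be registered as a temporary view. A minimal sketch, assuming the dataFrame and spark variables from main() (the view name users_view is my own choice); it would slot in right after dataFrame.show():

// Register the Dataset as a temp view and query it with Spark SQL.
// Column names rowKey/columnValue come from YourHBaseRowClass's getters.
dataFrame.createOrReplaceTempView("users_view"); // "users_view" is a hypothetical name
Dataset<Row> nonNull = spark.sql(
        "SELECT rowKey, columnValue FROM users_view WHERE columnValue IS NOT NULL");
nonNull.show();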