Overview

This post walks through reading an HBase table from Spark in Java. The overall flow: build an HBase Scan that selects the wanted column family and columns, serialize it into the job configuration, load the table as an RDD of (row key, Result) pairs via TableInputFormat, map each Result into a plain Java object, and finally convert that RDD into a Dataset<Row> so the data can be queried with Spark SQL. The listing targets the HBase 1.x-style client API (where TableInputFormat and ProtobufUtil sit in the unshaded packages imported below); note that in HBase 2.x the MapReduce integration moved to the hbase-mapreduce artifact. The full, annotated listing follows.
package cn.itcast.hbase.zx;
import java.util.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
//import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog;
import scala.Tuple2;
public class zzbhbase {
public static void main(String[] args) throws Exception {
// Create the Spark configuration
SparkConf sparkConf = new SparkConf().setAppName("Java Spark HBase Reader").setMaster("local[*]");
// Create the SparkContext and SparkSession
JavaSparkContext sc = new JavaSparkContext(sparkConf);
SparkSession spark = SparkSession.builder().getOrCreate();
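// getOrCreate() reuses the SparkContext started above rather than creating a second one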
String tableName = "users";
String FAMILY = "personal";
String COLUM_ID = "id";
String COLUM_NAME = "name";
String COLUM_PHONE = "phone";
// Create the HBase configuration
Configuration hbaseConf = HBaseConfiguration.create();
hbaseConf.set("hbase.zookeeper.quorum", "localhost");
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181");
hbaseConf.set("hbase.master", "localhost:60000");
hbaseConf.set("hbase.rootdir", "hdfs://localhost:9000/hbase");
hbaseConf.set(TableInputFormat.INPUT_TABLE, tableName);
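// For a read-only client the ZooKeeper quorum and port are usually all HBase needs;
// hbase.master and hbase.rootdir are resolved through ZooKeeper and can typically be omitted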
Scan scan = new Scan();
scan.setCaching(100);       // rows fetched per RPC; larger values mean fewer round trips
scan.setCacheBlocks(false); // avoid polluting the region server block cache during a full scan
// Filters could be added here, e.g.:
// scan.setFilter(...);
scan.addFamily(Bytes.toBytes(FAMILY));
scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_ID));
scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_NAME));
scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_PHONE));
// Serialize the Scan and attach it to the configuration for TableInputFormat
ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
String scanToString = Base64.getEncoder().encodeToString(proto.toByteArray());
hbaseConf.set(TableInputFormat.SCAN, scanToString);
// The same scan could also be run directly through the client API instead of Spark;
// try-with-resources makes sure connection, table, and scanner are closed:
// try (Connection connection = ConnectionFactory.createConnection(hbaseConf);
//      Table table = connection.getTable(TableName.valueOf(tableName));
//      ResultScanner scanner = table.getScanner(scan)) {
//     for (Result r : scanner) { /* process each Result */ }
// }
// Read the HBase table into an RDD of (row key, Result) pairs
JavaPairRDD<ImmutableBytesWritable, Result> hbaseRDD =
        sc.newAPIHadoopRDD(hbaseConf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
hbaseRDD.cache(); // cache it, since the RDD is consumed more than once below
long count = hbaseRDD.count();
System.out.println("Total number of rows: " + count);
// Convert each Result into a plain Java object
JavaRDD<YourHBaseRowClass> convertedRDD = hbaseRDD.map(tuple2 -> {
    // Parse the Result according to the table schema and the columns you need
    Result result = tuple2._2();
    String rowKey = Bytes.toString(result.getRow());
    String columnValue = Bytes.toString(result.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_NAME)));
    // ...parse any other columns here
    return new YourHBaseRowClass(rowKey, columnValue); // YourHBaseRowClass is defined below
});
// The same conversion with an explicit Function, producing tab-separated text lines
JavaRDD<String> maprdd = hbaseRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, String>() {
    @Override
    public String call(Tuple2<ImmutableBytesWritable, Result> tuple2) throws Exception {
        Result result = tuple2._2();
        String rowKey = Bytes.toString(result.getRow());
        String id = Bytes.toString(result.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_ID)));
        String name = Bytes.toString(result.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_NAME)));
        return rowKey + "\t" + id + "\t" + name;
    }
});
maprdd.saveAsTextFile("hdfs://********:8020/tmp/test");
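// Note: like any Hadoop output, saveAsTextFile fails if the target directory already exists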
hbaseRDD.unpersist();
// Convert the POJO RDD into a Dataset<Row> to use Spark SQL features (if needed)
Dataset<Row> dataFrame = spark.createDataFrame(convertedRDD, YourHBaseRowClass.class);
// Show the DataFrame contents
dataFrame.show();
// Alternatively, a DataFrame-capable connector can read HBase directly.
// The exact format string and options depend on the connector and its version:
// Dataset<Row> hbaseDF = spark.read()
//         .format("org.apache.spark.sql.execution.datasources.hbase") // connector format
//         .option("hbase.table", "your_table_name")                   // HBase table name
//         .option("hbase.zookeeper.quorum", "your_zookeeper_quorum")  // optional if set in hbaseConf
//         .option("hbase.zookeeper.property.clientPort", "2181")      // optional if set in hbaseConf
//         .load();
// hbaseDF.show();
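// If the SHC connector behind the commented HBaseTableCatalog import above is on the
// classpath, the mapping can instead be declared as a JSON catalog. This is only a
// sketch following SHC's documented catalog format; the column mapping below reuses
// this post's "users" table and "personal" family:
// String catalog = "{"
//         + "\"table\":{\"namespace\":\"default\", \"name\":\"users\"},"
//         + "\"rowkey\":\"key\","
//         + "\"columns\":{"
//         + "\"rowKey\":{\"cf\":\"rowkey\", \"col\":\"key\", \"type\":\"string\"},"
//         + "\"id\":{\"cf\":\"personal\", \"col\":\"id\", \"type\":\"string\"},"
//         + "\"name\":{\"cf\":\"personal\", \"col\":\"name\", \"type\":\"string\"},"
//         + "\"phone\":{\"cf\":\"personal\", \"col\":\"phone\", \"type\":\"string\"}"
//         + "}}";
// Dataset<Row> shcDF = spark.read()
//         .option(HBaseTableCatalog.tableCatalog(), catalog)
//         .format("org.apache.spark.sql.execution.datasources.hbase")
//         .load();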
// Stop the SparkContext and SparkSession
sc.close();
spark.stop();
}

// A second, self-contained variant (not invoked by main()): scan only a row-key range,
// here rows "0r" to "9r" of a table named "KITTI". Kept in its own method so its
// configuration and variables do not clash with main().
private static void readRowRange() throws Exception {
String sparkMaster = "local";
String zkQuorum = "master";
String zkClientPort = "2181";
String tableName = "KITTI";
byte[] startRowKey = Bytes.toBytes("0r");
byte[] endRowKey = Bytes.toBytes("9r");
SparkSession sparkSession = SparkSession.builder().master(sparkMaster).getOrCreate();
Configuration hconf = HBaseConfiguration.create();
hconf.set("hbase.zookeeper.property.clientPort", zkClientPort);
hconf.set("hbase.zookeeper.quorum", zkQuorum);
// Name of the HBase table to read
hconf.set(TableInputFormat.INPUT_TABLE, tableName);
// Restrict the scan to the row-key range [startRowKey, endRowKey]
Scan scan = new Scan();
scan.withStartRow(startRowKey, true);
scan.withStopRow(endRowKey, true);
hconf.set(TableInputFormat.SCAN,
        Base64.getEncoder().encodeToString(ProtobufUtil.toScan(scan).toByteArray()));
JavaRDD<Tuple2<ImmutableBytesWritable, Result>> hbaseRDD = sparkSession.sparkContext()
        .newAPIHadoopRDD(hconf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class)
        .toJavaRDD();
System.out.println("Rows in range: " + hbaseRDD.count());
sparkSession.stop();
}
// A plain Java bean representing one row read from HBase
public static class YourHBaseRowClass {
private String rowKey;
private String columnValue;
public YourHBaseRowClass(String rowKey, String columnValue) {
this.rowKey = rowKey;
this.columnValue = columnValue;
}
// Getters and setters
public String getRowKey() {
return rowKey;
}
public void setRowKey(String rowKey) {
this.rowKey = rowKey;
}
public String getColumnValue() {
return columnValue;
}
public void setColumnValue(String columnValue) {
this.columnValue = columnValue;
}
// Override toString for readable output
@Override
public String toString() {
return "YourHBaseRowClass{" +
"rowKey='" + rowKey + '\'' +
", columnValue='" + columnValue + '\'' +
'}';
}
}
}
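To run SQL over the converted rows, the Dataset built above can be registered as a temporary view. A minimal sketch, assuming the dataFrame and spark variables from main() (the view name users_view is my own choice); it would slot in right after dataFrame.show():

// Register the Dataset as a temp view and query it with Spark SQL.
// Column names rowKey/columnValue come from YourHBaseRowClass's getters.
dataFrame.createOrReplaceTempView("users_view"); // "users_view" is a hypothetical name
Dataset<Row> nonNull = spark.sql(
        "SELECT rowKey, columnValue FROM users_view WHERE columnValue IS NOT NULL");
nonNull.show();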