Managing the dependencies with Maven (pom.xml)
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.spark.test</groupId>
    <artifactId>spark_test</artifactId>
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>
    <name>spark_test</name>
    <url>http://maven.apache.org</url>
    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.4</version>
        </dependency>
        <!--
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-spark</artifactId>
            <version>1.2.0-cdh5.9.0</version>
        </dependency>
        -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.4</version>
        </dependency>
        <!--
        <dependency>
            <groupId>org.lionsoul</groupId>
            <artifactId>jcseg-core</artifactId>
            <version>2.1.0</version>
        </dependency>
        -->
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongodb-driver</artifactId>
            <version>3.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
        <!--
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.21</version>
        </dependency>
        -->
        <dependency>
            <groupId>commons-dbutils</groupId>
            <artifactId>commons-dbutils</artifactId>
            <version>1.6</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>6.0.5</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
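Note that all the Spark artifacts carry the _2.11 suffix, i.e. they are built against Scala 2.11; every Spark dependency on one classpath must share the same Scala suffix and Spark version, and hadoop-client should match the Hadoop release of your cluster, otherwise you can hit hard-to-diagnose NoSuchMethodError failures at runtime.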
Import the MongoDB data into HBase so that Spark can analyze it
package com.spark.test.export;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.Sorts;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.log4j.Logger;
import org.bson.Document;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
/**
 * Save MongoDB data to HBase.
 */
public class MongoHbase {

    private static Logger log = Logger.getLogger(MongoHbase.class);
    private static String HBASE_COLUMN_NAME = "column";
    private static String HBASE_TABLE = "news";
    private static String MONGO_TABLE = "news";

    private Configuration hbaseConfig;
    private MongoDatabase database;
    private MongoCollection<Document> collection;
    private BufferedMutator hbaseMutatorTable;

    public MongoHbase() throws Exception {
        initHbase();
        initMongodb();
    }

    private void initHbase() throws Exception {
        hbaseConfig = HBaseConfiguration.create();
        hbaseConfig.set("hbase.zookeeper.property.clientPort", "2181");
        hbaseConfig.set("hbase.zookeeper.quorum", "192.168.1.45");
        // Open a connection and create a BufferedMutator for batched writes to the news table
        Connection connection = ConnectionFactory.createConnection(hbaseConfig);
        hbaseMutatorTable = connection.getBufferedMutator(TableName.valueOf(HBASE_TABLE));
    }

    private void initMongodb() {
        MongoClient mongoClient = new MongoClient("192.168.1.44", 27017);
        database = mongoClient.getDatabase("db_news");
        collection = database.getCollection(MONGO_TABLE);
    }
    /**
     * Save news documents to HBase, paging through the MongoDB result set.
     * @param startTime  start of the time window (seconds)
     * @param endTime    end of the time window (seconds)
     * @param collection MongoDB collection to read from
     * @param count      number of matching documents
     * @throws Exception
     */
    public void saveDataToHbase(long startTime, long endTime, MongoCollection<Document> collection, long count) throws Exception {
        int pageSize = 100;
        long totalPage = (long) Math.ceil((double) count / pageSize);
        Whitelist html_filter = Whitelist.none();
        for (int page = 0; page < totalPage; page++) {
            MongoCursor<Document> cursor = collection.find(Filters.and(Filters.gte("publish_timestamp", startTime),
                    Filters.lt("publish_timestamp", endTime), Filters.eq("status", 1))).sort(Sorts.descending("publish_timestamp"))
                    .skip(page * pageSize).limit(pageSize).iterator();
            List<Mutation> mutations = new ArrayList<Mutation>();
            while (cursor.hasNext()) {
                // Read the MongoDB document
                Document doc = cursor.next();
                String id = doc.getString("id");
                String title = doc.getString("title");
                String content = doc.getString("content");
                content = Jsoup.clean(content, html_filter); // strip HTML tags from the article body
                String publishtime = doc.getString("publishtime");
                String source = doc.getString("source");
                Long publish_timestamp = doc.getLong("publish_timestamp");
                String rowKey = id;
                String url = doc.getString("url");
                Long status = Double.valueOf(doc.get("status").toString()).longValue();
                // Build the HBase Put, one column per field
                Put put = new Put(rowKey.getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "id".getBytes(), id.getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "title".getBytes(), title.getBytes("UTF-8"));
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "content".getBytes(), content.getBytes("UTF-8"));
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "publishtime".getBytes(), publishtime.getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "source".getBytes(), source.getBytes("UTF-8"));
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "publish_timestamp".getBytes(), publish_timestamp.toString().getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "url".getBytes(), url.getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "status".getBytes(), status.toString().getBytes());
                mutations.add(put);
                // Flush in batches so the mutation buffer does not grow unbounded
                if (mutations.size() >= 1000) {
                    hbaseMutatorTable.mutate(mutations);
                    hbaseMutatorTable.flush();
                    mutations.clear();
                }
            }
            cursor.close();
            if (mutations.size() > 0) {
                hbaseMutatorTable.mutate(mutations);
                hbaseMutatorTable.flush();
                mutations.clear();
            }
        }
    }
    public void run(int dayCount) {
        Date currentDate = new Date();
        Calendar startCalendar = Calendar.getInstance();
        startCalendar.setTime(currentDate);
        startCalendar.set(Calendar.HOUR_OF_DAY, 0);
        startCalendar.set(Calendar.MINUTE, 0);
        startCalendar.set(Calendar.SECOND, 0);
        startCalendar.set(Calendar.MILLISECOND, 0);
        Calendar endCalendar = Calendar.getInstance();
        Calendar stopCalendar = Calendar.getInstance();
        stopCalendar.setTime(startCalendar.getTime());
        stopCalendar.add(Calendar.DATE, -dayCount);
        try {
            // Walk backwards one day at a time until the stop date is reached
            while (startCalendar.getTimeInMillis() > stopCalendar.getTimeInMillis()) {
                Date date = startCalendar.getTime();
                endCalendar.setTime(date);
                endCalendar.add(Calendar.DATE, 1);
                long startTime = startCalendar.getTimeInMillis() / 1000;
                long endTime = endCalendar.getTimeInMillis() / 1000;
                long count = collection.count(Filters.and(
                        Filters.gte("publish_timestamp", startTime),
                        Filters.lt("publish_timestamp", endTime),
                        Filters.eq("status", 1)));
                if (count > 0) {
                    saveDataToHbase(startTime, endTime, collection, count);
                }
                startCalendar.add(Calendar.DATE, -1);
            }
        } catch (Exception e) {
            log.error("Failed to save data====", e);
        }
    }

    public static void main(String... args) throws Exception {
        MongoHbase mongoHbase = new MongoHbase();
        mongoHbase.run(7); // one week of data
    }
}
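MongoHbase writes into the news table with a column family named column and assumes that table already exists. A minimal sketch for creating it through the HBase 1.2 Admin API (the class name CreateNewsTable is ours, for illustration only):

package com.spark.test.export;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateNewsTable {
    public static void main(String... args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "192.168.1.45");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            TableName tableName = TableName.valueOf("news");
            if (!admin.tableExists(tableName)) {
                // One column family named "column", matching HBASE_COLUMN_NAME in MongoHbase
                HTableDescriptor table = new HTableDescriptor(tableName);
                table.addFamily(new HColumnDescriptor("column"));
                admin.createTable(table);
            }
        }
    }
}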
Spark reads the HBase data for analysis (TF-IDF as an example)
package com.spark.test.export;

import com.spark.test.tfidf.zhuti.MongoData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

/**
 * Spark reads HBase data and computes statistics over it.
 */
public class SparkHbase {

    private static Logger log = Logger.getLogger(SparkHbase.class);
    private static SparkSession spark = null;
    public static String splitTag = "@==@";
    private static Configuration hbaseConfig;
    private static String HBASE_COLUMN_NAME = "column";
    private static String HBASE_TABLE = "news";

    /**
     * Initialize Spark.
     * @return the shared SparkSession
     */
    private static SparkSession initSpark(String name) {
        // Use Kryo serialization when caching
        System.setProperty("spark.serializer",
                "org.apache.spark.serializer.KryoSerializer");
        if (spark == null) {
            String os = System.getProperty("os.name").toLowerCase();
            log.info("os.name====" + os + " " + os.indexOf("windows"));
            // Running on Linux
            if (os.indexOf("windows") == -1) {
                spark = SparkSession
                        .builder()
                        .appName("Spark_" + name)
                        .getOrCreate();
            }
            // Running/debugging on Windows
            else {
                System.setProperty("hadoop.home.dir", "D:/hadoop/hadoop-2.6.4");
                System.setProperty("HADOOP_USER_NAME", "root");
                spark = SparkSession
                        .builder()
                        .appName("Spark_" + name).master("spark://spark-serv1:7077")
                        .getOrCreate();
            }
        }
        return spark;
    }
    private static void initHbaseTable() {
        if (hbaseConfig == null) {
            hbaseConfig = HBaseConfiguration.create();
            hbaseConfig.set("hbase.zookeeper.property.clientPort", "2181");
            hbaseConfig.set("hbase.zookeeper.quorum", "192.168.1.45");
        }
        try {
            hbaseConfig.set(TableInputFormat.INPUT_TABLE, HBASE_TABLE);
            // TableInputFormat expects the Scan serialized as a Base64 string
            Scan scan = new Scan();
            ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
            String scanToString = Base64.encodeBytes(proto.toByteArray());
            hbaseConfig.set(TableInputFormat.SCAN, scanToString);
        } catch (Exception e) {
            log.error("initHbaseTable error=========", e);
        }
    }
    /**
     * Read news rows from HBase into a DataFrame.
     * @param type all (full data, last year) | delta (incremental data, last week)
     * @return dataset of news rows
     */
    private static Dataset<Row> readHbase(final String type) throws Exception {
        log.info("spark.sparkContext().defaultParallelism()=====" + spark.sparkContext().defaultParallelism());
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaPairRDD<ImmutableBytesWritable, Result> javaPairRDD = jsc.newAPIHadoopRDD(hbaseConfig, TableInputFormat.class,
                ImmutableBytesWritable.class, Result.class);
        // Filter HBase rows by publish timestamp
        JavaPairRDD<ImmutableBytesWritable, Result> filterRDD = javaPairRDD.filter(new Function<Tuple2<ImmutableBytesWritable, Result>, Boolean>() {
            public Boolean call(Tuple2<ImmutableBytesWritable, Result> tuple2) throws Exception {
                Result result = tuple2._2();
                byte[] contentBytes = result.getValue(HBASE_COLUMN_NAME.getBytes(), "content".getBytes());
                byte[] timeBytes = result.getValue(HBASE_COLUMN_NAME.getBytes(), "publish_timestamp".getBytes());
                if (contentBytes == null || timeBytes == null || new String(contentBytes, "UTF-8").trim().equals("")) {
                    return false;
                }
                long time = Long.valueOf(new String(timeBytes));
                // Full data: everything published within the last year
                if ("all".equals(type) && time > (System.currentTimeMillis() / 1000 - 60 * 60 * 24 * 365)) {
                    return true;
                }
                // Incremental data: everything published within the last week
                if ("delta".equals(type) && time > (System.currentTimeMillis() / 1000 - 60 * 60 * 24 * 7)) {
                    return true;
                }
                return false;
            }
        });
        filterRDD = filterRDD.repartition(10);
        JavaRDD<MongoData> javaRdd = filterRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, MongoData>() {
            public MongoData call(Tuple2<ImmutableBytesWritable, Result> tuple2) throws Exception {
                Result result = tuple2._2();
                MongoData mongoData = new MongoData();
                mongoData.setId(getString(result, "id"));
                mongoData.setTitle(getString(result, "title"));
                mongoData.setSource(getString(result, "source"));
                mongoData.setSegment(getString(result, "content"));
                mongoData.setPublish_timestamp(getString(result, "publish_timestamp"));
                mongoData.setUrl(getString(result, "url"));
                mongoData.setData_type(getString(result, "data_type"));
                mongoData.setSummary(getString(result, "summary"));
                return mongoData;
            }
        });
        Dataset<Row> dataset = spark.createDataFrame(javaRdd, MongoData.class);
        // dataset.show();
        return dataset;
    }

    // Null-safe read of a string cell; returns "" when the column is missing
    // (e.g. data_type and summary are not written by MongoHbase)
    private static String getString(Result result, String qualifier) {
        byte[] value = result.getValue(HBASE_COLUMN_NAME.getBytes(), qualifier.getBytes());
        return value == null ? "" : new String(value);
    }
    /**
     * Compute TF-IDF features over the pre-segmented text.
     */
    private static Dataset<Row> tfidf(Dataset<Row> dataset) {
        // Split the whitespace-separated segments into words
        Tokenizer tokenizer = new Tokenizer().setInputCol("segment").setOutputCol("words");
        Dataset<Row> wordsData = tokenizer.transform(dataset);
        // Term frequencies via the hashing trick (default number of features)
        HashingTF hashingTF = new HashingTF()
                .setInputCol("words")
                .setOutputCol("rawFeatures");
        Dataset<Row> featurizedData = hashingTF.transform(wordsData);
        // Rescale term frequencies by inverse document frequency
        IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
        IDFModel idfModel = idf.fit(featurizedData);
        Dataset<Row> rescaledData = idfModel.transform(featurizedData);
        return rescaledData;
    }
    public static void run(String type) {
        initSpark(type);
        initHbaseTable();
        try {
            Dataset<Row> dataset = readHbase(type);
            Dataset<Row> tfidfDataset = tfidf(dataset);
            // Save the results to HDFS; repartition(1) produces a single output file.
            // text() supports only a single string column, so the vector columns
            // are written as Parquet instead.
            tfidfDataset.select("title", "words", "rawFeatures", "features", "segment")
                    .repartition(1).write().parquet("hdfs://192.168.1.45:9000/tfidf");
        } catch (Exception e) {
            log.error("Failed to process data===", e);
        }
    }

    public static void main(String... args) throws Exception {
        run("all");
        // run("delta");
    }
}
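Since the results are written as Parquet above, they can be read straight back into a DataFrame for a quick sanity check. A short sketch (the class name ReadTfidf is ours; it assumes the same HDFS path as above):

package com.spark.test.export;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadTfidf {
    public static void main(String... args) {
        SparkSession spark = SparkSession.builder().appName("ReadTfidf").getOrCreate();
        // Read back the TF-IDF results written by SparkHbase
        Dataset<Row> tfidf = spark.read().parquet("hdfs://192.168.1.45:9000/tfidf");
        tfidf.printSchema(); // title, words, rawFeatures, features, segment
        tfidf.select("title", "features").show(5, false);
        spark.stop();
    }
}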