Managing the dependencies with Maven (pom.xml)
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.spark.test</groupId>
    <artifactId>spark_test</artifactId>
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>
    <name>spark_test</name>
    <url>http://maven.apache.org</url>
    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.4</version>
        </dependency>
        <!--
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-spark</artifactId>
            <version>1.2.0-cdh5.9.0</version>
        </dependency>
        -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.4</version>
        </dependency>
        <!--
        <dependency>
            <groupId>org.lionsoul</groupId>
            <artifactId>jcseg-core</artifactId>
            <version>2.1.0</version>
        </dependency>
        -->
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongodb-driver</artifactId>
            <version>3.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
        <!--
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.21</version>
        </dependency>
        -->
        <dependency>
            <groupId>commons-dbutils</groupId>
            <artifactId>commons-dbutils</artifactId>
            <version>1.6</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>6.0.5</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
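Note that all the Spark artifacts carry the _2.11 suffix, i.e. they are built against Scala 2.11; every Spark dependency on one classpath must share the same Scala suffix and Spark version, and hadoop-client should match the Hadoop release of your cluster, otherwise you can hit hard-to-diagnose NoSuchMethodError failures at runtime.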
Import the MongoDB data into HBase so that Spark can analyze it
package com.spark.test.export;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.Sorts;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.log4j.Logger;
import org.bson.Document;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
/**
 * Save MongoDB data to HBase.
 */
public class MongoHbase {

    private static Logger log = Logger.getLogger(MongoHbase.class);
    private static String HBASE_COLUMN_NAME = "column";
    private static String HBASE_TABLE = "news";
    private static String MONGO_TABLE = "news";

    private Configuration hbaseConfig;
    private MongoDatabase database;
    private MongoCollection<Document> collection;
    private BufferedMutator hbaseMutatorTable;

    public MongoHbase() throws Exception {
        initHbase();
        initMongodb();
    }

    private void initHbase() throws Exception {
        hbaseConfig = HBaseConfiguration.create();
        hbaseConfig.set("hbase.zookeeper.property.clientPort", "2181");
        hbaseConfig.set("hbase.zookeeper.quorum", "192.168.1.45");
        // Open a connection and create a BufferedMutator for batched writes to the news table
        Connection connection = ConnectionFactory.createConnection(hbaseConfig);
        hbaseMutatorTable = connection.getBufferedMutator(TableName.valueOf(HBASE_TABLE));
    }

    private void initMongodb() {
        MongoClient mongoClient = new MongoClient("192.168.1.44", 27017);
        database = mongoClient.getDatabase("db_news");
        collection = database.getCollection(MONGO_TABLE);
    }
    /**
     * Save news documents to HBase, paging through the MongoDB result set.
     * @param startTime  start of the time window (seconds)
     * @param endTime    end of the time window (seconds)
     * @param collection MongoDB collection to read from
     * @param count      number of matching documents
     * @throws Exception
     */
    public void saveDataToHbase(long startTime, long endTime, MongoCollection<Document> collection, long count) throws Exception {
        int pageSize = 100;
        long totalPage = (long) Math.ceil((double) count / pageSize);
        Whitelist html_filter = Whitelist.none();
        for (int page = 0; page < totalPage; page++) {
            MongoCursor<Document> cursor = collection.find(Filters.and(Filters.gte("publish_timestamp", startTime),
                    Filters.lt("publish_timestamp", endTime), Filters.eq("status", 1))).sort(Sorts.descending("publish_timestamp"))
                    .skip(page * pageSize).limit(pageSize).iterator();
            List<Mutation> mutations = new ArrayList<Mutation>();
            while (cursor.hasNext()) {
                // Read the MongoDB document
                Document doc = cursor.next();
                String id = doc.getString("id");
                String title = doc.getString("title");
                String content = doc.getString("content");
                content = Jsoup.clean(content, html_filter); // strip HTML tags from the article body
                String publishtime = doc.getString("publishtime");
                String source = doc.getString("source");
                Long publish_timestamp = doc.getLong("publish_timestamp");
                String rowKey = id;
                String url = doc.getString("url");
                Long status = Double.valueOf(doc.get("status").toString()).longValue();
                // Build the HBase Put, one column per field
                Put put = new Put(rowKey.getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "id".getBytes(), id.getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "title".getBytes(), title.getBytes("UTF-8"));
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "content".getBytes(), content.getBytes("UTF-8"));
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "publishtime".getBytes(), publishtime.getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "source".getBytes(), source.getBytes("UTF-8"));
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "publish_timestamp".getBytes(), publish_timestamp.toString().getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "url".getBytes(), url.getBytes());
                put.addColumn(HBASE_COLUMN_NAME.getBytes(), "status".getBytes(), status.toString().getBytes());
                mutations.add(put);
                // Flush in batches so the mutation buffer does not grow unbounded
                if (mutations.size() >= 1000) {
                    hbaseMutatorTable.mutate(mutations);
                    hbaseMutatorTable.flush();
                    mutations.clear();
                }
            }
            cursor.close();
            if (mutations.size() > 0) {
                hbaseMutatorTable.mutate(mutations);
                hbaseMutatorTable.flush();
                mutations.clear();
            }
        }
    }
    public void run(int dayCount) {
        Date currentDate = new Date();
        Calendar startCalendar = Calendar.getInstance();
        startCalendar.setTime(currentDate);
        startCalendar.set(Calendar.HOUR_OF_DAY, 0);
        startCalendar.set(Calendar.MINUTE, 0);
        startCalendar.set(Calendar.SECOND, 0);
        startCalendar.set(Calendar.MILLISECOND, 0);
        Calendar endCalendar = Calendar.getInstance();
        Calendar stopCalendar = Calendar.getInstance();
        stopCalendar.setTime(startCalendar.getTime());
        stopCalendar.add(Calendar.DATE, -dayCount);
        try {
            // Walk backwards one day at a time until the stop date is reached
            while (startCalendar.getTimeInMillis() > stopCalendar.getTimeInMillis()) {
                Date date = startCalendar.getTime();
                endCalendar.setTime(date);
                endCalendar.add(Calendar.DATE, 1);
                long startTime = startCalendar.getTimeInMillis() / 1000;
                long endTime = endCalendar.getTimeInMillis() / 1000;
                long count = collection.count(Filters.and(
                        Filters.gte("publish_timestamp", startTime),
                        Filters.lt("publish_timestamp", endTime),
                        Filters.eq("status", 1)));
                if (count > 0) {
                    saveDataToHbase(startTime, endTime, collection, count);
                }
                startCalendar.add(Calendar.DATE, -1);
            }
        } catch (Exception e) {
            log.error("Failed to save data====", e);
        }
    }

    public static void main(String... args) throws Exception {
        MongoHbase mongoHbase = new MongoHbase();
        mongoHbase.run(7); // one week of data
    }
}
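MongoHbase writes into the news table with a column family named column and assumes that table already exists. A minimal sketch for creating it through the HBase 1.2 Admin API (the class name CreateNewsTable is ours, for illustration only):

package com.spark.test.export;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateNewsTable {
    public static void main(String... args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "192.168.1.45");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            TableName tableName = TableName.valueOf("news");
            if (!admin.tableExists(tableName)) {
                // One column family named "column", matching HBASE_COLUMN_NAME in MongoHbase
                HTableDescriptor table = new HTableDescriptor(tableName);
                table.addFamily(new HColumnDescriptor("column"));
                admin.createTable(table);
            }
        }
    }
}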
Spark reads the HBase data for analysis (TF-IDF as an example)
package com.spark.test.export;

import com.spark.test.tfidf.zhuti.MongoData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

/**
 * Spark reads HBase data and computes statistics over it.
 */
public class SparkHbase {

    private static Logger log = Logger.getLogger(SparkHbase.class);
    private static SparkSession spark = null;
    public static String splitTag = "@==@";
    private static Configuration hbaseConfig;
    private static String HBASE_COLUMN_NAME = "column";
    private static String HBASE_TABLE = "news";

    /**
     * Initialize Spark.
     * @return the shared SparkSession
     */
    private static SparkSession initSpark(String name) {
        // Use Kryo serialization when caching
        System.setProperty("spark.serializer",
                "org.apache.spark.serializer.KryoSerializer");
        if (spark == null) {
            String os = System.getProperty("os.name").toLowerCase();
            log.info("os.name====" + os + " " + os.indexOf("windows"));
            // Running on Linux
            if (os.indexOf("windows") == -1) {
                spark = SparkSession
                        .builder()
                        .appName("Spark_" + name)
                        .getOrCreate();
            }
            // Running/debugging on Windows
            else {
                System.setProperty("hadoop.home.dir", "D:/hadoop/hadoop-2.6.4");
                System.setProperty("HADOOP_USER_NAME", "root");
                spark = SparkSession
                        .builder()
                        .appName("Spark_" + name).master("spark://spark-serv1:7077")
                        .getOrCreate();
            }
        }
        return spark;
    }
    private static void initHbaseTable() {
        if (hbaseConfig == null) {
            hbaseConfig = HBaseConfiguration.create();
            hbaseConfig.set("hbase.zookeeper.property.clientPort", "2181");
            hbaseConfig.set("hbase.zookeeper.quorum", "192.168.1.45");
        }
        try {
            hbaseConfig.set(TableInputFormat.INPUT_TABLE, HBASE_TABLE);
            // TableInputFormat expects the Scan serialized as a Base64 string
            Scan scan = new Scan();
            ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
            String scanToString = Base64.encodeBytes(proto.toByteArray());
            hbaseConfig.set(TableInputFormat.SCAN, scanToString);
        } catch (Exception e) {
            log.error("initHbaseTable error=========", e);
        }
    }
    /**
     * Read news rows from HBase into a DataFrame.
     * @param type all (full data, last year) | delta (incremental data, last week)
     * @return dataset of news rows
     */
    private static Dataset<Row> readHbase(final String type) throws Exception {
        log.info("spark.sparkContext().defaultParallelism()=====" + spark.sparkContext().defaultParallelism());
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaPairRDD<ImmutableBytesWritable, Result> javaPairRDD = jsc.newAPIHadoopRDD(hbaseConfig, TableInputFormat.class,
                ImmutableBytesWritable.class, Result.class);
        // Filter HBase rows by publish timestamp
        JavaPairRDD<ImmutableBytesWritable, Result> filterRDD = javaPairRDD.filter(new Function<Tuple2<ImmutableBytesWritable, Result>, Boolean>() {
            public Boolean call(Tuple2<ImmutableBytesWritable, Result> tuple2) throws Exception {
                Result result = tuple2._2();
                byte[] contentBytes = result.getValue(HBASE_COLUMN_NAME.getBytes(), "content".getBytes());
                byte[] timeBytes = result.getValue(HBASE_COLUMN_NAME.getBytes(), "publish_timestamp".getBytes());
                if (contentBytes == null || timeBytes == null || new String(contentBytes, "UTF-8").trim().equals("")) {
                    return false;
                }
                long time = Long.valueOf(new String(timeBytes));
                // Full data: everything published within the last year
                if ("all".equals(type) && time > (System.currentTimeMillis() / 1000 - 60 * 60 * 24 * 365)) {
                    return true;
                }
                // Incremental data: everything published within the last week
                if ("delta".equals(type) && time > (System.currentTimeMillis() / 1000 - 60 * 60 * 24 * 7)) {
                    return true;
                }
                return false;
            }
        });
        filterRDD = filterRDD.repartition(10);
        JavaRDD<MongoData> javaRdd = filterRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, MongoData>() {
            public MongoData call(Tuple2<ImmutableBytesWritable, Result> tuple2) throws Exception {
                Result result = tuple2._2();
                MongoData mongoData = new MongoData();
                mongoData.setId(getString(result, "id"));
                mongoData.setTitle(getString(result, "title"));
                mongoData.setSource(getString(result, "source"));
                mongoData.setSegment(getString(result, "content"));
                mongoData.setPublish_timestamp(getString(result, "publish_timestamp"));
                mongoData.setUrl(getString(result, "url"));
                mongoData.setData_type(getString(result, "data_type"));
                mongoData.setSummary(getString(result, "summary"));
                return mongoData;
            }
        });
        Dataset<Row> dataset = spark.createDataFrame(javaRdd, MongoData.class);
        // dataset.show();
        return dataset;
    }

    // Null-safe read of a string cell; returns "" when the column is missing
    // (e.g. data_type and summary are not written by MongoHbase)
    private static String getString(Result result, String qualifier) {
        byte[] value = result.getValue(HBASE_COLUMN_NAME.getBytes(), qualifier.getBytes());
        return value == null ? "" : new String(value);
    }
    /**
     * Compute TF-IDF features over the pre-segmented text.
     */
    private static Dataset<Row> tfidf(Dataset<Row> dataset) {
        // Split the whitespace-separated segments into words
        Tokenizer tokenizer = new Tokenizer().setInputCol("segment").setOutputCol("words");
        Dataset<Row> wordsData = tokenizer.transform(dataset);
        // Term frequencies via the hashing trick (default number of features)
        HashingTF hashingTF = new HashingTF()
                .setInputCol("words")
                .setOutputCol("rawFeatures");
        Dataset<Row> featurizedData = hashingTF.transform(wordsData);
        // Rescale term frequencies by inverse document frequency
        IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
        IDFModel idfModel = idf.fit(featurizedData);
        Dataset<Row> rescaledData = idfModel.transform(featurizedData);
        return rescaledData;
    }
    public static void run(String type) {
        initSpark(type);
        initHbaseTable();
        try {
            Dataset<Row> dataset = readHbase(type);
            Dataset<Row> tfidfDataset = tfidf(dataset);
            // Save the results to HDFS; repartition(1) produces a single output file.
            // text() supports only a single string column, so the vector columns
            // are written as Parquet instead.
            tfidfDataset.select("title", "words", "rawFeatures", "features", "segment")
                    .repartition(1).write().parquet("hdfs://192.168.1.45:9000/tfidf");
        } catch (Exception e) {
            log.error("Failed to process data===", e);
        }
    }

    public static void main(String... args) throws Exception {
        run("all");
        // run("delta");
    }
}
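Since the results are written as Parquet above, they can be read straight back into a DataFrame for a quick sanity check. A short sketch (the class name ReadTfidf is ours; it assumes the same HDFS path as above):

package com.spark.test.export;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadTfidf {
    public static void main(String... args) {
        SparkSession spark = SparkSession.builder().appName("ReadTfidf").getOrCreate();
        // Read back the TF-IDF results written by SparkHbase
        Dataset<Row> tfidf = spark.read().parquet("hdfs://192.168.1.45:9000/tfidf");
        tfidf.printSchema(); // title, words, rawFeatures, features, segment
        tfidf.select("title", "features").show(5, false);
        spark.stop();
    }
}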