Hi:
Xiaoming: Sir, I've got a requirement: generate the full set of phone numbers from 420,000 phone number segments.
Daniu: What?! That's more than 4.2 billion phone numbers to generate.
Xiaoming: Yes, and each time we also need to randomly extract ten million of them, and the second extraction must not contain any phone number from the previous one.
Daniu: Shit. Good luck.
Thinking time...
Daniu: To hell with it, just run one big loop. Then for extraction, scan from the beginning and randomly skip a few dozen rows at a time.
Xiaoming: How long would that take? They need it tomorrow.
Daniu: Then use Hive bucketing. Doesn't Hive: The Definitive Guide say bucketing is ideal for sampling? Put the sampled data into another Hive table and join the two to dedupe.
Xiaoming: Hmm, that could work. Daniu, you're the man. But the ops people are lazy: they want to click a button on a page and export the result directly.
Daniu: OK, fine, then here's the plan: first generate HFiles with Spark/MapReduce, slam them into HBase with a single command, then extract samples with a RandomRowFilter, random factor 0.005f, i.e. roughly 5 phone numbers picked at random out of every 1,000, and stream the resulting CSV straight to their browser.
Xiaoming: Wow, that's great. But how do we implement it?
Daniu: Give me the length of one song~
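A quick sanity check on that factor: 0.005 × 4.2 billion rows ≈ 21 million hits per full scan, comfortably above the ten million needed, so a single scan with an early cut-off at ten million rows is enough.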
1. CreateHfile:
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue}
import org.apache.spark.{SparkConf, SparkContext}

object CreateHfile {
  def main(args: Array[String]): Unit = {
    // args(0): Spark master, args(1): input file of phone segments, args(2): HFile output dir
    val conf = new SparkConf().setAppName("CreateHfile").setMaster(args(0))
    val sc = new SparkContext(conf)
    val hbaseConf = HBaseConfiguration.create()
    val rdd = sc.textFile(args(1))
      // expand each segment with every 4-digit suffix 0000-9999
      // (0 to 9999, not 1 to 9999, or the "0000" suffix of every segment is lost)
      .flatMap(v => (0 to 9999).map(a => v + "%04d".format(a)))
      // HFiles must be written in sorted row-key order
      .sortBy(v => v)
      .map(r => (new ImmutableBytesWritable(Bytes.toBytes(r)),
        new KeyValue(Bytes.toBytes(r), Bytes.toBytes("phoneFamliy"), Bytes.toBytes("phoneCol"), Bytes.toBytes(1))))
    rdd.saveAsNewAPIHadoopFile(args(2), classOf[ImmutableBytesWritable], classOf[KeyValue], classOf[HFileOutputFormat2], hbaseConf)
    sc.stop()
  }
}
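To make the input format concrete, here is a minimal sketch of the expansion for a single segment (the prefix value here is made up):

// hypothetical line from Phone.txt: a 7-digit segment prefix
val prefix = "1380013"
val numbers = (0 to 9999).map(a => prefix + "%04d".format(a))
// numbers.head == "13800130000", numbers.last == "13800139999"
// 420,000 segments x 10,000 suffixes = 4.2 billion 11-digit numbers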
2. Generate the HFiles. Run (arguments: master, segment file, HFile output path):
spark-submit --master yarn --class com.dw.spark.CreateHfile /home/hdfs/dp/createPhone.jar yarn hdfs://xxx.xxx.xxx.xxx:8020/user/Phone.txt hdfs://xxx.xxx.xxx.xxx:8020/user/phoneout0401_cre/
3. Create the HBase table:
create 'dp_phone_42yi', {NAME => 'phoneFamliy', VERSIONS => 5}
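Optionally, pre-splitting the table keeps a 4.2-billion-row bulk load from piling into a single region. A sketch, assuming every row key is an 11-digit number starting with 1:

create 'dp_phone_42yi', {NAME => 'phoneFamliy', VERSIONS => 5}, {SPLITS => ['11','12','13','14','15','16','17','18','19']}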
4. Slam it into HBase. Run:
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /user/phoneout0401_cre dp_phone_42yi
5. Time to witness the miracle:
scan 'dp_phone_42yi', {LIMIT => 100}
6. Randomly extract ten million numbers:
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.RandomRowFilter;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* https://blog.csdn.net/u013870094/article/details/80091256
* Created by geo on 2019/3/29. */
public class HbaseService {
    private final Config config = ConfigFactory.load().getConfig("main");
    private static Connection conn = null;

    public HbaseService() {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", config.getString("hbase_zookeeper_quorum"));
        conf.set("hbase.rootdir", config.getString("hbase_rootdir"));
        try {
            conn = ConnectionFactory.createConnection(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * RandomRowFilter: a chance below 0 returns no rows and above 1 returns all rows;
     * for random sampling the useful range is 0~1, where the value is the probability of a row being returned.
     */
    public void RandomRowFilter(String tableName, String output) throws Exception {
        List<String> phoneList = new ArrayList<String>();
        long num = 0L;
        Table table = conn.getTable(TableName.valueOf(tableName));
        // the constructor argument must be between 0 and 1: negative filters everything out,
        // greater than 1 lets everything through; e.g. 0.2f gives each row a 20% chance of passing
        Filter filter = new RandomRowFilter(Float.parseFloat(config.getString("random")));
        Scan scan = new Scan();
        scan.setFilter(filter);
        // note: setMaxResultSize caps the scanner's result size in bytes, not in rows
        scan.setMaxResultSize(Integer.parseInt(config.getString("maxResultSize")));
        ResultScanner results = table.getScanner(scan);
        for (Result result : results) {
            for (Cell cell : result.rawCells()) {
                // family, qualifier and value could be read the same way via
                // cell.getFamilyArray(), cell.getQualifierArray() and cell.getValueArray();
                // here we only need the row key, which is the phone number itself
                String r = Bytes.toString(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength());
                num++;
                System.out.println(num);
                phoneList.add(r);
            }
            // stop once phoneCount (ten million) numbers have been extracted
            if (num >= Integer.parseInt(config.getString("phoneCount"))) break;
            // flush to file and delete from HBase in batches of 10,000,
            // so the next extraction can never return the same numbers
            if (phoneList.size() > 10000) {
                writeFileContext(phoneList, output);
                delPhoneList(table, phoneList);
                phoneList.clear();
            }
        }
        writeFileContext(phoneList, output);
        delPhoneList(table, phoneList);
        table.close();
        conn.close();
        // marker file signalling that the export is complete
        File fileOK = new File(output + "phonelist.txt_OK");
        if (!fileOK.isFile()) {
            fileOK.createNewFile();
        }
    }
    // batch-delete the sampled rows so that later extractions never see them again
    private void delPhoneList(Table table, List<String> phoneList) throws Exception {
        List<Delete> deleteList = new ArrayList<Delete>();
        for (String phone : phoneList) {
            deleteList.add(new Delete(Bytes.toBytes(phone)));
        }
        table.delete(deleteList);
    }
    /**
     * Write the list to a txt file, one phone number per line
     * @param strings phone list
     * @param path output path
     * @throws Exception on I/O failure
     */
    private void writeFileContext(List<String> strings, String path) throws Exception {
        File file = new File(path + "phonelist.txt");
        // create the file if it does not exist yet
        if (!file.isFile()) {
            file.createNewFile();
        }
        BufferedWriter writer = new BufferedWriter(new FileWriter(file, true));
        for (String phone : strings) {
            writer.write(phone + "\n");
        }
        writer.close();
    }
}
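Everything above is driven by a Typesafe Config block named main on the classpath. A sample application.conf; the quorum address and the two size values are placeholder assumptions:

main {
  hbase_zookeeper_quorum = "xxx.xxx.xxx.xxx"          # ZooKeeper quorum of the HBase cluster (placeholder)
  hbase_rootdir = "hdfs://xxx.xxx.xxx.xxx:8020/hbase" # placeholder
  random = "0.005"          # RandomRowFilter chance: ~5 of every 1,000 rows pass
  maxResultSize = "2097152" # Scan.setMaxResultSize cap in bytes (assumed value)
  phoneCount = "10000000"   # stop after ten million numbers
}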
Entry point:
public class Application {
    public static void main(String[] args) throws Exception {
        HbaseService hbaseService = new HbaseService();
        String tableName = args[0];
        String output = args[1];
        hbaseService.RandomRowFilter(tableName, output);
    }
}
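A run sketch, with an assumed jar name and output directory:

java -jar getPhone.jar dp_phone_42yi /home/hdfs/dp/out/
# args[0] = HBase table, args[1] = directory receiving phonelist.txt and the phonelist.txt_OK marker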
Daniu: Good to go now?
Xiaoming: They've raised another requirement: from the 4.2 billion full set, delete the 300 million phone numbers we already hold. Need a plan from you again, sir.
Daniu: Same principle. Read on.
DeleteHfile:
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by geo on 2019/3/27. */
object DeleteHfile {
  def main(args: Array[String]): Unit = {
    // args(0): Spark master, args(1): input file of segments to purge, args(2): HFile output dir
    val conf = new SparkConf().setAppName("DeleteHfile").setMaster(args(0))
    val sc = new SparkContext(conf)
    val hbaseConf = HBaseConfiguration.create()
    val rdd = sc.textFile(args(1))
      // expand each segment with every 4-digit suffix 0000-9999, exactly as in CreateHfile
      .flatMap(v => (0 to 9999).map(a => v + "%04d".format(a)))
      // HFiles must be written in sorted row-key order
      .sortBy(v => v)
      // DeleteColumn KeyValues become tombstones once bulk-loaded, wiping the existing cells
      .map(r => (new ImmutableBytesWritable(Bytes.toBytes(r)),
        new KeyValue(Bytes.toBytes(r), Bytes.toBytes("phoneFamliy"), Bytes.toBytes("phoneCol"), System.currentTimeMillis(), KeyValue.Type.DeleteColumn)))
    rdd.saveAsNewAPIHadoopFile(args(2), classOf[ImmutableBytesWritable], classOf[KeyValue], classOf[HFileOutputFormat2], hbaseConf)
    sc.stop()
  }
}
Daniu: Then load it into HBase exactly as above and you're done; a sketch of the two commands follows.
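The load mirrors steps 2 and 4; the class, jar and paths below are assumptions modeled on the earlier commands (StockPhone.txt is a hypothetical input listing the stock segments):

spark-submit --master yarn --class com.dw.spark.DeleteHfile /home/hdfs/dp/createPhone.jar yarn hdfs://xxx.xxx.xxx.xxx:8020/user/StockPhone.txt hdfs://xxx.xxx.xxx.xxx:8020/user/phoneout0401_del/
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /user/phoneout0401_del dp_phone_42yi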
Xiaoming: You got it, sir. One timid question: could you paste the pom files?
Daniu: Sure, but you'll have to pick the jars matching your own Spark and Scala versions. If you hit a jar conflict, check the Maven dependency tree (shortcut Ctrl+Alt+Shift+U), run a find, and you'll see it.
pom (Spark HFile jobs):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>geo.bigdata</groupId>
<artifactId>dw</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<scala.version>2.10.5</scala.version>
<spark.version>1.6.3</spark.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.5</version>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>jsp-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.1.2</version>
<!-- exclude conflicting logging and servlet dependencies -->
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>jsp-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.4.8</version>
<!-- exclude conflicting logging and servlet dependencies -->
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>jsp-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- spark begin-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>${spark.version}</version>
<!-- Spark uses log4j by default; we want logback, so Spark's slf4j/log4j bindings are removed -->
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
<exclusion>
<artifactId>slf4j-api</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty.orbit</groupId>
<artifactId>javax.servlet</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
<version>3.0.1</version>
</dependency>
<!-- spark end-->
<!-- https://mvnrepository.com/artifact/org.scala-lang/scala-library -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>com.thoughtworks.paranamer</groupId>
<artifactId>paranamer</artifactId>
<version>2.8</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<id>eclipse-add-source</id>
<goals>
<goal>add-source</goal>
</goals>
</execution>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile-first</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-unchecked</arg>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<!-- the class with the main-method entry point -->
<mainClass>com.geotmt.dw.spark.CreateHfile</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>assembly</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- the plugin element holds the information needed to describe a plugin -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<!-- the Compiler plugin compiles the source and unit-test code -->
<artifactId>maven-compiler-plugin</artifactId>
<version>3.3</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<!-- fixes the "unmappable character for encoding GBK" warning -->
<encoding>utf-8</encoding>
</configuration>
</plugin>
</plugins>
</build>
</project>
pom (HBase extraction app):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dpTools</artifactId>
<groupId>com.geotmt.dp</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>getPhone</artifactId>
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
<skipTests>true</skipTests>
<spring.data.hadoop.version>2.5.0.RELEASE</spring.data.hadoop.version>
<hadoop.version>2.6.5</hadoop.version>
<hbase.client.version>1.4.8</hbase.client.version>
<spring.boot.version>1.4.7.RELEASE</spring.boot.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.client.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe</groupId>
<artifactId>config</artifactId>
<version>1.2.1</version>
</dependency>
</dependencies>
<!-- packaging profiles begin -->
<profiles>
<profile>
<id>dev</id>
<properties>
<profileActive>dev</profileActive>
</properties>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
</profile>
<profile>
<id>prod</id>
<properties>
<profileActive>prod</profileActive>
</properties>
</profile>
<profile>
<id>test</id>
<properties>
<profileActive>test</profileActive>
</properties>
</profile>
</profiles>
<!-- packaging profiles end -->
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
<filtering>true</filtering>
<excludes>
<exclude>dev/*</exclude>
<exclude>prod/*</exclude>
<exclude>test/*</exclude>
</excludes>
</resource>
<resource>
<directory>src/main/resources/${profileActive}</directory>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<!-- the class with the main-method entry point -->
<mainClass>com.geotmt.dp.Application</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>assembly</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- the plugin element holds the information needed to describe a plugin -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<!-- the Compiler plugin compiles the source and unit-test code -->
<artifactId>maven-compiler-plugin</artifactId>
<version>3.3</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<!-- fixes the "unmappable character for encoding GBK" warning -->
<encoding>utf-8</encoding>
</configuration>
</plugin>
</plugins>
</build>
</project>