问题的另一种表述:
基于 Spark 实现用户行为路径分析,并穷举所有路径
(使用多叉树实现)
需求:有一张表,包含父节点、子节点两个字段 {"from","to"}。请从根节点出发,输出从根节点到每个叶子节点的所有可能路径。
这张表应该是这样的:
from to
a b
a c
a d
b e
b f
c g
c h
c i
... ...
另外,再输入一个参数,比方说是 c:
那么请把从 c 开始、c 到根节点的路径输出来(在上表中 c 的父节点是 a):
c->a
请把 c 到叶子节点的路径输出来,结果应该是这样的:
c->g
c->h
c->i
解题思路流程:
1.把两列数据合并成一张 n*1 的单列表,用 : 连接两列数据(内建函数即可)
a:b
a:c
a:d
...
2.把步骤1结果聚合成为一个 1*1 格子,使用 UDAF
a:b;a:c;a:d;...
3.把步骤2结果展开(explode)成一张 n*1 表,用 UDTF
c->g
c->h
c->i
c->a
4.最后一步,把步骤3结果用 c 去过滤。这样的好处是,只需要计算一次全路径,以后每一次过滤都非常快,消耗时间少
其他思路:
在每一步进行中都用目标参数 c 去过滤,不容易 OOM
hive数据准备
建表
drop table if exists path;
use wujw;
create table if not exists path(pid string,id string);//id是自己的id,pid是自己父节点的id
插入测试数据
insert into path values('a','b');
insert into path values('a','c');
insert into path values('b','d');
insert into path values('b','e');
insert into path values('c','f');
insert into path values('c','g');
insert into path values('d','h');
insert into path values('d','i');
insert into path values('e','j');
insert into path values('e','k');
insert into path values('f','l');
insert into path values('f','m');
insert into path values('g','n');
insert into path values('g','o');
insert into path values('h','p');
insert into path values('h','q');
insert into path values('i','r');
insert into path values('i','s');
insert into path values('j','t');
insert into path values('j','u');
insert into path values('k','v');
insert into path values('k','w');
insert into path values('l','x');
insert into path values('l','y');
insert into path values('m','z');
具体操作步骤:
1.如果有空值,必须打上空值记号
2.连接两列(注意:concat_ws(':',id,pid) 得到的是"子节点:父节点",例如 b:a):
drop table if exists relationship;
create table if not exists relationship(grp string, kv string);
insert into table relationship select '1',concat_ws(':',id,pid) from path;
3.把上一步(连接两列)的结果聚合成为一个 1*1 格子,可能使用 UDAF
select grp, kv from relationship;
select grp, concat_ws(',',collect_list(kv)) as result from relationship group by grp;//concat_ws不能用分号分割
(注意:这里用逗号分隔,而下文 ExplodeMap 示例按分号 ; 切分,实际使用时两处分隔符需保持一致)
drop table if exists result;
use wujw;
create table if not exists result(res string);
insert into table result select concat_ws(',',collect_list(kv)) as res from relationship ;
select res from result;
4.把上一步(聚合)的结果展开(explode)成一张 n*1 表,用 hive UDTF,表格内容是所有路径
pom代码:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build descriptor for the Hive UDF/UDTF jar.
  The shade plugin produces a single deployable jar during the `package` phase.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.creaway</groupId>
    <artifactId>UDFdev</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!--
          Hive and Hadoop classes are already on the cluster classpath at run time.
          `provided` keeps them available for compilation but out of the shaded jar,
          which avoids a very large artifact and version clashes with the cluster's own copies.
        -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.2.1</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.4</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.2</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <!-- Drop signature files of bundled jars; stale signatures make the merged jar fail verification. -->
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
UDTF参考: java
import java.util.ArrayList;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
 * Hive UDTF that explodes a serialized pair list such as {@code "a:b;a:c"} into
 * one two-column row ({@code col1}, {@code col2}) per {@code left:right} entry.
 *
 * <p>Entries are separated by {@code ';'} and the two fields of an entry by {@code ':'}.
 * Every forwarded row has exactly two columns, matching the struct declared in
 * {@link #initialize(ObjectInspector[])}: a missing second field becomes {@code null}
 * and fields beyond the second are ignored. Empty entries and a NULL input produce no rows.
 */
public class ExplodeMap extends GenericUDTF {
    /** Separator between the entries of the serialized list. */
    private static final String ENTRY_SEPARATOR = ";";
    /** Separator between the two fields of one entry. */
    private static final String FIELD_SEPARATOR = ":";

    /** Nothing is buffered: every row is forwarded directly from {@link #process(Object[])}. */
    @Override
    public void close() throws HiveException {
        // Intentionally empty.
    }

    /**
     * Validates the call signature and declares the output schema.
     *
     * @param args object inspectors of the call arguments; exactly one primitive is accepted
     * @return a struct inspector with two string columns named {@code col1} and {@code col2}
     * @throws UDFArgumentException if the argument count or category is wrong
     */
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args)
            throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("ExplodeMap takes only one argument");
        }
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("ExplodeMap takes string as a parameter");
        }
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("col1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("col2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Splits the input value and forwards one row per non-empty entry.
     *
     * @param args the call arguments; only {@code args[0]} is read
     * @throws HiveException if forwarding a row fails; such failures are propagated
     *                       instead of being silently dropped
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // A NULL column value has nothing to explode; emit no rows instead of failing.
        if (args == null || args.length == 0 || args[0] == null) {
            return;
        }
        String input = args[0].toString();
        for (String entry : input.split(ENTRY_SEPARATOR)) {
            String[] fields = entry.split(FIELD_SEPARATOR);
            // "" and ":" both split into nothing usable; skip them explicitly.
            if (entry.isEmpty() || fields.length == 0) {
                continue;
            }
            // Always forward exactly the two declared columns.
            String[] row = new String[2];
            row[0] = fields[0];
            row[1] = fields.length > 1 ? fields[1] : null;
            forward(row);
        }
    }
}
多叉树遍历路径参考:java
package com.creaway.UDF;
import java.util.*;
/**
 * A department record in a flat parent/child table, plus a small demo that
 * walks the resulting tree downwards (descendants) and upwards (ancestors).
 *
 * <p>Each record stores its own {@code id}, the {@code pid} of its parent and a
 * display {@code name}. In the demo data a {@code pid} of 0 is used by the
 * top-level departments.
 */
public class Department {
    private int id;
    private int pid;
    private String name;

    /**
     * @param id   identifier of this department
     * @param pid  identifier of the parent department
     * @param name display name
     */
    public Department(int id, int pid, String name) {
        this.id = id;
        this.pid = pid;
        this.name = name;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public int getPid() {
        return pid;
    }

    public void setPid(int pid) {
        this.pid = pid;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * Collects the ids of all departments below the given start ids.
     *
     * @param departments all known departments
     * @param startIds    ids whose descendants are wanted
     * @return ids of every direct or indirect child; a start id is only included
     *         when it is itself reachable from another start id
     */
    private static Set<Integer> getChildrenDepartment(List<Department> departments, Set<Integer> startIds) {
        // Index once (parent id -> child ids) instead of re-scanning the list per level.
        Map<Integer, List<Integer>> childrenByParent = new HashMap<>();
        for (Department department : departments) {
            addEdge(childrenByParent, department.getPid(), department.getId());
        }
        return reachableFrom(childrenByParent, startIds, null);
    }

    /**
     * Collects the parent ids found while walking upwards from the given start ids.
     *
     * @param departments all known departments
     * @param startIds    ids whose ancestors are wanted
     * @param topId       id at which the upward walk stops; it is included in the
     *                    result when reached but is not expanded further
     * @return every parent id reached, including a parent id that has no department
     *         record of its own (such as the pid of a top-level department)
     */
    private static Set<Integer> getParentDepartment(List<Department> departments, Set<Integer> startIds, int topId) {
        // Index once (department id -> parent ids); a list tolerates duplicated ids.
        Map<Integer, List<Integer>> parentsByChild = new HashMap<>();
        for (Department department : departments) {
            addEdge(parentsByChild, department.getId(), department.getPid());
        }
        return reachableFrom(parentsByChild, startIds, Integer.valueOf(topId));
    }

    /** Appends {@code to} to the adjacency list of {@code from}, creating the list on first use. */
    private static void addEdge(Map<Integer, List<Integer>> edges, int from, int to) {
        Integer key = Integer.valueOf(from);
        List<Integer> targets = edges.get(key);
        if (targets == null) {
            targets = new ArrayList<>();
            edges.put(key, targets);
        }
        targets.add(Integer.valueOf(to));
    }

    /**
     * Breadth-first walk over {@code edges}; each id is expanded at most once, so
     * the walk also terminates on cyclic data.
     *
     * @param stopAt id that is recorded but not expanded, or {@code null} for no limit
     */
    private static Set<Integer> reachableFrom(Map<Integer, List<Integer>> edges, Set<Integer> startIds, Integer stopAt) {
        Set<Integer> found = new HashSet<>();
        Deque<Integer> pending = new ArrayDeque<>(startIds);
        while (!pending.isEmpty()) {
            List<Integer> targets = edges.get(pending.poll());
            if (targets == null) {
                continue;
            }
            for (Integer target : targets) {
                // found.add is false for ids seen before, which prevents re-expansion.
                if (found.add(target) && !target.equals(stopAt)) {
                    pending.add(target);
                }
            }
        }
        return found;
    }

    /**
     * Demo: prints the descendants of department 37 (without, then with 37 itself),
     * followed by its ancestors (without, then with 37 itself).
     */
    public static void main(String[] args) {
        List<Department> list = new ArrayList<>(Arrays.asList(
                new Department(1, 0, "总部"),
                new Department(2, 0, "分部2"),
                new Department(3, 0, "分部3"),
                new Department(4, 1, "分部4"),
                new Department(5, 1, "分部5子部1"),
                new Department(6, 1, "分部2子部2"),
                new Department(7, 2, "分部3子部1"),
                new Department(8, 2, "分部3子部1"),
                new Department(9, 2, "分部3子部1"),
                new Department(10, 3, "分部3子部1"),
                new Department(11, 3, "分部3子部1"),
                new Department(12, 3, "分部3子部1"),
                new Department(13, 4, "分部3子部1"),
                new Department(14, 4, "分部3子部1"),
                new Department(15, 5, "分部3子部1"),
                new Department(16, 6, "分部3子部1"),
                new Department(17, 6, "分部3子部1"),
                new Department(18, 6, "分部3子部1"),
                new Department(19, 9, "分部3子部1"),
                new Department(20, 10, "分部3子部1"),
                new Department(21, 10, "分部3子部1"),
                new Department(22, 16, "分部3子部1"),
                new Department(23, 16, "分部3子部1"),
                new Department(24, 16, "分部3子部1"),
                new Department(25, 16, "分部3子部1"),
                new Department(26, 17, "分部3子部1"),
                new Department(27, 17, "分部3子部1"),
                new Department(28, 17, "分部3子部1"),
                new Department(29, 17, "分部3子部1"),
                new Department(30, 18, "分部3子部1"),
                new Department(31, 18, "分部3子部1"),
                new Department(32, 18, "分部3子部1"),
                new Department(33, 18, "分部3子部1"),
                new Department(34, 18, "分部3子部1"),
                new Department(35, 26, "分部3子部1"),
                new Department(36, 26, "分部3子部1"),
                new Department(37, 27, "分部3子部1"),
                new Department(38, 27, "分部3子部1"),
                new Department(39, 28, "分部3子部1"),
                new Department(40, 28, "分部3子部1"),
                new Department(41, 30, "分部3子部1"),
                new Department(42, 30, "分部3子部1"),
                new Department(43, 31, "分部3子部1"),
                new Department(44, 32, "分部3子部1"),
                new Department(45, 33, "分部3子部1"),
                new Department(46, 33, "分部3子部1"),
                new Department(47, 34, "分部3子部1"),
                new Department(48, 35, "分部3子部1"),
                new Department(49, 37, "分部3子部1"),
                new Department(50, 44, "分部3子部1")));

        Set<Integer> set = new HashSet<>();
        set.add(37);

        Set<Integer> descendants = getChildrenDepartment(list, set);
        System.out.println(descendants);
        descendants.addAll(set);
        System.out.println(descendants);

        Set<Integer> ancestors = getParentDepartment(list, set, 37);
        System.out.println(ancestors);
        ancestors.addAll(set);
        System.out.println(ancestors);
    }
}
UDTF参考 scala
/**
 * Hive UDTF that expands a parent/child table into one row per (node, ancestor) pair.
 *
 * Input rows are `(id, parent)`. They are only buffered in `process`; all output is
 * emitted from `close`, once the whole input has been seen. Output columns are
 * `id`, `ancestor` and `level`, all strings, where level 0 is the node itself,
 * level 1 its parent, and so on.
 */
class ExpandTree2UDTF extends GenericUDTF {
  /** Inspectors for the two input columns, set by `initialize`. */
  var inputOIs: Array[PrimitiveObjectInspector] = Array.empty

  /** Buffered input: node id -> parent id (`None` when the parent column is NULL). */
  val tree: collection.mutable.Map[String, Option[String]] = collection.mutable.Map()

  /**
   * Validates the arguments and declares the three string output columns.
   *
   * @param args inspectors of the call arguments; exactly two primitives are expected
   * @throws org.apache.hadoop.hive.ql.exec.UDFArgumentException on a wrong argument
   *         count or a non-primitive argument
   */
  override def initialize(args: Array[ObjectInspector]): StructObjectInspector = {
    if (args.length != 2) {
      throw new org.apache.hadoop.hive.ql.exec.UDFArgumentException(
        s"ExpandTree2UDTF takes exactly two arguments (id, parent), got ${args.length}")
    }
    inputOIs = args.map {
      case oi: PrimitiveObjectInspector => oi
      case other =>
        throw new org.apache.hadoop.hive.ql.exec.UDFArgumentException(
          s"ExpandTree2UDTF takes primitive arguments, got ${other.getTypeName}")
    }
    val fieldNames = java.util.Arrays.asList("id", "ancestor", "level")
    val fieldOI = primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector.asInstanceOf[ObjectInspector]
    val fieldOIs = java.util.Arrays.asList(fieldOI, fieldOI, fieldOI)
    ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs)
  }

  /**
   * Buffers one `(id, parent)` row. Rows whose id is NULL are ignored, because a
   * NULL id cannot be addressed as a node.
   */
  override def process(record: Array[Object]): Unit = {
    def column(index: Int): Option[String] =
      Option(inputOIs(index).getPrimitiveJavaObject(record(index))).map(_.toString)

    column(0).foreach { id => tree += (id -> column(1)) }
  }

  /** Emits one `(id, ancestor, level)` row for every buffered node and each node on its upward path. */
  override def close(): Unit = {
    tree.keys.foreach { id =>
      pathToRoot(id).zipWithIndex.foreach { case (ancestor, level) =>
        // Every column is declared as a string, so the level must be forwarded as text.
        forward(Array[AnyRef](id, ancestor, level.toString))
      }
    }
  }

  /**
   * Returns `id` followed by its chain of ancestors.
   *
   * The walk ends at a node with no parent, at a parent that never appeared as an id
   * in the input (that parent is still listed as the last ancestor), or when a node
   * would repeat, so cyclic data cannot loop forever.
   */
  private def pathToRoot(id: String): List[String] = {
    @scala.annotation.tailrec
    def climb(current: String, seen: Set[String], reversedPath: List[String]): List[String] =
      tree.get(current).flatten match {
        case Some(parent) if !seen(parent) => climb(parent, seen + parent, parent :: reversedPath)
        case _                             => reversedPath
      }

    climb(id, Set(id), List(id)).reverse
  }
}