TinkerPop提供以Spark为引擎批量检索图数据:
实现思路:
检索hbase数据 --》 构建graphRDD --》 缓存rdd到内存 --》 获取缓存rdd数据(sc.getPersistentRDDs) --》 其他操作;
1,配置文件:
配置信息管理类:org.apache.tinkerpop.gremlin.hadoop.Constants
####################################################
#### Gremlin hadoop configuration ####
####################################################
gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph;读取数据后,构建为HadoopGraph对象,基于该图对象后续操作
gremlin.hadoop.graphReader=org.janusgraph.hadoop.formats.hbase.HBaseInputFormat;//读取Hbase数据
gremlin.hadoop.graphWriter=org.apache.tinkerpop.gremlin.spark.structure.io.PersistedOutputRDD; //图RDD缓存到内存中;
gremlin.hadoop.defaultGraphComputer=org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer;//指定默认的图计算引擎
gremlin.hadoop.inputLocation=none //设置如果加载本地数据文件夹,文件路径
gremlin.hadoop.outputLocation=output //设置缓存到本地或hdfs时,文件路径
gremlin.hadoop.jarsInDistributedCache=true //设置spark任务依赖jar文件是否缓存
gremlin.spark.persistContext=false //设置sparkContext 是否缓存,true则缓存,false则退出
gremlin.spark.graphStorageLevel=MEMORY_AND_DISK //设置graph存储时等级
#gremlin.spark.persistStorageLevel=MEMORY_ONLY //设置graph缓存时等级
####################################################
#### JanusGraph Hbase Inputformat configuration ####
####################################################
janusgraphmr.ioformat.conf.storage.backend=hbase
janusgraphmr.ioformat.conf.storage.hostname=10.121.71.146
janusgraphmr.ioformat.conf.storage.hbase.ext.zookeeper.znode.parent=/hbase-secure
janusgraphmr.ioformat.conf.storage.hbase.table=default_gods_graph //设置读取hbase表名称
####################################################
#### build spark computer configuration ####
####################################################
spark.master=yarn
spark.submit.deployMode=cluster
spark.driver.cores=1
spark.driver.memory=1
spark.executor.memory=2
spark.executor.cores=1
spark.executor.instances=3
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.kryo.registrator=org.apache.tinkerpop.gremlin.spark.structure.io.gryo.GryoRegistrator
....
2,构建HadoopGraph对象
//加载配置文件信息
public Graph generateGraph(FilePath:String){
Configuration configuration = super.getBaseConfiguration();
Graph graph = GraphFactory.open(configuration);
return graph;
}
3,构建TraversalVertexProgram
//设置limit(1).count()设置写入driver累加器中的数据量大小;
//GraphTraversal.Admin:触发数据检索,返回结果无实际意义
public TraversalVertexProgram generateVertexProgram(){
GraphTraversal.Admin[Veretex,Long] traversal = graph.traversal().withComputer(classOf[SparkGraphComputer]).V().limit(1).count().asAdmin()
TraversalVertexProgram vertexProgram = TraversalVertexProgram.build().traversal(traversal).create(graph)
return vertexProgram
}
4,构建GraphComputer
GraphComputer computer = graph.compute(classOf[SparkGraphComputer])
//.skipGraphCache() 设置是否跳过Graph缓存 可选
//.skipPartitioner() 设置是否跳过重新分区 可选
.result(GraphComputer.ResultGraph.NEWS)) //设置ResultGraph返回是新图还是原图:new 返回新图,original:返回原图
.persist(GraphComputer.Persist.EDGES) //设置缓存Graph时,缓存数据:NOTHING:不缓存、VERTEXS:
.program(vertexProgram)
.workers(Int) //设置线程数,该参数设置GraphRdd partion数
//其他参数可选
.edges(edgeTraversal) //边过滤条件;需要Traversal<Vertex,Edge>对象
.vertices(vertexTraversal)//点过滤条件;需要Traversal<Vertex,Vertex>对象
//构建边过滤条件对象,通过ScriptTraversal对象,通过gremlin语句构建,增加灵活性
String gremlinSql = "__.bothE().hasLabel('god')"; //该gremlin语句,必须以__开头
Traversal<Vertex,Edge> edgeTraversal = new ScriptTraversal[Vertex,Edge](graph.traversal,"gremlin-groovy",gremlinSql)
//构建点过滤条件对象
String vertexGremlinSql = "__.V().hasLabel('god')"
Traversal<Vertex,Vertex> vertexTraversal = new ScriptTraversal[Vertex,Veretex](graph.traversal,"gremlin-groovy",vertexGremlinSql)
5,提交任务,并获取数据,进行转换
//提交任务,并获取返回结果
ComputerResult result = computer.submit().get()
//从内存中读取缓存RDD,并读取数据
//读取返回RDD时,返回值时Map,在这里获取第一个值,并获取value,value即为缓存的RDD
RDD[_] persistedRdd = sparkContext.getPersistentRDDs.head._2
//从RDD中获取数据,进行转换
//点转换缓存RDD结构为[Tuple2[vertexID,Veretex]];顶点id,顶点对象[顶点,边对象]
val vertexRdd = persistedRdd.map(value =>{
val vertexId:Long = value.asInstanceOf[Tuple2[Long,VertexWritable]]._1
(vertexId,1)
})
val edgeRdd = persistedRdd.flatMap(value =>{
//edges时,涉及 IN:入边、OUT:出边、BOTH:出边和入边
val edges = value.asInstanceOf[Tuple[Int,VertexWritable]]._2.asInstanceOf[VertexWritable].get().edges(Direction.In)
edges.asScala.map(edgeTmp=>{
val srcId = edgeTmp.outVertex().id().asInstanceOf[Long]
val dstId = edgeTimp.inVertex().id().asInstanceOf[Long]
graphx.Edge(srcId,dstId,"")
})
})
//基于以上边RDD和点RDD,可构建GraphX支持的图结构进行分析
补充: 该方式依赖jar包:
javatuples-1.2.jar
commons-pool2-2.6.2.jar
commons-configuration-1.10.jar
gremlin-core-3.4.6.jar
hadoop-gremlin-3.4.6.jar
spark-gremlin-3.4.6.jar
gremlin-shaded-3.4.6.jar
gremlin-groovy-3.4.6.jar
tinkergraph-gremlin-3.4.6.jar
janusgraph-hadoop-0.5.2.jar
janusgraph-core-0.5.2.jar
janusgraph-driver-0.5.2.jar
janusgraph-hbase-0.5.2.jar
janusgraph-es-0.5.2.jar
hbase-shaded-mapreduce-2.1.5.jar
reflections8-0.11.7.jar
elasticsearch-rest-client-6.8.8.jar
httpasyncclient-4.1.4.jar
httpcore-nio-4.4.13.jar
high-scale-lib-1.1.4.jar
spatial4j-0.7.jar
jts-core-1.16.1.jar
groovy-2.5.7-indy.jar
groovy-jsr223-2.5.7-indy.jar
caffeine-2.8.2.jar