spark链接ldap获取数据
- 依赖
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-distcp</artifactId>
<version>2.6.0-cdh5.5.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-maven-plugins</artifactId>
<version>2.6.0-cdh5.5.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.1.1</version>
<type>jar</type>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.9</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.4</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.4</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.4</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.0.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.25</version>
<scope>provided</scope>
</dependency>
- MySQL工具类
package com.lenovo.Utils
class MysqlSession {
//云测试环境
val url = "jdbc:mysql://10.122.64.84:3306/test?rewriteBatchedStatements=true&useUnicode=true&characterEncoding=UTF-8"
val user = "root"
val password = "test"
val driver = "com.mysql.jdbc.Driver"
}
- 代码
package com.lenovo.ldap
import java.text.SimpleDateFormat
import java.util
import java.util.{Date, Properties}
import com.lenovo.Utils.MysqlSession
import javax.naming.directory.{Attribute, SearchControls}
import javax.naming.{AuthenticationException, Context, NamingEnumeration, NamingException}
import javax.naming.ldap._
import org.apache.commons.codec.binary.Base64
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
object GetDataFromLDAP {
def main(args: Array[String]): Unit = {
val ss = SparkSession
.builder()
.master("yarn")
.appName("GetDataFromLDAP")
//.config("spark.sql.warehouse.dir", "file:///E://lenovo_pj//upp//upp_big_data//upp//recommend")
.getOrCreate()
val ms = new MysqlSession
val tb_pw = "ad_user_ldap_password"
val conn = Map("url" -> ms.url, "dbtable" -> tb_pw, "user" -> ms.user, "password" -> ms.password, "driver" -> ms.driver)
val ldapPW = ss.read.options(conn).format("jdbc").load().head().get(0).toString
val ldapUser = "ladp01"
val ldapContext = ldapConnect(ldapUser,ldapPW)
//ldap搜索字段
val searchWords = Array(
"name" ,"l" ,"c",
"mail" ,"title" ,"displayName",
"employeeType" ,"department" ,"manager",
"physicalDeliveryOfficeName" ,"mobile" ,"telephoneNumber",
"pwdLastSet" ,"whenCreated" ,"accountExpires",
"userAccountControl" ,"company" ,"otherPager",
"memberOf" ,"msRTCSIP-PrimaryUserAddress" ,"userPrincipalName",
"msExchExtensionAttribute16" ,"thumbnailPhoto"
)
//
//mysql数据库除了ou之外所有的column名字
val mysqlColumn = Array(
"itcode" ,"city" ,"country",
"mail" ,"title" ,"display_name",
"employee_type" ,"department" ,"manager",
"office_address" ,"mobile_phone" ,"work_phone",
"pwd_last_set" ,"when_created" ,"account_expires",
"user_account_control" ,"company" ,"external_email",
"member_of" ,"msrtcsip_primary_user_address","user_principal_name",
"local_name" ,"thumbnail_photo"
)
//
val fields = new ListBuffer[StructField]()
for(col <- mysqlColumn){
fields += StructField(col,StringType)
}
fields += StructField("ou",StringType)
val schema = StructType(fields)
val prop = new Properties()
prop.setProperty("user",ms.user)
prop.setProperty("password",ms.password)
prop.setProperty("driver",ms.driver)
prop.setProperty("charset","UTF-8")
//用户搜索的过滤表达式(所有用户账号)
val filter = "sAMAccountName=*"
//ldap搜索的根目录名(ou)在职员工
val ouArr = Array("AG","AP","China","EMEA","Motorola")
//ldap搜索的根目录名(ou)离职员工
//val ouArr = Array("disabled accounts")
//遍历ou读取数据并写到MySQL数据库
//目录树查询参数设置
val searchControls = new SearchControls
//设置搜索范围(所有目录树的子树)
searchControls.setSearchScope(SearchControls.SUBTREE_SCOPE)
//设置搜索字段
searchControls.setReturningAttributes(searchWords)
val baseArr = ArrayBuffer[Map[String, Any]]()
for(ou <- ouArr){
//用户目录com/lenovo/User Accounts/+ ou 在职员工
val baseDir = "OU=" + ou +",OU=User Accounts,DC=lenovo,DC=com"
//CN=0065lo,OU=disabled accounts,DC=lenovo,DC=com 离职员工
//val baseDir = "OU=" + ou +",DC=lenovo,DC=com"
val pageSize = 1000
var cookie: Array[Byte] = null
var num = 0
//循环检索数据页
try {
//分页读取请求控制
ldapContext.setRequestControls(Array[Control](new PagedResultsControl(pageSize,Control.CRITICAL)))
do {
val results = ldapContext.search(baseDir, filter, searchControls)
//循环检索每一页数据
while (results != null && results.hasMore) {
num = num + 1
val attributes = results.next().getAttributes
if (attributes.size() > 0) {
val all: NamingEnumeration[_ <: Attribute] = attributes.getAll
//循环检索每一行数据所有属性
baseArr += lineForeach(all)
}
if(num >= 2000){
val value = ss.sparkContext
.parallelize(baseArr)
.map(m => {
val arr = ArrayBuffer[String]()
for (word <- searchWords) {arr += m.getOrElse(word,"").toString}
arr += ou
Row.fromSeq(arr)
})
ss.createDataFrame(value,schema)
.write
.mode(SaveMode.Append)
.jdbc(ms.url,"ad_user_ldap",prop)
//重设参数
num = 0
//清空数组
baseArr.clear()
}
}
cookie = checkCookie(ldapContext.getResponseControls)
ldapContext.setRequestControls(Array[Control](new PagedResultsControl(pageSize,cookie,Control.CRITICAL)))
} while (cookie != null && cookie.length != 0)
} catch {
case e:Exception => e.printStackTrace()
}
val value = ss.sparkContext
.parallelize(baseArr)
.map(m => {
val arr = ArrayBuffer[String]()
for (word <- searchWords) {arr += m.getOrElse(word,"").toString}
arr += ou
Row.fromSeq(arr)
})
ss.createDataFrame(value,schema)
.write
.mode(SaveMode.Append)
.jdbc(ms.url,"ad_user_ldap",prop)
//清空数组
baseArr.clear()
}
//关闭ldap链接
ldapContext.close()
ss.stop()
}
/**
* 建立LDAP链接
* @param user ldap账号
* @param password ldap账号密码
* @return LdapContext(LDAP链接)
*/
def ldapConnect(user: String, password: String):LdapContext = {
val url = "ldap://lenovo.com"
val factory = "com.sun.jndi.ldap.LdapCtxFactory"
val env = new util.Hashtable[String,String]()
env.put(Context.INITIAL_CONTEXT_FACTORY, factory)
env.put(Context.SECURITY_AUTHENTICATION, "simple")
env.put(Context.PROVIDER_URL, url)
env.put(Context.SECURITY_PRINCIPAL, user)
env.put(Context.SECURITY_CREDENTIALS, password)
env.put(Context.BATCHSIZE, "1000")
var ldapContext: LdapContext = null
try {
ldapContext = new InitialLdapContext(env, null)
println("ldap connect success")
} catch {
case _:AuthenticationException => println("ldap认证失败")
case _:NamingException => println("ldap参数有误导致连接失败")
}
ldapContext
}
/**
* 循环检索每一行数据所有属性
* @param attrAll ldap属性数组
* @return
*/
def lineForeach(attrAll: NamingEnumeration[_ <: Attribute]):Map[String,Any]={
val sdf_from = new SimpleDateFormat("yyyyMMddhhmmss")
val sdf = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss")
var row:Map[String,Any] = Map()
while (attrAll.hasMore) {
val attr = attrAll.next()
val key = attr.getID
var value = attr.get()
if ("manager".equals(key)) {
value = value.toString.split(",")(0).split("=")(1)
} else if ("whenCreated".equals(key)) {
value = sdf.format(sdf_from.parse(value.toString.substring(0, 14)))
} else if ("pwdLastSet".equals(key)) {
value = timeChange(value.toString, sdf)
} else if ("accountExpires".equals(key)) {
if (value.toString.length == 18) value = timeChange(value.toString, sdf)
} else if ("memberOf".equals(key)) {
val all = attr.getAll
var v = ""
while (all != null && all.hasMore) {
if ("".equals(value)) {
v = all.next().toString
} else {
v += "|" + all.next().toString
}
}
value = v
}else if("thumbnailPhoto".equals(key)){
value = "data:image/jpeg;base64,"+bytesToBase64(value.asInstanceOf[Array[Byte]])
}
row += (key -> value.toString)
}
row
}
/**
* byte数组转换成十六进制(bytesToHexString)
* @param b_arr byte数组
* @return
*/
def bytesToBase64(b_arr:Array[Byte]): String ={
Base64.encodeBase64String(b_arr)
}
/**
* 用于判断分页查询生成的唯一标识cookie,进而判断是否查询完数据
* @param controls 分页控制cookie
* @return
*/
def checkCookie(controls:Array[Control]):Array[Byte] ={
var cookie: Array[Byte] = null
if(controls.length != 0) for (c <- controls) {
if (c.isInstanceOf[PagedResultsResponseControl])
cookie = c.asInstanceOf[PagedResultsResponseControl].getCookie
}
cookie
}
/**
* 时间格式转换:
* ldap的时间戳是从1601年1月1日0时起经过的1E-7秒(即100纳秒)的个数(时间是GMT的,中国的北京的时间需要加上8个小时)
* 例如:以lastLogon、pwdLastSet、accountExpires等属性为代表(输出没有'Z'结尾)
* @param ldapTime ldap时间戳
* @param sdf 时间戳格式化格式
* @return
*/
def timeChange(ldapTime: String, sdf: SimpleDateFormat):String = {
val BJTime = ldapTime.toLong/10000 - 11644473600125L
sdf.format(new Date(BJTime))
}
}