Requirements:
The data privacy policy requires that only the first 10 characters of a person's name be shown. In the following example, I select all columns, but the name column contains only the first 10 characters.
select * from employee limit 10;
OK
10001 Georgi Fac 88958.0
10002 Bezalel Si 72527.0
10003 Parto Bamf 43699.0
10004 Chirstian 74057.0
10005 Kyoichi Ma 94692.0
10006 Anneke Pre 60098.0
10007 Tzvetan Zi 88070.0
10008 Saniya Kal 52668.0
10009 Sumant Pea 94443.0
class ColumnMaskAuthorizerFactory
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.security.HiveAuthenticationProvider;
import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthorizer;
import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthorizerFactory;
import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthzPluginException;
import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthzSessionContext;
import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveMetastoreClientFactory;
/**
 * {@link HiveAuthorizerFactory} implementation that always hands out a
 * {@link ColumnMaskAuthorizer}.
 */
public class ColumnMaskAuthorizerFactory implements HiveAuthorizerFactory {

    /**
     * Creates the authorizer used for column masking.
     *
     * <p>None of the arguments are consulted; a fresh {@link ColumnMaskAuthorizer}
     * is built on every call.
     *
     * @param metastoreClientFactory unused
     * @param conf                   unused
     * @param hiveAuthenticator      unused
     * @param ctx                    unused
     * @return a new {@link ColumnMaskAuthorizer}
     * @throws HiveAuthzPluginException declared by the interface; never thrown here
     */
    @Override
    public HiveAuthorizer createHiveAuthorizer(
            HiveMetastoreClientFactory metastoreClientFactory,
            HiveConf conf,
            HiveAuthenticationProvider hiveAuthenticator,
            HiveAuthzSessionContext ctx) throws HiveAuthzPluginException {
        final HiveAuthorizer authorizer = new ColumnMaskAuthorizer();
        return authorizer;
    }
}
class ColumnMaskAuthorizer
/**
 * {@link HiveAuthorizer} that attaches cell-value transformers to the columns of
 * table/view objects and reports which objects need to be rewritten.
 */
public class ColumnMaskAuthorizer implements HiveAuthorizer {
    // omit unimplemented methods

    /**
     * Attaches column transformers to each given object and returns the objects
     * that need rewriting.
     *
     * @param queryContext context of the current query, forwarded to the per-column check
     * @param hiveObjs     objects referenced by the query; may be null or empty
     * @return the objects for which at least one column was reported as transformed;
     *         empty when {@code hiveObjs} is null or empty
     * @throws SemanticException propagated from the per-column check
     */
    @Override
    public List<HivePrivilegeObject> applyRowFilterAndColumnMasking(HiveAuthzContext queryContext, List<HivePrivilegeObject> hiveObjs) throws SemanticException {
        List<HivePrivilegeObject> transformedObjs = new ArrayList<HivePrivilegeObject>();
        if (CollectionUtils.isNotEmpty(hiveObjs)) {
            for (HivePrivilegeObject candidate : hiveObjs) {
                if (attachColumnTransformers(queryContext, candidate)) {
                    transformedObjs.add(candidate);
                }
            }
        }
        return transformedObjs;
    }

    /**
     * Builds the transformer list for one object and stores it on that object.
     *
     * @return true when at least one column of the object was reported as transformed
     */
    private boolean attachColumnTransformers(HiveAuthzContext queryContext, HivePrivilegeObject hiveObj) throws SemanticException {
        // An object without an explicit type is handled as a table or view.
        HivePrivilegeObject.HivePrivilegeObjectType objType = hiveObj.getType() == null
                ? HivePrivilegeObject.HivePrivilegeObjectType.TABLE_OR_VIEW
                : hiveObj.getType();
        if (objType != HivePrivilegeObject.HivePrivilegeObjectType.TABLE_OR_VIEW) {
            return false;
        }

        List<String> columns = hiveObj.getColumns();
        if (!CollectionUtils.isNotEmpty(columns)) {
            return false;
        }

        String dbName = hiveObj.getDbname();
        String tableName = hiveObj.getObjectName();
        List<String> transformers = new ArrayList<String>();
        boolean anyTransformed = false;
        for (String columnName : columns) {
            // Non-short-circuit OR: every column must contribute an entry to the list.
            anyTransformed |= addCellValueTransformerAndCheckIfTransformed(queryContext, dbName, tableName, columnName, transformers);
        }
        hiveObj.setCellValueTransformers(transformers);
        return anyTransformed;
    }

    /**
     * @return always {@code true}
     */
    @Override
    public boolean needTransform() {
        return true;
    }
}
addCellValueTransformerAndCheckIfTransformed transforms only the column default.employee.name; every other column is passed through unchanged.
/**
 * Appends the transformer expression for one column to {@code columnTransformers}
 * and reports whether that column is actually masked.
 *
 * <p>Only the {@code name} column of {@code default.employee} is masked, using
 * {@code substring(name, 1, 10)}; any other column contributes its own name
 * as the expression.
 *
 * @param context            query context; not used by this method
 * @param databaseName       database of the object being checked
 * @param tableOrViewName    table or view of the object being checked
 * @param columnName         column being checked
 * @param columnTransformers list that receives exactly one new entry per call
 * @return true when the column was masked, false when it was passed through
 * @throws SemanticException declared for callers; not thrown by this body
 */
private boolean addCellValueTransformerAndCheckIfTransformed(HiveAuthzContext context, String databaseName, String tableOrViewName, String columnName, List<String> columnTransformers) throws SemanticException {
    final boolean isMaskedColumn = "default".equals(databaseName)
            && "employee".equals(tableOrViewName)
            && "name".equals(columnName);

    // One entry is always appended so the list gets an entry for every column checked.
    columnTransformers.add(isMaskedColumn ? "substring(name, 1, 10)" : columnName);
    return isMaskedColumn;
}
hive-site.xml Configuration
<property>
<name>hive.security.authorization.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.security.authorization.manager</name>
<value>xxx.ColumnMaskAuthorizerFactory</value>
</property>
With this configuration in place, the Hive framework rewrites the original query string.
- Original Query String
select * from employee limit 10;
- Rewritten Query String
select `employee`.`id`, `employee`.`name`, `employee`.`salary`
from (
SELECT
`id`, CAST(substring(name, 1, 10) AS string) AS `name`, `salary`, BLOCK__OFFSET__INSIDE__FILE, INPUT__FILE__NAME, ROW__ID
FROM `default`.`employee`
)
`employee` limit 10
Original query does not include the private column
If I select only the id column, like this:
select id from employee limit 10;
- The rewritten query string
select id from (SELECT `id`, CAST(substring(name, 1, 10) AS string) AS `name`, `salary`, BLOCK__OFFSET__INSIDE__FILE, INPUT__FILE__NAME, ROW__ID FROM `default`.`employee` )`employee` limit 10