文本敏感词校验
背景: 用户在社区发布或分享的文章、评论需要合法合规,不能包含涉政等违规的敏感词.
使用阿里云的内容审核
准备工作
- 阿里云账号,需要开通敏感词服务。注意:仅有 accessKeyId 和 accessKeySecret 是不行的,必须先开通该服务
- 文档地址: https://help.aliyun.com/document_detail/70439.html?spm=a2c4g.11186623.6.701.1cea2188bMuZUi
- 调试地址:https://next.api.aliyun.com/api/Green/2018-05-09/TextScan
java代码(做一次笔记吧qaq)
- 依赖
<dependency>
<groupId>com.aliyun</groupId>
<artifactId>aliyun-java-sdk-core</artifactId>
<version>4.5.16</version>
</dependency>
- 代码
@Slf4j
@Component
public class AliYunWordFilterHandler {
private static final String region = "cn-beijing";
@Value("${aliyun.oss.accessKeyId}")
private String accessKeyId;
@Value("${aliyun.oss.accessKeySecret}")
private String accessKeySecret;
/**
 * Builds an Aliyun SDK client from the configured region and access-key pair.
 *
 * <p>A new client instance is constructed on every call.
 *
 * @return a {@link DefaultAcsClient} backed by the default profile for
 *         {@code region}, {@code accessKeyId} and {@code accessKeySecret}
 */
private IAcsClient getClient() {
    return new DefaultAcsClient(DefaultProfile.getProfile(region, accessKeyId, accessKeySecret));
}
/**
 * Creates the base request used for the text-scan call: HTTPS POST against the
 * Beijing Green endpoint, API version 2018-05-09, with a JSON content type.
 *
 * @return a pre-configured {@link CommonRequest} without a body
 */
private CommonRequest getDefaultCommonRequest() {
    // Shared by the connect and read timeouts (presumably milliseconds; confirm against the SDK docs).
    final int timeout = 6000;

    CommonRequest scanRequest = new CommonRequest();

    // Transport and endpoint.
    scanRequest.setProtocol(ProtocolType.HTTPS);
    scanRequest.setMethod(MethodType.POST);
    scanRequest.setDomain("green.cn-beijing.aliyuncs.com");
    scanRequest.setVersion("2018-05-09");
    // Spam scan path; the original note lists "/green/text/feedback" as the spam-feedback path.
    scanRequest.setUriPattern("/green/text/scan");

    // Headers.
    scanRequest.putHeadParameter("Content-Type", "application/json");

    // Timeouts.
    scanRequest.setSysConnectTimeout(timeout);
    scanRequest.setSysReadTimeout(timeout);
    return scanRequest;
}
/**
 * Wraps the given texts into the request payload as a plain {@link Map}.
 *
 * <p>The payload has two keys: {@code "scenes"} (a single-element list containing
 * {@code "antispam"}) and {@code "tasks"} (one map per text, holding a random
 * {@code "dataId"} and the text itself under {@code "content"}).
 *
 * @param tasks texts to be scanned; the original note says each must not exceed 10000 characters
 * @return the request payload
 */
private Map<String, Object> getExecuteMap(List<String> tasks) {
    Map<String, Object> payload = new HashMap<>(2);
    List<Map<String, Object>> taskEntries = new ArrayList<>();

    // Detection scene: "antispam" is the value used for text spam detection.
    payload.put("scenes", Collections.singletonList("antispam"));
    payload.put("tasks", taskEntries);

    for (String text : tasks) {
        Map<String, Object> entry = new HashMap<String, Object>();
        String dataId = UUID.randomUUID().toString();
        entry.put("dataId", dataId);
        entry.put("content", text);
        taskEntries.add(entry);
    }
    return payload;
}
/**
 * Wraps the given texts into the request payload as a {@link JSONObject}.
 *
 * <p>Same layout as the map-based variant: {@code "scenes"} holds a single-element
 * list containing {@code "antispam"}, and {@code "tasks"} holds one object per text
 * with a random {@code "dataId"} and the text under {@code "content"}.
 *
 * @param tasks texts to be scanned; the original note says each must not exceed 10000 characters
 * @return the request payload
 */
private JSONObject getExecuteJSONObject(List<String> tasks) {
    JSONObject payload = new JSONObject();
    JSONArray taskEntries = new JSONArray();

    // Detection scene: "antispam" is the value used for text spam detection.
    payload.put("scenes", Collections.singletonList("antispam"));
    payload.put("tasks", taskEntries);

    for (String text : tasks) {
        JSONObject entry = new JSONObject();
        String dataId = UUID.randomUUID().toString();
        entry.put("dataId", dataId);
        entry.put("content", text);
        taskEntries.add(entry);
    }
    return payload;
}
/**
 * Submits a batch of texts to the Aliyun text-scan API and converts every scene
 * result into an {@link AuditInfo}.
 *
 * <p>Per the original note, the API accepts at most 100 texts per batch and each
 * text must not exceed 10000 characters; this method does not enforce those limits.
 *
 * <p>The returned list contains one entry per scene result of every task that the
 * service processed successfully. Tasks that failed, tasks without a label, and any
 * request-level failure contribute no entries, so the list may be shorter than
 * {@code content} (and is empty when the call itself fails).
 *
 * @param content texts to check; {@code null} or empty yields an empty result without a remote call
 * @return audit results for the successfully processed texts, never {@code null}
 */
@TimerLog
public List<AuditInfo> textReviews(List<String> content) {
    List<AuditInfo> result = new LinkedList<>();
    if (content == null || content.isEmpty()) {
        // Nothing to scan, so skip the remote call entirely.
        return result;
    }

    IAcsClient client = getClient();
    CommonRequest request = getDefaultCommonRequest();
    // Serialize once and reuse the string for both the log line and the request body.
    String requestJson = JSONUtils.toString(getExecuteMap(content));
    log.info("阿里敏感词检测:[start]:\n {}", requestJson);
    request.setHttpContent(requestJson.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8.name(), JSON);

    try {
        CommonResponse response = client.getCommonResponse(request);
        log.info("阿里敏感词检测:[end]:\n {}", response.getData());

        HttpResponse httpResponse = response.getHttpResponse();
        if (!httpResponse.isSuccess()) {
            log.error("阿里敏感词检测:HTTP 请求失败 status:{}", httpResponse.getStatus());
            return result;
        }
        log.info("敏感词效验 成功");

        String responseContent = new String(httpResponse.getHttpContent(), StandardCharsets.UTF_8);
        JSONObject scrResponse = JSONUtils.parseObject(responseContent, JSONObject.class);
        // Compare through the boxed value so a missing "code" cannot trigger an unboxing NPE.
        Integer responseCode = scrResponse == null ? null : scrResponse.getInteger("code");
        if (responseCode == null || responseCode != 200) {
            log.error("检测状态失败 code:{}", responseCode);
            return result;
        }

        JSONArray taskResults = scrResponse.getJSONArray("data");
        if (taskResults == null) {
            log.error("阿里敏感词检测:响应缺少 data 字段");
            return result;
        }

        for (int i = 0; i < taskResults.size(); i++) {
            JSONObject taskResultObj = taskResults.getJSONObject(i);
            Integer taskCode = taskResultObj.getInteger("code");
            if (taskCode == null || taskCode != 200) {
                log.error("阿里敏感词检测:任务处理失败 code:{}", taskCode);
                continue;
            }

            // Present when the text hit a custom keyword library; hits are replaced with '*'.
            String filteredContent = taskResultObj.getString("filteredContent");
            JSONArray sceneResults = taskResultObj.getJSONArray("results");
            if (sceneResults == null) {
                continue;
            }

            for (int j = 0; j < sceneResults.size(); j++) {
                JSONObject sceneResult = sceneResults.getJSONObject(j);
                String label = sceneResult.getString("label");
                if (label == null) {
                    log.error("阿里敏感词检测:结果缺少 label 字段");
                    continue;
                }
                // A missing rate is treated as full confidence so that threshold-gated
                // blocking labels are still blocked rather than silently passed.
                Double rate = sceneResult.getDouble("rate");
                double confidence = rate == null ? 100.0 : rate;

                // One AuditInfo per scene result, so list entries never alias each other.
                AuditInfo auditInfo = new AuditInfo();
                auditInfo.setContent(filteredContent);
                result.add(convertMsg(label, auditInfo, confidence));
            }
        }
    } catch (ClientException e) {
        // Pass the exception to the logger so the stack trace reaches the log output.
        log.error("请求调用失败,检查是否是超时", e);
    }
    return result;
}
/**
 * Translates a scan label and its confidence into a verdict on the given audit object.
 *
 * <p>The audit starts as passed with message "审核正常". Labels then adjust it:
 * <ul>
 *   <li>{@code politics}, {@code terrorism}, {@code contraband}: always rejected.</li>
 *   <li>{@code abuse} (rate &gt; 70) and {@code porn} (rate &gt; 90): rejected above the threshold.</li>
 *   <li>{@code spam}, {@code ad} (rate &gt; 50), {@code qrcode} (rate &gt; 60), {@code flood},
 *       {@code meaningless} (rate &gt; 95): still passed, but the message names the category.</li>
 *   <li>{@code normal}: unchanged.</li>
 *   <li>any other label: passed with message "自定义".</li>
 * </ul>
 *
 * @param label category label returned by the scan
 * @param audit audit object to update
 * @param rate  confidence score compared against the per-label thresholds
 * @return the same {@code audit} instance, updated in place
 */
private AuditInfo convertMsg(String label, AuditInfo audit, double rate) {
    // Baseline verdict, applied before the label is inspected.
    audit.setResult(true);
    audit.setMsg("审核正常");

    boolean allowed = true;
    String reason = null; // null means "keep the baseline verdict"

    switch (label) {
        case "normal":
            break;

        // Always rejected, regardless of confidence.
        case "politics":
            allowed = false;
            reason = "涉政";
            break;
        case "terrorism":
            allowed = false;
            reason = "暴恐";
            break;
        case "contraband":
            allowed = false;
            reason = "违禁";
            break;

        // Rejected only above a confidence threshold.
        case "abuse":
            if (rate > 70.0) {
                allowed = false;
                reason = "辱骂";
            }
            break;
        case "porn":
            if (rate > 90.0) {
                allowed = false;
                reason = "色情";
            }
            break;

        // Allowed, but annotated above a confidence threshold.
        case "spam":
            if (rate > 50.0) {
                reason = "含垃圾信息";
            }
            break;
        case "ad":
            if (rate > 50.0) {
                reason = "广告";
            }
            break;
        case "qrcode":
            if (rate > 60.0) {
                reason = "二维码";
            }
            break;
        case "flood":
            if (rate > 95.0) {
                reason = "灌水";
            }
            break;
        case "meaningless":
            if (rate > 95.0) {
                reason = "无意义";
            }
            break;

        default:
            reason = "自定义";
            break;
    }

    if (reason != null) {
        audit.setResult(allowed);
        audit.setMsg(reason);
    }
    return audit;
}
/**
 * Result of auditing one piece of text.
 *
 * @author: craywen
 * @date: 2021-05-25 14:23
 * @desc: audit result
 */
@Data
public class AuditInfo {
/**
 * Audit verdict. In this file convertMsg sets it to false for labels that are
 * blocked and to true otherwise.
 */
private boolean result;
/**
 * Message describing the verdict (for example the matched category).
 */
private String msg;
/**
 * Content. In this file textReviews fills it from the "filteredContent" field
 * of the scan response.
 */
private String content;
}