本文接《基于机器学习的SNS隐私保护策略推荐向导的设计与实现》,详细解析基于机器学习的SNS隐私策略推荐向导分类器的C++及WEKA实现与评估结果,本文完整C++程序及JAVA工程下载链接见点击打开链接,对数据挖掘和SNS感兴趣的朋友可以下载跑一下,有任何问题欢迎交流:)
基于机器学习的SNS隐私策略推荐向导分类器的C++及WEKA实现与评估
1 SNS朋友数据预处理与统计
要实现对朋友访问权限的自动分类,首先需要对朋友的数据进行预处理。预处理主要包括向量化和格式化输出。格式化输出主要针对所使用的数据挖掘开源程序包:WWW’10原文在实验中采用的是RapidMiner,主要使用了其中朴素贝叶斯、决策树及KNN算法的实现;本文中SNS隐私向导分类器的实现则主要基于WEKA,它同样是非常著名的数据挖掘开源程序包。WEKA支持命令行、GUI、程序API等多种调用方式。为了让WEKA成功读取样本数据,首先需要了解WEKA对样本数据格式的规定。图7-1给出了本项目训练样本数据文件的格式,样本数据以WEKA可读取的ARFF格式文件保存。
SNS朋友向量化的Java实现如下:
package com.pku.yangliu;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
/**Compute the vector of friends in arff format
* @author yangliu
* @qq 772330184
* @mail yang.liu@pku.edu.cn
* @blog http://blog.csdn.net/yangliuy
*/
public class ComputeFriendsVector {
// Input directory: main() lists it and processes each entry as one data file.
public static String dataPath = "data/";
// Output location prefix: main() writes "vec_<input file name>.arff" under it.
public static String resPath = "friendvec/";
// Community file: main() reopens it and reads it from the start for every data file.
public static String communityFile = "friendvec/community.out.txt";
/**
 * Converts every regular file found under {@link #dataPath} into an ARFF file
 * named {@code vec_<input name>.arff} under {@link #resPath}.
 *
 * @param args command-line arguments; not used
 * @throws IOException if the data directory cannot be listed, or if reading an
 *         input file or writing an output file fails
 */
public static void main(String[] args) throws IOException {
    File[] dataFiles = new File(dataPath).listFiles();
    if (dataFiles == null) {
        // listFiles() returns null (not an empty array) when the path is not a
        // readable directory; fail with a clear message instead of an NPE.
        throw new IOException("Cannot list data directory: " + dataPath);
    }
    for (int i = 0; i < dataFiles.length; i++) {
        // Sub-directories cannot be opened as input streams, so skip them.
        if (!dataFiles[i].isFile()) {
            continue;
        }
        convertDataFileToArff(dataFiles[i]);
    }
    System.out.println("done");
}

/**
 * Writes the ARFF file for one data file and prints the birth years seen in it.
 *
 * <p>The first line of the data file is parsed as the user's own profile; every
 * later line is parsed as one friend and becomes one ARFF data row. Each friend
 * row is paired with the next line of {@link #communityFile}, whose first line
 * is skipped. All streams are closed even when processing fails part-way.
 *
 * @param dataFile the input file to convert
 * @throws IOException if reading the inputs or writing the output fails
 */
private static void convertDataFileToArff(File dataFile) throws IOException {
    // Distinct birth years of the friends; only used for the console summary below.
    HashSet<String> birthdays = new HashSet<String>();

    BufferedReader dataFileReader = new BufferedReader(
            new InputStreamReader(new FileInputStream(dataFile), "UTF-8"));
    try {
        BufferedReader communityFileReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(communityFile), "UTF-8"));
        try {
            String resFile = resPath + "vec_" + dataFile.getName() + ".arff";
            FileWriter resFileWriter = new FileWriter(resFile);
            try {
                resFileWriter.append("@relation " + dataFile.getName() + "_friends" + "\n\n");
                // Write the ARFF header before any data rows.
                writeArffHeader(resFileWriter);

                int count = 0;
                HashMap<String, String> userProfile = new HashMap<String, String>();

                // The first line of the community file is discarded: it describes
                // the user's own circles rather than a friend.
                communityFileReader.readLine();
                String communityLine = communityFileReader.readLine();

                String line;
                while ((line = dataFileReader.readLine()) != null) {
                    count++;
                    if (count == 1) {
                        System.out.print(count + " ");
                        userProfile = transToMap(line);
                        continue;
                    }
                    HashMap<String, String> friendProfile = transToMap(line);
                    // Collect every birth year that occurs among the friends.
                    birthdays = countBirthdays(birthdays, friendProfile);
                    String vecLine = generateVecLine(friendProfile, userProfile);
                    // NOTE(review): if the community file has fewer lines than the
                    // data file has friends, communityLine is null here and the
                    // text "null" ends up in the row - confirm inputs are aligned.
                    resFileWriter.append(vecLine + communityLine + "," + friendProfile.get("permission") + "\n");
                    System.out.println(vecLine + " haha " + communityLine + "," + friendProfile.get("permission"));
                    communityLine = communityFileReader.readLine();
                    System.out.print(count + " ");
                }
                resFileWriter.flush();
            } finally {
                resFileWriter.close();
            }
        } finally {
            communityFileReader.close();
        }
    } finally {
        dataFileReader.close();
    }

    System.out.println(birthdays.size());
    for (String birth : birthdays) {
        System.out.print(birth + ",");
    }
    System.out.println();
}
/**
 * Adds the birth year of one friend to the running set of years seen so far.
 *
 * <p>The year is the run of digits at the start of the profile's
 * {@code "birthday"} value, i.e. the text before the first non-digit
 * character. Nothing is added when the profile has no birthday, or when the
 * value does not start with a digit (for example {@code "unknown"}).
 *
 * @param birthdays set of distinct years collected so far; modified in place
 * @param friendProfile attributes of one friend, keyed by attribute name
 * @return the same {@code birthdays} instance that was passed in
 */
private static HashSet<String> countBirthdays(HashSet<String> birthdays, HashMap<String, String> friendProfile) {
    String birthday = friendProfile.get("birthday");
    if (birthday == null) {
        // Key absent, or present but mapped to null: there is no year to record.
        return birthdays;
    }
    // split() drops trailing empty strings, so a value made up only of
    // non-digits yields an empty array; guard before reading element 0.
    String[] parts = birthday.split("[^0-9]");
    if (parts.length > 0 && !parts[0].isEmpty()) {
        birthdays.add(parts[0]);
    }
    return birthdays;
}
/**Write the header of arff file
* @param resFileWriter
* @throws IOException
*/
private static void writeArffHeader(FileWriter resFileWriter) throws IOException {
// TODO Auto-generated method stub
resFileWriter.append("@attribute gender {0,1}\n");
resFileWriter.append("@attribute birthday numeric\n");
resFileWriter.append("@attribute hometown {0,1,2}\n");
resFileWriter.append("@attribute college {0,1}\n");
resFileWriter.append("@attribute highschool {0,1}\n");
resFileWriter.append("@attribute middleschool {0,1}\n");
resFileWriter.append("@attribute primaryschool {0,1}\n");
resFileWriter.append("@attribute G1 {0,1}\n");
resFileWriter.append("@attribute G2 {0,1}\n");
resFileWriter.append("@attribute G3 {0,1}\n");
resFileWriter.append("@attribute G4 {0,1}\n");
resFileWriter.append(