最近参加华为的比赛,需要对数据进行清洗。查阅资料后,我选择了箱形图(box plot)方法来检测异常值;对检测出的异常值做线性插值,即用其相邻两个数的平均值(两数之和除以 2)替换。下面是代码实现。
import java.util.ArrayList;
/**
 * Box-plot (inter-quartile range) outlier cleaning for a series of doubles.
 *
 * <p>Values lying outside {@code [Q1 - k*IQR, Q3 + k*IQR]} are treated as outliers and are
 * replaced, in their original position, by the mean of their two neighbours.
 */
public class BoxPlot {

    /** Fence multiplier used by {@link #boxPlot(double[])}. */
    public static final double DEFAULT_FENCE_FACTOR = 2.8;

    /**
     * Cleans {@code data} with the default fence multiplier {@link #DEFAULT_FENCE_FACTOR}.
     *
     * @param data the series to clean; it is not modified
     * @return a new array of the same length with outliers replaced
     * @throws NullPointerException if {@code data} is {@code null}
     */
    public static double[] boxPlot(double[] data) {
        return boxPlot(data, DEFAULT_FENCE_FACTOR);
    }

    /**
     * Cleans {@code data} using the box-plot rule with fence multiplier {@code k}.
     *
     * <p>Q1 and Q3 are linearly interpolated from a sorted copy at positions {@code n/4} and
     * {@code 3n/4}. An element is an outlier when it is greater than {@code Q3 + k*IQR} or less
     * than {@code Q1 - k*IQR}. Each outlier is replaced, going left to right, by:
     * <ul>
     *   <li>the mean of the (already cleaned) previous element and the next non-outlier;</li>
     *   <li>the next non-outlier, when the outlier is the first element;</li>
     *   <li>the previous element, when no non-outlier follows.</li>
     * </ul>
     * If the quartiles coincide and no larger value exists at or above the Q3 position, the
     * spread is zero and an unmodified copy is returned.
     *
     * @param data the series to clean; it is not modified
     * @param k    non-negative multiple of the inter-quartile range used for both fences
     * @return a new array of the same length with outliers replaced
     * @throws NullPointerException     if {@code data} is {@code null}
     * @throws IllegalArgumentException if {@code k} is negative or NaN
     */
    public static double[] boxPlot(double[] data, double k) {
        if (data == null) {
            throw new NullPointerException("data must not be null");
        }
        if (Double.isNaN(k) || k < 0) {
            throw new IllegalArgumentException("k must be non-negative, got " + k);
        }

        double[] cleaned = data.clone();
        int n = cleaned.length;
        if (n == 0) {
            return cleaned;
        }

        // Sort a private copy so the caller's array keeps its order.
        double[] sorted = data.clone();
        sort(sorted, 0, n - 1);

        // Lower quartile.
        double lowerPos = (double) n / 4;
        double q1 = interpolateAt(sorted, lowerPos);
        // Upper quartile.
        double upperPos = (double) n * 3 / 4;
        double q3 = interpolateAt(sorted, upperPos);

        // If both quartiles coincide, walk up the sorted data from the Q3 position until a
        // different value is found; if there is none, there is nothing to fence against.
        int probe = (int) upperPos;
        while (q3 == q1) {
            if (probe == n) {
                return cleaned;
            }
            q3 = sorted[probe++];
        }

        double iqr = q3 - q1;
        double upperLimit = q3 + k * iqr;
        double lowerLimit = q1 - k * iqr;

        return replaceOutliers(cleaned, lowerLimit, upperLimit);
    }

    /**
     * Linearly interpolates {@code sorted} at fractional index {@code position}; the upper
     * neighbour index is clamped to the last element so short arrays do not overflow.
     */
    private static double interpolateAt(double[] sorted, double position) {
        int lower = (int) position;
        int upper = Math.min(lower + 1, sorted.length - 1);
        return (lower + 1 - position) * sorted[lower] + (position - lower) * sorted[upper];
    }

    /**
     * Overwrites, in place and left to right, every element of {@code values} outside
     * {@code [lowerLimit, upperLimit]} with its neighbour-based replacement; returns
     * {@code values}.
     */
    private static double[] replaceOutliers(double[] values, double lowerLimit, double upperLimit) {
        int n = values.length;
        boolean[] outlier = new boolean[n];
        int outlierCount = 0;
        for (int i = 0; i < n; i++) {
            outlier[i] = values[i] > upperLimit || values[i] < lowerLimit;
            if (outlier[i]) {
                outlierCount++;
            }
        }
        // Nothing to fix, or nothing trustworthy to interpolate from.
        if (outlierCount == 0 || outlierCount == n) {
            return values;
        }

        int next = 0; // index of the nearest non-outlier strictly after i, or n if none
        for (int i = 0; i < n; i++) {
            if (!outlier[i]) {
                continue;
            }
            if (next <= i) {
                next = i + 1;
            }
            while (next < n && outlier[next]) {
                next++;
            }
            if (next == n) {
                // No valid value follows: repeat the previous (already cleaned) element.
                values[i] = values[i - 1];
            } else if (i == 0) {
                // No previous element: copy the next valid one.
                values[i] = values[next];
            } else {
                values[i] = (values[i - 1] + values[next]) / 2;
            }
        }
        return values;
    }

    /**
     * Sorts {@code a[low..high]} (both inclusive) ascending, in place, using merge sort.
     *
     * @param a    the array to sort
     * @param low  first index of the range
     * @param high last index of the range
     * @return the same array instance {@code a}
     */
    public static double[] sort(double[] a, int low, int high) {
        if (low < high) {
            int mid = low + (high - low) / 2; // overflow-safe midpoint
            sort(a, low, mid);
            sort(a, mid + 1, high);
            // Merge the two sorted halves.
            merge(a, low, mid, high);
        }
        return a;
    }

    /**
     * Merges the sorted runs {@code a[low..mid]} and {@code a[mid+1..high]} into one sorted run
     * stored back in {@code a[low..high]}.
     *
     * @param a    the array holding both runs
     * @param low  first index of the left run
     * @param mid  last index of the left run
     * @param high last index of the right run
     */
    public static void merge(double[] a, int low, int mid, int high) {
        double[] temp = new double[high - low + 1];
        int i = low;
        int j = mid + 1;
        int k = 0;
        // Move the smaller head element into the buffer first.
        while (i <= mid && j <= high) {
            if (a[i] < a[j]) {
                temp[k++] = a[i++];
            } else {
                temp[k++] = a[j++];
            }
        }
        // Copy whatever is left of the left run.
        while (i <= mid) {
            temp[k++] = a[i++];
        }
        // Copy whatever is left of the right run.
        while (j <= high) {
            temp[k++] = a[j++];
        }
        // Write the merged run back over the original range.
        System.arraycopy(temp, 0, a, low, temp.length);
    }
}