MapReduce之皮尔逊相关系数
什么是皮尔逊相关系数
皮尔逊(Pearson)相关系数可以度量两个数据集之间的线性相关关系。基本来说,皮尔逊相关系数可以解答这样的问题:能不能用一条直线来近似表示这两组数据之间的关系。
皮尔逊相关系数公式
皮尔逊相关系数的计算公式有很多不同的等价形式。令 $x=(x_1,x_2,\dots,x_n)$,$y=(y_1,y_2,\dots,y_n)$,则 $x$ 和 $y$ 的皮尔逊相关系数可表述为:
$$r=\frac{\sum(x_i-\overline{x})(y_i-\overline{y})}{\sqrt{\sum(x_i-\overline{x})^2\sum(y_i-\overline{y})^2}}$$
其中:
$$\overline{x}=\frac{\sum{x}}{n}$$
$$\overline{y}=\frac{\sum{y}}{n}$$
皮尔逊相关系数有以下性质:
- 范围为 $-1.00 \leq r \leq 1.00$
- 相关系数是两个变量间相关联强度的一个无量纲指标:$r>0$ 表示正相关,$r<0$ 表示负相关,$r=0$ 表示没有线性关联
- 度量x和y之间的线性关系
- 绝对值越大表示相关性越大
MapReduce解决方案
输入数据集
1,1,3,-1
2,2,1,-2
3,3,8,-3
map阶段任务
该阶段的任务主要是解析每一行数据,并对每一对列 $(i,j)$($i<j$)输出该行在这两列上的取值,供reduce阶段计算各列之间的相关系数
map阶段编码
public class PearsonMapper extends Mapper<LongWritable,Text,Tuple2,Tuple2>{
private Tuple2<Integer,Integer> reducekey;
private Tuple2<Double,Double> reduceVaue;
public void map(LongWritable key, Text value, Mapper.Context context){
String[] line=value.toString().split(",");
int size=line.length;
System.out.println("size is "+size);
double[] arr=new double[size];
for(int i=0;i<size;i++){
arr[i]=Double.parseDouble(line[i]);
}
try{
for(int i=0;i<size-1;i++){
for(int j=i+1;j<size;j++){
reducekey=new Tuple2(i,j);
reduceVaue=new Tuple2(arr[i],arr[j]);
context.write(reducekey,reduceVaue);
System.out.println(reducekey.toString()+" "+reduceVaue.toString());
}
}
} catch (InterruptedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
其中Tuple2类设计如下:
public class Tuple2<T1,T2> implements Writable, WritableComparable<Tuple2<T1,T2>> {
private T1 _1;
private T2 _2;
public Tuple2() {
}
public Tuple2(T1 _1, T2 _2) {
set(_1, _2);
}
private void set(T1 s1, T2 s2) {
_1 = s1;
_2 = s2;
}
public T1 first() {
return _1;
}
public void setFirst(T1 _1) {
this._1 = _1;
}
public T2 second() {
return _2;
}
public void setSecond(T2 _2) {
this._2 = _2;
}
public int compareTo(Tuple2 o) {
return 0;
}
public void write(DataOutput dataOutput) throws IOException {
Text.writeString(dataOutput, String.valueOf(_1));
Text.writeString(dataOutput, String.valueOf(_2));
}
public void readFields(DataInput dataInput) throws IOException {
_1 = (T1) Text.readString(dataInput);
_2 = (T2) Text.readString(dataInput);
}
public String toString() {
StringBuffer sb = new StringBuffer("Tuple2[");
sb.append(_1).append(",").append(_2);
return sb.append("]").toString();
}
}
reduce阶段任务
该阶段主要是将每个键 $(i,j)$($0\le i<j<size$)下的所有数据进行收集并计算其相关系数。
reduce阶段编码
public class PearsonReducer extends Reducer<Tuple2<Integer,Integer>,Tuple2<Double,Double>,Tuple2,Text> {
private Tuple2<Integer,Integer> reduceKey;
public void reduce(Tuple2<Integer,Integer> key, Iterable<Tuple2<Double,Double>> values,Context context){
double x=0.0d;
double y=0.0d;
double xx=0.0d;
double yy=0.0d;
double xy=0.0d;
double n=0.0d;
double first,second;
try{
for(Tuple2<Double,Double> pair:values){
first=Double.parseDouble(String.valueOf(pair.first()));
second=Double.parseDouble(String.valueOf(pair.second()));
x+=first;
y+=second;
xx+=Math.pow(first,2.0d);
yy+=Math.pow(second,2.0d);
xy+=(first*second);
n+=1.0d;
System.out.println(pearsonCalculate(x,y,xx,yy,xy,n));
reduceKey=key;
context.write(reduceKey,new Text("r = "+pearsonCalculate(x,y,xx,yy,xy,n)));
}
} catch (InterruptedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
private static double pearsonCalculate(double x,double y,double xx,
double yy,double xy,double n){
double numerator=xy-((x*y)/n);
double denominator1=xx-(Math.pow(x,2.0d)/n);
double denominator2=yy-(Math.pow(y,2.0d)/n);
double denominator=Math.sqrt(xx*yy);
double correlation=numerator/denominator;
return correlation;
}
}
驱动程序如下
public class PearsonDriver {
public static void main(String[] args) {
try{
Configuration conf=new Configuration();
String[] otherArgs=new String[]{"input/Pearson.txt","output"};
if(otherArgs.length!=2){
System.out.println("参数错误");
System.exit(1);
}
Job job=new Job(conf,"PearsonDriver");
FileInputFormat.addInputPath(job,new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job,new Path(otherArgs[1]));
job.setJarByClass(PearsonDriver.class);
job.setMapperClass(PearsonMapper.class);
job.setReducerClass(PearsonReducer.class);
job.setOutputKeyClass(Tuple2.class);
job.setOutputValueClass(Tuple2.class);
System.exit(job.waitForCompletion(true)?0:1);
} catch (InterruptedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
}
}