矩阵乘法的Map-Reduce实现

最新推荐文章于 2023-06-15 15:27:59 发布

猪猪奋斗记

最新推荐文章于 2023-06-15 15:27:59 发布

阅读量2.4k

点赞数 2

分类专栏： Spark 分布式文章标签： map-reduce spark 矩阵乘法

本文链接：https://blog.csdn.net/bigbigship/article/details/52298311

版权

分布式同时被 2 个专栏收录

4 篇文章 0 订阅

订阅专栏

Spark

1 篇文章 0 订阅

订阅专栏

方法一：

已知 $A_{mn} * B_{np} = C_{mp}$

$C_{i,j} = \sum_{k=1}^{n} A_{i,k} * B_{k,j}$
Example:

$$
C = \left( \begin{array}{ccc} 1&2&3\\ 4&5&0\\ 7&8&9\\ 10&11&12 \end{array} \right) * \left( \begin{array}{cc} 10&15\\ 0&2\\ 11&9 \end{array} \right) = \left( \begin{array}{cc} 43&46\\ 40&70\\ 169&202\\ 232&280 \end{array} \right)
$$

如果直接模拟做的话可以为如下的形式：

// Direct simulation of C = A * B, where A is m x n, B is n x p and C is m x p.
// C[i][j] is accumulated in place, so C has to be zero before the loops run.
for(int i=0;i<m;i++){              // every row of A / C (m rows, not n)
    for(int j=0;j<p;j++){          // every column of B / C
        for(int k=0;k<n;k++){      // shared dimension of A and B
            C[i][j] += A[i][k]*B[k][j];
        }
    }
}

通过模拟矩阵的乘法可以发现 $A_{i,j}$ 会和 $B_{j,k} ,k\in[1,p]$ 都相乘一次，所以我们map阶段可以这样做。

将 $A$ 的每个非零元素 $A_{i,j}$（$A_{i,j} \ne 0$）变成 $(j,('A',i,A_{i,j}))$ 的形式
将 $B$ 的每个非零元素 $B_{i,j}$（$B_{i,j} \ne 0$）变成 $(i,('B',j,B_{i,j}))$ 的形式
Shuffle阶段将key相同的value放到一个列表当中
在reduce的时候将key相同的来自不同矩阵的value值，做笛卡尔积
再将其map成 $((i,j),value)$ 的形式
Shuffle阶段将key相同的value放到一个列表当中
在reduce的时候将key相同的value求和

Spark实现

#!/bin/python
#coding: utf-8
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
import numpy as np
# Basic Spark environment setup (local master, application name 'test').
conf=SparkConf().setMaster('local').setAppName('test')
sc=SparkContext(conf=conf)
# Not used by the computation below.
hiveContext=HiveContext(sc)

# Each matrix is a list of (row, col, value) triples; both are 3x3 all-ones here.
matrix_1=sc.parallelize([(1,1,1),(1,2,1),(1,3,1),(2,1,1),(2,2,1),(2,3,1),(3,1,1),(3,2,1),(3,3,1)])
matrix_2=sc.parallelize([(1,1,1),(1,2,1),(1,3,1),(2,1,1),(2,2,1),(2,3,1),(3,1,1),(3,2,1),(3,3,1)])
# Key the left matrix by its column index and the right matrix by its row
# index, so A[i][k] and B[k][j] share the key k.
temp_1=matrix_1.map(lambda x: (x[1],(x[0],x[2])))
temp_2=matrix_2.map(lambda x: (x[0],(x[1],x[2])))
# Join on k instead of cartesian()+filter(): only entries with equal keys are
# paired, rather than building every |A|*|B| combination and discarding most.
# Each element of temp is (k, ((i, a_ik), (j, b_kj))).
temp=temp_1.join(temp_2)
# Emit ((i, j), a_ik * b_kj) and sum the partial products per output cell.
ret=temp.map(lambda x: ((x[1][0][0],x[1][1][0]),x[1][0][1]*x[1][1][1])).reduceByKey(lambda x,y: x+y)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(temp.collect())
print(ret.collect())

pyspark 官方文档：传送门

C++ Code

map

#include <iostream>
#include <string>
#include <vector>
#include <cstdio>
using namespace std;

/*
 * Mapper: reads one matrix from standard input and writes one record per
 * non-zero element to standard output.
 *
 * Input:  the first non-whitespace character is the matrix tag ('A' for the
 *         left operand, anything else is emitted as B).  Every following
 *         non-blank line is one matrix row; elements are separated by any
 *         run of spaces or tabs (a trailing '\r' is ignored).
 * Output: for the element in row x, column i (both 0-based)
 *           tag 'A':  "<i> A <x> <value>"   (keyed by column)
 *           other  :  "<x> B <i> <value>"   (keyed by row)
 *         so A[x][k] and B[k][j] are emitted under the same key k.
 *         Elements whose text is exactly "0" are skipped.
 */
void map()
{
    char tag;
    if(!(cin>>tag))
        return;                         // empty input: nothing to emit

    int x = 0;                          // index of the current matrix row
    string line;
    // The first getline() returns whatever follows the tag on its own line.
    // Normally that is empty and skipped as a blank line; if the first row
    // shares the line with the tag it is processed as row 0.
    while(getline(cin,line)){
        vector<string> vc;
        string tmp;
        for(string::size_type i=0;i<line.length();i++){
            const char c = line[i];
            if(c==' '||c=='\t'||c=='\r'){
                // Separator runs must not create empty tokens, otherwise
                // the column indices of later elements would shift.
                if(!tmp.empty()){
                    vc.push_back(tmp);
                    tmp.clear();
                }
            }
            else
                tmp += c;
        }
        if(!tmp.empty())
            vc.push_back(tmp);
        if(vc.empty())
            continue;                   // blank line: not a matrix row

        for(vector<string>::size_type i=0;i<vc.size();i++){
            if(vc[i]=="0")
                continue;
            if(tag=='A')
                cout<<i<<" A "<<x<<" "<<vc[i]<<'\n';
            else
                cout<<x<<" B "<<i<<" "<<vc[i]<<'\n';
        }
        x++;
    }
}

// Entry point of the mapper program: all work happens in map().
// Falling off the end of main() reports exit status 0.
int main()
{
    map();
}

reduce

#include <algorithm>
#include <climits>
#include <cstdio>
#include <iostream>
#include <map>
#include <string>
#include <vector>
using namespace std;

// (first index, second index) of a record.
typedef pair<int ,int > PII;

// Maps an index pair to an integer value.
typedef map<PII,int> MPPII;

/*
 * Reducer: reads records of the form "<k> <tag> <index> <value>" from
 * standard input, multiplies every A record with every B record that has the
 * same key k, and writes the summed products to standard output.
 *
 * Input:  tag 'A' stores the record in the A table, any other tag in the B
 *         table, both keyed by (k, index).  A later record with the same
 *         (k, index) overwrites the earlier one.  Lines that do not contain
 *         all four fields (e.g. blank lines) are skipped.  Values may be
 *         negative.
 * Output: one line "<i> <j> <sum>" per result cell that received at least
 *         one product, ordered by (i, j).  All arithmetic is done in int.
 */
void reduce()
{
    string line;
    MPPII mpA,mpB,ans;
    while(getline(cin,line)){
        int key = 0,idx = 0,val = 0;
        char tag = 0;
        // sscanf validates the field count and handles signs and repeated
        // whitespace; anything that is not a complete record is ignored.
        if(sscanf(line.c_str(),"%d %c %d %d",&key,&tag,&idx,&val)!=4)
            continue;
        if(tag=='A')
            mpA[make_pair(key,idx)]=val;
        else
            mpB[make_pair(key,idx)]=val;
    }

    MPPII::const_iterator it1,it2;
    for(it1 = mpA.begin();it1!=mpA.end();++it1){
        const int k = it1->first.first;
        // mpB is ordered by (k, index), so all B records with this key form
        // one contiguous range starting at the smallest possible index.
        for(it2 = mpB.lower_bound(make_pair(k,INT_MIN));
            it2!=mpB.end() && it2->first.first==k;++it2)
            ans[make_pair(it1->first.second,it2->first.second)]+=(it1->second)*(it2->second);
    }

    for(it1 = ans.begin();it1!=ans.end();++it1)
        cout<<(it1->first.first)<<" "<<(it1->first.second)<<" "<<it1->second<<'\n';
}

// Entry point of the reducer program: all work happens in reduce().
// Falling off the end of main() reports exit status 0.
int main()
{
    reduce();
}

run.sh

#!/bin/bash
#
# Builds the map and reduce programs, runs both matrix files through the
# mapper and feeds the combined mapper output to the reducer.
#
# Usage: run.sh <matrixA> <matrixB> <outputfile>
# Exit codes: 3 = wrong number of arguments
#             1 = building the mapper failed
#             2 = building the reducer failed
#             4 = a map or reduce step failed

if [ $# -ne 3 ]; then
    echo "we need three args:matrixA,matrixB,outputfile" >&2
    exit 3
fi

mapfile="$(pwd)/matrix_map.cpp"
mapExe="$(pwd)/map"
reducefile="$(pwd)/matrix_reduce.cpp"
reduceExe="$(pwd)/reduce"
res="$(pwd)/res"

# Remove the intermediate file and both binaries however the script ends.
trap 'rm -f "$res" "$mapExe" "$reduceExe"' EXIT

echo "getting map.exe"
if ! g++ -o "$mapExe" "$mapfile"; then
    exit 1
fi
echo "success!"

echo "getting reduce.exe"
if ! g++ -o "$reduceExe" "$reducefile"; then
    exit 2
fi
echo "success!"

echo "mapping..."
# Both mapper outputs are collected in one file for the reducer.
if ! "$mapExe" < "$1" > "$res" || ! "$mapExe" < "$2" >> "$res"; then
    echo "mapping failed" >&2
    exit 4
fi
echo "success!"

echo "reducing..."
if ! "$reduceExe" < "$res" > "$3"; then
    echo "reducing failed" >&2
    exit 4
fi
echo "success!"

data

inputA:

inputB:

output

方法一的优点是：再大的矩阵也可以处理。缺点是：网络IO太大，速度慢。

方法二：

对于 $A×B$ ，如果 $B$ 不是很大，可以把 $B$ 放到分布式缓存上，把 $A$ 按行切分发送给多个 Mapper Task，各个 Mapper Task 把 $B$ 完全放入内存中。