使用MapReduce实现pairs算法实现单词的共现矩阵

最新推荐文章于 2022-06-23 21:05:56 发布

红豆和绿豆

最新推荐文章于 2022-06-23 21:05:56 发布

阅读量3.2k

点赞数 2

分类专栏： hadoop 文章标签： mapreduce 词频共现矩阵 pairs算法

本文链接：https://blog.csdn.net/u011955252/article/details/50532950

版权

hadoop 专栏收录该内容

93 篇文章 2 订阅

订阅专栏

词频共现矩阵的用途很广泛，个性化的推荐系统，基于物品的协同过滤等等。

什么叫做共现矩阵

例如： I am a good boy good boy

I am a good boy

I 1

am 1

a 1

good 2

boy 2

共现矩阵记录的就是两个单词在一篇文档中一起出现的次数。

如何用MapReduce实现这个功能呢

1，我们使用pairs算法：设定一个窗口，将窗口的第一个元素与窗口中后面的元素依次组成一个对，例如《（I, am）, 1》

2，我们需要重写FileInputFormat，将一个文件作为整体、不允许分割；key为文件名，value为文件内容的bytes

3，我们需要自定义key的类型：一个单词对（word1，word2）作为一个key，因此我们需要implements WritableComparable<WordPair> 并实现

equals（比较两个对象是否相等）、hashCode（保证相同的key不会因为单词顺序不同而被分到不同的Reduce上）、compareTo（比较两个对象的大小），以及用于序列化的 readFields（）和 write（）

下面是具体的代码:

package WordConCurrence;

import java.io.DataInput;

/**
 * An unordered pair of words, used as the key of the co-occurrence job.
 *
 * <p>The pair is order-insensitive: {@code (a, b)} and {@code (b, a)} are
 * equal, hash alike and compare as 0. All three of {@link #equals(Object)},
 * {@link #hashCode()} and {@link #compareTo(WordPair)} are derived from the
 * same canonical view of the pair (lexicographically smaller word first), so
 * they stay consistent with one another.
 *
 * <p>Both words are expected to be non-null whenever the pair is written,
 * compared or hashed.
 */
public class WordPair implements WritableComparable<WordPair> {
    private String wordA;
    private String wordB;

    /** Creates an empty pair; the words are filled in by {@link #readFields(DataInput)}. */
    public WordPair() {
    }

    /**
     * Creates a pair from two words.
     *
     * @param wordA the first word as encountered
     * @param wordB the second word as encountered
     */
    public WordPair(String wordA, String wordB) {
        this.wordA = wordA;
        this.wordB = wordB;
    }

    /** @return the first word, in the order it was supplied */
    public String getWordA() {
        return this.wordA;
    }

    /** @return the second word, in the order it was supplied */
    public String getWordB() {
        return this.wordB;
    }

    /**
     * Serializes both words, in their original order, as modified UTF-8.
     *
     * @param out the sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(wordA);
        out.writeUTF(wordB);
    }

    /**
     * Restores both words in the order {@link #write(DataOutput)} emitted them.
     *
     * @param in the source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        wordA = in.readUTF();
        wordB = in.readUTF();
    }

    /** @return the two words in their original order, separated by a comma */
    @Override
    public String toString() {
        return wordA + "," + wordB;
    }

    /**
     * Orders pairs by their smaller word, then by their larger word.
     *
     * <p>Comparing the canonical components separately (rather than a
     * concatenation of the raw fields) keeps the ordering total and
     * consistent with {@link #equals(Object)}: swapped pairs compare as 0,
     * and distinct pairs such as ("ab","c") and ("a","bc") never do.
     *
     * @param o the pair to compare against
     * @return a negative value, zero or a positive value as this pair sorts
     *         before, with, or after {@code o}
     */
    @Override
    public int compareTo(WordPair o) {
        int bySmaller = smaller().compareTo(o.smaller());
        if (bySmaller != 0) {
            return bySmaller;
        }
        return larger().compareTo(o.larger());
    }

    /**
     * Order-insensitive equality: two pairs are equal when they contain the
     * same two words, regardless of which is first.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof WordPair)) {
            return false;
        }
        WordPair other = (WordPair) o;
        return smaller().equals(other.smaller()) && larger().equals(other.larger());
    }

    /** Symmetric in the two words, so both orderings of a pair hash alike. */
    @Override
    public int hashCode() {
        return (wordA.hashCode() + wordB.hashCode()) * 17;
    }

    /** Returns the lexicographically smaller of the two words. */
    private String smaller() {
        return wordA.compareTo(wordB) <= 0 ? wordA : wordB;
    }

    /** Returns the lexicographically larger of the two words. */
    private String larger() {
        return wordA.compareTo(wordB) <= 0 ? wordB : wordA;
    }
}

package WordConCurrence;

import java.io.IOException;

/**
 * A {@code FileInputFormat} whose files are never split: every input file is
 * handed to a {@link SingleFileNameReader} as one unit.
 */
public class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    /** Refuses splitting for every file, whatever the job context. */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Builds the reader for one input file.
     *
     * @param split   the split to read; it is cast to {@code FileSplit}
     * @param context the task context, used only for its configuration
     * @return a new {@link SingleFileNameReader} for the split
     */
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit wholeFile = (FileSplit) split;
        return new SingleFileNameReader(wholeFile, context.getConfiguration());
    }
}

package WordConCurrence;

import java.io.IOException;

/**
 * Record reader that exposes a whole file as a single record: the key is the
 * file name and the value is the file's raw bytes.
 */
public class SingleFileNameReader extends RecordReader<Text, BytesWritable> {

    private FileSplit fileSplit;
    @SuppressWarnings("unused")
    private Configuration conf;
    // True once the single record has been produced.
    private boolean processed = false;
    private Text key = null;
    private BytesWritable value = null;
    // Opened by initialize(); closed after the read or by close().
    private FSDataInputStream fis = null;

    /**
     * @param fileSplit the split describing the file to read
     * @param conf      the job configuration (stored but not otherwise used here)
     */
    public SingleFileNameReader(FileSplit fileSplit, Configuration conf) {
        this.fileSplit = fileSplit;
        this.conf = conf;
    }

    /**
     * Opens the file named by the split.
     *
     * @param split   the split to read; it is cast to {@code FileSplit}
     * @param context the task context supplying the configuration used to
     *                resolve the file system
     * @throws IOException if the file cannot be opened
     */
    @Override
    public void initialize(InputSplit split,
            org.apache.hadoop.mapreduce.TaskAttemptContext context)
            throws IOException, InterruptedException {
        fileSplit = (FileSplit) split;
        Configuration job = context.getConfiguration();
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(job);
        fis = fs.open(file);
    }

    /**
     * Produces the one and only record on the first call.
     *
     * @return {@code true} on the first call, when the record has just been
     *         loaded; {@code false} on every later call
     * @throws IOException if the reader was not initialized, the file is too
     *                     large for a single byte array, or reading fails
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new Text();
        }
        if (value == null) {
            value = new BytesWritable();
        }
        if (processed) {
            return false; // the single record was already delivered
        }
        if (fis == null) {
            throw new IOException("initialize() must be called before nextKeyValue()");
        }

        Path file = fileSplit.getPath();
        long length = fileSplit.getLength();
        // A narrowing cast would silently truncate; fail loudly instead.
        if (length > Integer.MAX_VALUE) {
            throw new IOException("File too large to read as one record: "
                    + file + " (" + length + " bytes)");
        }
        byte[] content = new byte[(int) length];
        System.out.println(file.getName());
        key.set(file.getName());
        try {
            // A read failure propagates to the caller rather than yielding a
            // partially filled record.
            IOUtils.readFully(fis, content, 0, content.length);
            value.set(content, 0, content.length);
        } finally {
            IOUtils.closeStream(fis);
            fis = null;
        }
        processed = true;
        return true; // a key/value pair is available
    }

    /** @return the file name of the current record, or {@code null} before the first read */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /** @return the file content of the current record, or {@code null} before the first read */
    @Override
    public BytesWritable getCurrentValue() throws IOException,
            InterruptedException {
        return value;
    }

    /** @return 1.0 once the record has been produced, otherwise 0.0 */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    /** Closes the input stream if it is still open (e.g. the record was never read). */
    @Override
    public void close() throws IOException {
        IOUtils.closeStream(fis);
        fis = null;
    }
}

package WordConCurrence;

import java.io.IOException;

/**
 * Counts how often two English words occur together within a sliding window,
 * across a set of documents, using the "pairs" approach: words are pushed
 * into a queue, and once the window is full the head word is paired with
 * every word behind it and then dropped.
 *
 * <p>Each output line is a word pair followed by its count, for example:
 * <pre>
 * we,on 1
 * we,should 2
 * which,food 1
 * </pre>
 *
 * <p>Usage: {@code WordConcurrnce <input path> <output path> [window]}
 */
public class WordConcurrnce {
    // Upper bound on the co-occurrence window size.
    private static final int MAX_WINDOW = 20;
    // Matches plain words made of ASCII letters only (no hyphens or digits).
    private static final String wordRegex = "([a-zA-Z]{1,})";
    private static final Pattern wordPattern = Pattern.compile(wordRegex);
    private static final IntWritable one = new IntWritable(1);

    /** Emits {@code ((head, other), 1)} for every pair seen inside the window. */
    public static class WordConcurrenceMapper extends
            Mapper<Text, BytesWritable, WordPair, IntWritable> {
        private int windowSize;
        private Queue<String> windowQueue = new LinkedList<String>();

        /** Reads the "window" setting (default 2), capped at {@code MAX_WINDOW}. */
        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            windowSize = Math.min(context.getConfiguration()
                    .getInt("window", 2), MAX_WINDOW);
        }

        /**
         * Processes one document.
         *
         * @param docName    the document's file name (not used for counting)
         * @param docContent the document's content as bytes, decoded as UTF-8
         * @param context    the sink for the emitted pairs
         */
        @Override
        public void map(Text docName, BytesWritable docContent, Context context)
                throws IOException, InterruptedException {
            // Only the first getLength() bytes of the backing array are valid
            // data; anything beyond that is buffer padding.
            String text = new String(docContent.getBytes(), 0,
                    docContent.getLength(), "UTF-8");
            Matcher matcher = wordPattern.matcher(text);

            // Start every document with an empty window so a word left over
            // from a previous call cannot pair with this document's words.
            windowQueue.clear();
            while (matcher.find()) {
                windowQueue.add(matcher.group());
                if (windowQueue.size() >= windowSize) {
                    emitHeadPairs(context);
                }
            }
            // Flush the pairs still held in the partially filled window.
            while (windowQueue.size() > 1) {
                emitHeadPairs(context);
            }
        }

        /**
         * For a queue [q1, q2, ..., qn], emits ((q1,q2),1) ... ((q1,qn),1)
         * and then removes q1 from the queue.
         */
        private void emitHeadPairs(Context context) throws IOException,
                InterruptedException {
            Iterator<String> it = windowQueue.iterator();
            String head = it.next();
            while (it.hasNext()) {
                context.write(new WordPair(head, it.next()), one);
            }
            windowQueue.remove();
        }

    }

    /** Sums the counts received for each word pair. */
    public static class WordConcurrenceReducer extends
            Reducer<WordPair, IntWritable, WordPair, IntWritable> {
        @Override
        public void reduce(WordPair wordPair, Iterable<IntWritable> frequence,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : frequence) {
                sum += val.get();
            }
            context.write(wordPair, new IntWritable(sum));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} input path, {@code args[1]} output path,
     *             optional {@code args[2]} window size (the mapper falls back
     *             to 2 when it is not set)
     */
    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        if (args.length < 2) {
            System.err.println(
                    "Usage: WordConcurrnce <input path> <output path> [window]");
            System.exit(2);
        }

        Job wordConcurrenceJob = new Job();
        wordConcurrenceJob.setJobName("wordConcurrenceJob");
        wordConcurrenceJob.setJarByClass(WordConcurrnce.class);
        if (args.length > 2) {
            wordConcurrenceJob.getConfiguration().setInt("window",
                    Integer.parseInt(args[2]));
        }

        wordConcurrenceJob.setMapperClass(WordConcurrenceMapper.class);
        wordConcurrenceJob.setMapOutputKeyClass(WordPair.class);
        wordConcurrenceJob.setMapOutputValueClass(IntWritable.class);

        // Summation is associative and the reducer's input and output types
        // match, so it can also pre-aggregate counts on the map side.
        wordConcurrenceJob.setCombinerClass(WordConcurrenceReducer.class);
        wordConcurrenceJob.setReducerClass(WordConcurrenceReducer.class);
        wordConcurrenceJob.setOutputKeyClass(WordPair.class);
        wordConcurrenceJob.setOutputValueClass(IntWritable.class);

        wordConcurrenceJob.setInputFormatClass(WholeFileInputFormat.class);
        wordConcurrenceJob.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(wordConcurrenceJob, new Path(args[0]));
        FileOutputFormat.setOutputPath(wordConcurrenceJob, new Path(args[1]));

        boolean succeeded = wordConcurrenceJob.waitForCompletion(true);
        System.out.println("finished!");
        if (!succeeded) {
            // Report failure to the caller through the exit status.
            System.exit(1);
        }
    }
}

红豆和绿豆

关注

2
点赞
踩
2

收藏

觉得还不错? 一键收藏
2
评论
使用MapReduce实现pairs算法实现单词的共现矩阵

词频共现矩阵的用途很广泛，个性化的推荐系统，基于物品的协同过滤等等。什么叫做共现矩阵例如： I am a good boy good boy I am a good boyI 1am 1a 1good 2boy 2就是二个单词一起出现的次
复制链接

扫一扫