import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;
/**
 * Static helpers for the k-means job: reading and writing cluster-center
 * files through the Hadoop {@link FileSystem} API and testing for convergence.
 *
 * <p>A center file holds one center per line as comma-separated doubles.
 */
class Help {
    /** Compile-time switch for {@link #debug(Object, String)} output. */
    static final boolean DEBUG = false;

    /**
     * Prints {@code s + ":" + o} to standard output when {@link #DEBUG} is on.
     *
     * @param o value to print (may be {@code null})
     * @param s label printed before the value
     */
    public static void debug(Object o, String s) {
        if (DEBUG) {
            System.out.println(s + ":" + String.valueOf(o));
        }
    }

    /**
     * Parses one comma-separated record into a list of doubles.
     *
     * @param record comma-separated numeric fields
     * @return the parsed values, in field order
     * @throws NumberFormatException if a field is not a valid double
     */
    private static ArrayList<Double> parseCenter(String record) {
        String[] fields = record.split(",");
        ArrayList<Double> center = new ArrayList<Double>(fields.length);
        for (int i = 0; i < fields.length; i++) {
            center.add(Double.parseDouble(fields[i]));
        }
        return center;
    }

    /**
     * Reads the center file at {@code inputPath}, one center per line.
     *
     * <p>Blank lines are skipped. I/O failures are reported on standard error
     * and whatever was read so far is returned (best effort, as before).
     *
     * @param inputPath path of the center file
     * @return the centers read; possibly empty, never {@code null}
     */
    public static List<ArrayList<Double>> getOldCenters(String inputPath) {
        List<ArrayList<Double>> result = new ArrayList<ArrayList<Double>>();
        Configuration conf = new Configuration();
        try {
            FileSystem hdfs = FileSystem.get(conf);
            FSDataInputStream fsIn = hdfs.open(new Path(inputPath));
            try {
                LineReader lineIn = new LineReader(fsIn, conf);
                Text line = new Text();
                while (lineIn.readLine(line) > 0) {
                    String record = line.toString();
                    // A blank line (e.g. a trailing newline) carries no center.
                    if (record.trim().length() == 0) {
                        continue;
                    }
                    result.add(parseCenter(record));
                }
            } finally {
                // Close even when parsing throws, so the stream is not leaked.
                fsIn.close();
            }
        } catch (IOException e) {
            System.err.println("Failed to read centers from " + inputPath);
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Deletes {@code path} (recursively if it is a directory). Failures are
     * reported on standard error and otherwise ignored.
     *
     * @param path path to delete
     */
    public static void deleteLastResult(String path) {
        Configuration conf = new Configuration();
        try {
            FileSystem hdfs = FileSystem.get(conf);
            // recursive=true is what the deprecated delete(Path) overload did.
            hdfs.delete(new Path(path), true);
        } catch (IOException e) {
            System.err.println("Failed to delete " + path + ": " + e);
        }
    }

    /**
     * Copies the local file {@code src} to {@code dst} on the configured file
     * system. Failures are reported on standard error and otherwise ignored.
     *
     * @param src local source path
     * @param dst destination path
     */
    public static void copyOriginalCenters(String src, String dst) {
        Configuration conf = new Configuration();
        try {
            FileSystem hdfs = FileSystem.get(conf);
            hdfs.copyFromLocalFile(new Path(src), new Path(dst));
        } catch (IOException e) {
            System.err.println("Failed to copy " + src + " to " + dst + ": " + e);
        }
    }

    /**
     * Reads the freshly computed centers from the files {@code newPath + t}
     * for {@code t = 0 .. K-1}, stopping at the first file that does not exist.
     *
     * <p>Each line is expected to be {@code key<whitespace>v1,v2,...}. A line
     * shorter than five characters is treated as a key whose value is on the
     * following line.
     * NOTE(review): the reducer's output format is not visible here; both tab
     * and space separators are accepted - confirm against the job setup.
     */
    private static List<ArrayList<Double>> readNewCenters(FileSystem hdfs,
            Configuration conf, String newPath, int K) throws IOException {
        List<ArrayList<Double>> centers = new ArrayList<ArrayList<Double>>();
        for (int t = 0; t < K; t++) {
            Path inPath = new Path(newPath + t);
            if (!hdfs.exists(inPath)) {
                break;
            }
            FSDataInputStream fsIn = hdfs.open(inPath);
            try {
                LineReader lineIn = new LineReader(fsIn, conf);
                Text line = new Text();
                while (lineIn.readLine(line) > 0) {
                    String tmp = line.toString();
                    Help.debug(tmp, "tmp");
                    // Handles the case seen on the cluster where the key and
                    // the value are not on the same line.
                    if (tmp.length() < 5) {
                        if (lineIn.readLine(line) <= 0) {
                            break; // key without a value at end of file
                        }
                        centers.add(parseCenter(line.toString()));
                        continue;
                    }
                    String[] tmpLine = tmp.split("\\s+");
                    if (tmpLine.length < 2) {
                        throw new IOException("Malformed center line in "
                                + inPath + ": '" + tmp + "'");
                    }
                    Help.debug(tmpLine[1], "record");
                    centers.add(parseCenter(tmpLine[1]));
                }
            } finally {
                fsIn.close();
            }
        }
        return centers;
    }

    /**
     * Compares the previous centers with the newly computed ones and decides
     * whether the iteration has converged.
     *
     * <p>The distance is the sum over all centers and all coordinates from
     * {@code dataBeginIndex} on of {@code ((|old| - |new|) / (|old| + |new|))^2}.
     * A coordinate that is zero in both centers contributes nothing (instead
     * of 0/0 = NaN, which would make convergence impossible). When not
     * converged, the file at {@code oldPath} is replaced by the new centers.
     *
     * @param oldPath      file holding the previous centers
     * @param newPath      prefix of the new center files; the index
     *                     {@code 0..K-1} is appended to it
     * @param KPath        the number of clusters K, as a decimal string
     * @param dtBegIdxPath index of the first data coordinate, as a decimal string
     * @param threshold    largest distance still considered converged
     * @return {@code true} if the distance is at most {@code threshold}
     * @throws IOException if reading or writing fails, a line is malformed,
     *                     or fewer than K old or new centers are available
     */
    public static boolean isFinished(String oldPath, String newPath,
            String KPath, String dtBegIdxPath, double threshold)
            throws IOException {
        int dataBeginIndex = Integer.parseInt(dtBegIdxPath);
        int K = Integer.parseInt(KPath);
        List<ArrayList<Double>> oldCenters = Help.getOldCenters(oldPath);
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);
        List<ArrayList<Double>> newCenters = readNewCenters(hdfs, conf, newPath, K);

        // Fail with a clear message rather than an IndexOutOfBoundsException.
        if (oldCenters.size() < K || newCenters.size() < K) {
            throw new IOException("Expected " + K + " centers but found "
                    + oldCenters.size() + " old (" + oldPath + ") and "
                    + newCenters.size() + " new (" + newPath + "*)");
        }

        double distance = 0;
        for (int i = 0; i < K; i++) {
            List<Double> oldCenter = oldCenters.get(i);
            List<Double> newCenter = newCenters.get(i);
            for (int j = dataBeginIndex; j < oldCenters.get(0).size(); j++) {
                double t1 = Math.abs(oldCenter.get(j));
                double t2 = Math.abs(newCenter.get(j));
                double sum = t1 + t2;
                if (sum == 0) {
                    continue; // both coordinates are zero: no difference
                }
                distance += Math.pow((t1 - t2) / sum, 2);
            }
        }
        if (distance <= threshold) {
            return true;
        }

        // Not converged: the new centers become the old ones for the next pass.
        Help.deleteLastResult(oldPath);
        FSDataOutputStream os = hdfs.create(new Path(oldPath));
        try {
            for (int i = 0; i < newCenters.size(); i++) {
                List<Double> center = newCenters.get(i);
                StringBuilder text = new StringBuilder();
                for (int j = 0; j < center.size(); j++) {
                    if (j > 0) {
                        text.append(',');
                    }
                    text.append(center.get(j));
                }
                text.append('\n');
                // Use the encoded byte count, not the character count.
                byte[] bytes = text.toString().getBytes("UTF-8");
                os.write(bytes, 0, bytes.length);
            }
        } finally {
            os.close();
        }
        return false;
    }
}
public class Kmeans {
// static List<ArrayList<Double>> centers ;
// static int K;
// static int dataBeginIndex;
public static class KmeansMapper extends
Mapper<Object, Text, IntWritable, Text> {
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split(",");
List<ArrayList<Double>> centers = Help.getOldCenters(context
.getConfiguration().get("centersPath"));
int dataBeginIndex = Integer.parseInt(context.getConfiguration()
.get("dtBegIdxPath"));
int K = Integer.parseInt(context.getConfiguration().get("KPath"));
double minDistance = 99999999;
int centerIndex = K;
for (int i = 0; i < K; i++) {
double currentDistance = 0;
for (int j = dataBeginIndex; j < fields.length; j++) {
double t1 = Math.abs(centers.get(i).get(j));
double t2 = Math.abs(Double.parseDouble(fields[j]));
currentDistance += Math
MapReduce 运行Kmeans代码
最新推荐文章于 2023-10-31 14:15:39 发布