A while back, getting from a working Hadoop configuration to a running k-means MapReduce job cost me several days; yesterday I finally sorted out the remaining configuration and runtime problems. The k-means algorithm itself looks simple, but for a first MapReduce program it still poses some challenges, and by now I understand most of it. The implementation was downloaded from the web; this post walks through how it works.
KMeans.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KMeans {
    public static void main(String[] args) throws Exception {
        CenterInitial centerInitial = new CenterInitial();
        centerInitial.run(args);                 // pick the initial centers
        int times = 0;
        double s = 0, shold = 0.1;               // shold is the convergence threshold
        do {
            Configuration conf = new Configuration();
            conf.set("fs.default.name", "hdfs://localhost:9000");
            Job job = new Job(conf, "KMeans");   // one MapReduce job per k-means iteration
            job.setJarByClass(KMeans.class);     // class used to locate the job jar
            job.setOutputKeyClass(Text.class);   // reducer output key type: Text
            job.setOutputValueClass(Text.class); // reducer output value type: Text
            job.setMapperClass(KMapper.class);   // mapper class
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(KReducer.class); // reducer class
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(args[2]), true);  // args[2] is the output directory; delete it if it already exists
            // args[0] is the input directory, args[2] the output directory
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));
            // run the job and block until it finishes; proceed only on success
            // (the original posted code called waitForCompletion twice in a row,
            // which made the framework print the job counters twice)
            if (job.waitForCompletion(true)) {
                // compare the new centers against the previous ones: s is the largest
                // squared shift of any center, and the new centers replace the old
                NewCenter newCenter = new NewCenter();
                s = newCenter.run(args);
                times++;
            }
        } while (s > shold);                     // stop once the center shift falls below the threshold
        System.out.println("Iterator: " + times); // number of iterations
    }
}
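The warning "Applications should implement Tool" that appears in the trace below refers to this driver: it hard-codes fs.default.name instead of letting Hadoop parse generic options. As a hedged sketch (the class name KMeansDriver is invented here, not part of the original code), the driver loop could be wrapped like this:

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical wrapper: ToolRunner routes -D options (e.g. -D fs.default.name=...)
// through GenericOptionsParser, so the configuration need not be hard-coded.
public class KMeansDriver extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        // the do/while iteration from KMeans.main() would move here,
        // building each Job from getConf() instead of new Configuration()
        return 0;
    }
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new KMeansDriver(), args));
    }
}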
The job is invoked with three arguments: args[0] is the input directory holding the data files, args[1] is the working directory under which the centers file lives, and args[2] is the output directory:
hdfs://localhost:9000/home/administrator/hadoop/kmeans/input hdfs://localhost:9000/home/administrator/hadoop/kmeans hdfs://localhost:9000/home/administrator/hadoop/kmeans/output
The code's behavior is documented in the inline comments.
Input data, stored in 2.txt under the input directory: (1,1) (9,9) (2,3) (10,30) (4,4) (34,40) (5,6) (15,20)
3.txt holds the current (intermediate) cluster centers.
part-r-00000 holds the reduce output of each iteration.
The MapReduce trace and results of the run:
Initial centers: (10,30) (2,3)
13/01/26 08:58:38 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
13/01/26 08:58:38 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/01/26 08:58:38 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
13/01/26 08:58:38 INFO input.FileInputFormat: Total input paths to process : 2
13/01/26 08:58:38 WARN snappy.LoadSnappy: Snappy native library not loaded
13/01/26 08:58:38 INFO mapred.JobClient: Running job: job_local_0001
13/01/26 08:58:39 INFO util.ProcessTree: setsid exited with exit code 0
13/01/26 08:58:39 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@15718f2
13/01/26 08:58:39 INFO mapred.MapTask: io.sort.mb = 100
13/01/26 08:58:39 INFO mapred.MapTask: data buffer = 79691776/99614720
13/01/26 08:58:39 INFO mapred.MapTask: record buffer = 262144/327680
0list:1
0c:10
1list:1
1c:30
Center (2,3) assigned point (1,1)
Mapper output: (2,3) (1,1)
0list:9
0c:10
1list:9
1c:30
Center (2,3) assigned point (9,9)
Mapper output: (2,3) (9,9)
0list:2
0c:10
1list:3
1c:30
Center (2,3) assigned point (2,3)
Mapper output: (2,3) (2,3)
0list:10
0c:10
1list:30
1c:30
Center (10,30) assigned point (10,30)
Mapper output: (10,30) (10,30)
0list:4
0c:10
1list:4
1c:30
Center (2,3) assigned point (4,4)
Mapper output: (2,3) (4,4)
0list:34
0c:10
1list:40
1c:30
Center (10,30) assigned point (34,40)
Mapper output: (10,30) (34,40)
0list:5
0c:10
1list:6
1c:30
Center (2,3) assigned point (5,6)
Mapper output: (2,3) (5,6)
0list:15
0c:10
1list:20
1c:30
Center (10,30) assigned point (15,20)
Mapper output: (10,30) (15,20)
13/01/26 08:58:39 INFO mapred.MapTask: Starting flush of map output
13/01/26 08:58:39 INFO mapred.MapTask: Finished spill 0
13/01/26 08:58:39 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
13/01/26 08:58:39 INFO mapred.JobClient: map 0% reduce 0%
13/01/26 08:58:42 INFO mapred.LocalJobRunner:
13/01/26 08:58:42 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
13/01/26 08:58:42 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@77eaf8
13/01/26 08:58:42 INFO mapred.MapTask: io.sort.mb = 100
13/01/26 08:58:42 INFO mapred.MapTask: data buffer = 79691776/99614720
13/01/26 08:58:42 INFO mapred.MapTask: record buffer = 262144/327680
0list:2
0c:10
1list:3
1c:30
Center (2,3) assigned point (2,3)
Mapper output: (2,3) (2,3)
0list:10
0c:10
1list:30
1c:30
Center (10,30) assigned point (10,30)
Mapper output: (10,30) (10,30)
0list:34
0c:10
1list:40
1c:30
Center (10,30) assigned point (34,40)
Mapper output: (10,30) (34,40)
0list:1
0c:10
1list:1
1c:30
Center (2,3) assigned point (1,1)
Mapper output: (2,3) (1,1)
13/01/26 08:58:42 INFO mapred.MapTask: Starting flush of map output
13/01/26 08:58:42 INFO mapred.MapTask: Finished spill 0
13/01/26 08:58:42 INFO mapred.Task: Task:attempt_local_0001_m_000001_0 is done. And is in the process of commiting
13/01/26 08:58:42 INFO mapred.JobClient: map 100% reduce 0%
13/01/26 08:58:45 INFO mapred.LocalJobRunner:
13/01/26 08:58:45 INFO mapred.Task: Task 'attempt_local_0001_m_000001_0' done.
13/01/26 08:58:45 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@18d7ace
13/01/26 08:58:45 INFO mapred.LocalJobRunner:
13/01/26 08:58:45 INFO mapred.Merger: Merging 2 sorted segments
13/01/26 08:58:45 INFO mapred.Merger: Down to the last merge-pass, with 2 segments left of total size: 192 bytes
13/01/26 08:58:45 INFO mapred.LocalJobRunner:
Reduce pass
Reduce key: (10,30)
val:(10,30)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(34,40)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(10,30)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(34,40)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(15,20)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
count:5
outVal:(10,30) (34,40) (10,30) (34,40) (15,20) /outVal
ave[0]=103.0
ave[1]=160.0
Writing to part file: (10,30) (10,30) (34,40) (10,30) (34,40) (15,20) (20.6,32.0)
Reduce pass
Reduce key: (2,3)
val:(1,1)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(9,9)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(2,3)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(4,4)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(5,6)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(2,3)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
val:(1,1)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@141fab6
temlength:2
count:7
outVal:(1,1) (9,9) (2,3) (4,4) (5,6) (2,3) (1,1) /outVal
ave[0]=24.0
ave[1]=27.0
Writing to part file: (2,3) (1,1) (9,9) (2,3) (4,4) (5,6) (2,3) (1,1) (3.4285715,3.857143)
13/01/26 08:58:45 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
13/01/26 08:58:45 INFO mapred.LocalJobRunner:
13/01/26 08:58:45 INFO mapred.Task: Task attempt_local_0001_r_000000_0 is allowed to commit now
13/01/26 08:58:45 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to hdfs://localhost:9000/home/administrator/hadoop/kmeans/output
13/01/26 08:58:48 INFO mapred.LocalJobRunner: reduce > reduce
13/01/26 08:58:48 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done.
13/01/26 08:58:48 INFO mapred.JobClient: map 100% reduce 100%
13/01/26 08:58:48 INFO mapred.JobClient: Job complete: job_local_0001
13/01/26 08:58:48 INFO mapred.JobClient: Counters: 22
13/01/26 08:58:48 INFO mapred.JobClient: File Output Format Counters
13/01/26 08:58:48 INFO mapred.JobClient: Bytes Written=129
13/01/26 08:58:48 INFO mapred.JobClient: FileSystemCounters
13/01/26 08:58:48 INFO mapred.JobClient: FILE_BYTES_READ=1818
13/01/26 08:58:48 INFO mapred.JobClient: HDFS_BYTES_READ=450
13/01/26 08:58:48 INFO mapred.JobClient: FILE_BYTES_WRITTEN=122901
13/01/26 08:58:48 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=171
13/01/26 08:58:48 INFO mapred.JobClient: File Input Format Counters
13/01/26 08:58:48 INFO mapred.JobClient: Bytes Read=82
13/01/26 08:58:48 INFO mapred.JobClient: Map-Reduce Framework
13/01/26 08:58:48 INFO mapred.JobClient: Map output materialized bytes=200
13/01/26 08:58:48 INFO mapred.JobClient: Map input records=2
13/01/26 08:58:48 INFO mapred.JobClient: Reduce shuffle bytes=0
13/01/26 08:58:48 INFO mapred.JobClient: Spilled Records=24
13/01/26 08:58:48 INFO mapred.JobClient: Map output bytes=164
13/01/26 08:58:48 INFO mapred.JobClient: Total committed heap usage (bytes)=498860032
13/01/26 08:58:48 INFO mapred.JobClient: CPU time spent (ms)=0
13/01/26 08:58:48 INFO mapred.JobClient: SPLIT_RAW_BYTES=262
13/01/26 08:58:48 INFO mapred.JobClient: Combine input records=0
13/01/26 08:58:48 INFO mapred.JobClient: Reduce input records=12
13/01/26 08:58:48 INFO mapred.JobClient: Reduce input groups=2
13/01/26 08:58:48 INFO mapred.JobClient: Combine output records=0
13/01/26 08:58:48 INFO mapred.JobClient: Physical memory (bytes) snapshot=0
13/01/26 08:58:48 INFO mapred.JobClient: Reduce output records=2
13/01/26 08:58:48 INFO mapred.JobClient: Virtual memory (bytes) snapshot=0
13/01/26 08:58:48 INFO mapred.JobClient: Map output records=12
Previous MapReduce result, line 1: (10,30) (10,30) (34,40) (10,30) (34,40) (15,20) (20.6,32.0)
line 2: (2,3) (1,1) (9,9) (2,3) (4,4) (5,6) (2,3) (1,1) (3.4285715,3.857143)
Center 0 squared shift: 116.36001
Center 1 squared shift: 2.7755103
New centers: (20.6,32.0) (3.4285715,3.857143)
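A quick check of where (20.6,32.0) comes from: the (10,30) cluster collected the values (10,30) (34,40) (10,30) (34,40) (15,20) — five values rather than three because the input directory apparently held two files with overlapping points, hence the two map tasks above — so ave[0] = 103/5 = 20.6 and ave[1] = 160/5 = 32.0.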
13/01/26 08:58:49 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/01/26 08:58:49 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
13/01/26 08:58:49 INFO input.FileInputFormat: Total input paths to process : 2
13/01/26 08:58:49 INFO mapred.JobClient: Running job: job_local_0002
13/01/26 08:58:49 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@18aab40
13/01/26 08:58:49 INFO mapred.MapTask: io.sort.mb = 100
13/01/26 08:58:49 INFO mapred.MapTask: data buffer = 79691776/99614720
13/01/26 08:58:49 INFO mapred.MapTask: record buffer = 262144/327680
0list:1
0c:20.6
1list:1
1c:32.0
Center (3.4285715,3.857143) assigned point (1,1)
Mapper output: (3.4285715,3.857143) (1,1)
0list:9
0c:20.6
1list:9
1c:32.0
Center (3.4285715,3.857143) assigned point (9,9)
Mapper output: (3.4285715,3.857143) (9,9)
0list:2
0c:20.6
1list:3
1c:32.0
Center (3.4285715,3.857143) assigned point (2,3)
Mapper output: (3.4285715,3.857143) (2,3)
0list:10
0c:20.6
1list:30
1c:32.0
Center (20.6,32.0) assigned point (10,30)
Mapper output: (20.6,32.0) (10,30)
0list:4
0c:20.6
1list:4
1c:32.0
Center (3.4285715,3.857143) assigned point (4,4)
Mapper output: (3.4285715,3.857143) (4,4)
0list:34
0c:20.6
1list:40
1c:32.0
Center (20.6,32.0) assigned point (34,40)
Mapper output: (20.6,32.0) (34,40)
0list:5
0c:20.6
1list:6
1c:32.0
Center (3.4285715,3.857143) assigned point (5,6)
Mapper output: (3.4285715,3.857143) (5,6)
0list:15
0c:20.6
1list:20
1c:32.0
Center (20.6,32.0) assigned point (15,20)
Mapper output: (20.6,32.0) (15,20)
13/01/26 08:58:49 INFO mapred.MapTask: Starting flush of map output
13/01/26 08:58:49 INFO mapred.MapTask: Finished spill 0
13/01/26 08:58:49 INFO mapred.Task: Task:attempt_local_0002_m_000000_0 is done. And is in the process of commiting
13/01/26 08:58:50 INFO mapred.JobClient: map 0% reduce 0%
13/01/26 08:58:52 INFO mapred.LocalJobRunner:
13/01/26 08:58:52 INFO mapred.Task: Task 'attempt_local_0002_m_000000_0' done.
13/01/26 08:58:52 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@147358f
13/01/26 08:58:52 INFO mapred.MapTask: io.sort.mb = 100
13/01/26 08:58:52 INFO mapred.MapTask: data buffer = 79691776/99614720
13/01/26 08:58:52 INFO mapred.MapTask: record buffer = 262144/327680
0list:2
0c:20.6
1list:3
1c:32.0
Center (3.4285715,3.857143) assigned point (2,3)
Mapper output: (3.4285715,3.857143) (2,3)
0list:10
0c:20.6
1list:30
1c:32.0
Center (20.6,32.0) assigned point (10,30)
Mapper output: (20.6,32.0) (10,30)
0list:34
0c:20.6
1list:40
1c:32.0
Center (20.6,32.0) assigned point (34,40)
Mapper output: (20.6,32.0) (34,40)
0list:1
0c:20.6
1list:1
1c:32.0
Center (3.4285715,3.857143) assigned point (1,1)
Mapper output: (3.4285715,3.857143) (1,1)
13/01/26 08:58:52 INFO mapred.MapTask: Starting flush of map output
13/01/26 08:58:52 INFO mapred.MapTask: Finished spill 0
13/01/26 08:58:52 INFO mapred.Task: Task:attempt_local_0002_m_000001_0 is done. And is in the process of commiting
13/01/26 08:58:53 INFO mapred.JobClient: map 100% reduce 0%
13/01/26 08:58:55 INFO mapred.LocalJobRunner:
13/01/26 08:58:55 INFO mapred.Task: Task 'attempt_local_0002_m_000001_0' done.
13/01/26 08:58:55 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@2798e7
13/01/26 08:58:55 INFO mapred.LocalJobRunner:
13/01/26 08:58:55 INFO mapred.Merger: Merging 2 sorted segments
13/01/26 08:58:55 INFO mapred.Merger: Down to the last merge-pass, with 2 segments left of total size: 317 bytes
13/01/26 08:58:55 INFO mapred.LocalJobRunner:
Reduce pass
Reduce key: (20.6,32.0)
val:(10,30)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(34,40)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(10,30)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(34,40)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(15,20)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
count:5
outVal:(10,30) (34,40) (10,30) (34,40) (15,20) /outVal
ave[0]=103.0
ave[1]=160.0
Writing to part file: (20.6,32.0) (10,30) (34,40) (10,30) (34,40) (15,20) (20.6,32.0)
Reduce pass
Reduce key: (3.4285715,3.857143)
val:(1,1)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(9,9)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(2,3)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(4,4)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(5,6)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(2,3)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
val:(1,1)
values:org.apache.hadoop.mapreduce.ReduceContext$ValueIterable@13043d2
temlength:2
count:7
outVal:(1,1) (9,9) (2,3) (4,4) (5,6) (2,3) (1,1) /outVal
ave[0]=24.0
ave[1]=27.0
Writing to part file: (3.4285715,3.857143) (1,1) (9,9) (2,3) (4,4) (5,6) (2,3) (1,1) (3.4285715,3.857143)
13/01/26 08:58:55 INFO mapred.Task: Task:attempt_local_0002_r_000000_0 is done. And is in the process of commiting
13/01/26 08:58:55 INFO mapred.LocalJobRunner:
13/01/26 08:58:55 INFO mapred.Task: Task attempt_local_0002_r_000000_0 is allowed to commit now
13/01/26 08:58:55 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0002_r_000000_0' to hdfs://localhost:9000/home/administrator/hadoop/kmeans/output
13/01/26 08:58:58 INFO mapred.LocalJobRunner: reduce > reduce
13/01/26 08:58:58 INFO mapred.Task: Task 'attempt_local_0002_r_000000_0' done.
13/01/26 08:58:59 INFO mapred.JobClient: map 100% reduce 100%
13/01/26 08:58:59 INFO mapred.JobClient: Job complete: job_local_0002
13/01/26 08:58:59 INFO mapred.JobClient: Counters: 22
13/01/26 08:58:59 INFO mapred.JobClient: File Output Format Counters
13/01/26 08:58:59 INFO mapred.JobClient: Bytes Written=148
13/01/26 08:58:59 INFO mapred.JobClient: FileSystemCounters
13/01/26 08:58:59 INFO mapred.JobClient: FILE_BYTES_READ=4442
13/01/26 08:58:59 INFO mapred.JobClient: HDFS_BYTES_READ=1262
13/01/26 08:58:59 INFO mapred.JobClient: FILE_BYTES_WRITTEN=246235
13/01/26 08:58:59 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=676
13/01/26 08:58:59 INFO mapred.JobClient: File Input Format Counters
13/01/26 08:58:59 INFO mapred.JobClient: Bytes Read=82
13/01/26 08:58:59 INFO mapred.JobClient: Map-Reduce Framework
13/01/26 08:58:59 INFO mapred.JobClient: Map output materialized bytes=325
13/01/26 08:58:59 INFO mapred.JobClient: Map input records=2
13/01/26 08:58:59 INFO mapred.JobClient: Reduce shuffle bytes=0
13/01/26 08:58:59 INFO mapred.JobClient: Spilled Records=24
13/01/26 08:58:59 INFO mapred.JobClient: Map output bytes=289
13/01/26 08:58:59 INFO mapred.JobClient: Total committed heap usage (bytes)=667418624
13/01/26 08:58:59 INFO mapred.JobClient: CPU time spent (ms)=0
13/01/26 08:58:59 INFO mapred.JobClient: SPLIT_RAW_BYTES=262
13/01/26 08:58:59 INFO mapred.JobClient: Combine input records=0
13/01/26 08:58:59 INFO mapred.JobClient: Reduce input records=12
13/01/26 08:58:59 INFO mapred.JobClient: Reduce input groups=2
13/01/26 08:58:59 INFO mapred.JobClient: Combine output records=0
13/01/26 08:58:59 INFO mapred.JobClient: Physical memory (bytes) snapshot=0
13/01/26 08:58:59 INFO mapred.JobClient: Reduce output records=2
13/01/26 08:58:59 INFO mapred.JobClient: Virtual memory (bytes) snapshot=0
13/01/26 08:58:59 INFO mapred.JobClient: Map output records=12
Previous MapReduce result, line 1: (20.6,32.0) (10,30) (34,40) (10,30) (34,40) (15,20) (20.6,32.0)
line 2: (3.4285715,3.857143) (1,1) (9,9) (2,3) (4,4) (5,6) (2,3) (1,1) (3.4285715,3.857143)
Center 0 squared shift: 0.0
Center 1 squared shift: 0.0
New centers: (20.6,32.0) (3.4285715,3.857143)
Iterator: 2
CenterInitial.java (initial center selection)
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CenterInitial {
    public void run(String[] args) throws IOException {
        String[] clist;                     // all data points
        int k = 2;                          // number of centers to pick
        String string = "";                 // the chosen centers, joined into one string
        String inpath = args[0] + "/4.txt"; // the data set file (the walkthrough stores it in 2.txt; adjust the name to match yours)
        String outpath = args[1] + "/input2/3.txt"; // the chosen centers are written to 3.txt
        Configuration conf1 = new Configuration();  // HDFS configuration
        conf1.set("hadoop.job.ugi", "hadoop,hadoop");
        FileSystem fs = FileSystem.get(URI.create(inpath), conf1); // FileSystem is the entry point for HDFS operations
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(inpath));
            IOUtils.copyBytes(in, out, 50, false); // copy the file contents into the byte buffer
            clist = out.toString().split(" ");     // split on spaces into individual points
        } finally {
            IOUtils.closeStream(in);
        }
        FileSystem filesystem = FileSystem.get(URI.create(outpath), conf1);
        for (int i = 0; i < k; i++) {
            int j = (int) (Math.random() * 100) % clist.length; // random index in [0, clist.length)
            if (string.contains(clist[j])) { // already picked this point: extend the loop and try again
                k++;
                continue;
            }
            string = string + clist[j].replace(" ", "") + " "; // append the chosen center
        }
        OutputStream out2 = filesystem.create(new Path(outpath));
        IOUtils.copyBytes(new ByteArrayInputStream(string.getBytes()), out2, 4096, true); // write the centers to 3.txt
        System.out.println("Initial centers: " + string);
    }
}
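One quirk above: when a duplicate point is drawn, the loop retries by incrementing k, which terminates only probabilistically. A hedged alternative sketch (CenterSampler is a made-up helper, not in the original post) draws k distinct points by shuffling the indices once:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class CenterSampler {
    // Return k distinct points from clist, space-separated,
    // by shuffling the index list instead of retrying on duplicates.
    static String sample(String[] clist, int k) {
        List<Integer> idx = new ArrayList<Integer>();
        for (int i = 0; i < clist.length; i++) idx.add(i);
        Collections.shuffle(idx); // uniform random permutation
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < Math.min(k, clist.length); i++)
            sb.append(clist[idx.get(i)]).append(" ");
        return sb.toString();
    }
}

Note this assumes the data points themselves are distinct; like the original, it would need a value-level duplicate check otherwise.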
KMapper.java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class KMapper extends Mapper<LongWritable, Text, Text, Text> {
    private String[] center;

    // Read the current centers from 3.txt into center[]
    protected void setup(Context context) throws IOException, InterruptedException {
        String centerlist = "hdfs://localhost:9000/home/administrator/hadoop/kmeans/input2/3.txt"; // centers file
        Configuration conf1 = new Configuration();
        conf1.set("hadoop.job.ugi", "hadoop-user,hadoop-user");
        FileSystem fs = FileSystem.get(URI.create(centerlist), conf1);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(centerlist));
            IOUtils.copyBytes(in, out, 100, false);
            center = out.toString().split(" ");
        } finally {
            IOUtils.closeStream(in);
        }
    }

    // Each input line comes from the data files (2.txt etc.); assign every point to its nearest center
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString()); // points are space-separated
        while (itr.hasMoreTokens()) {
            String outValue = new String(itr.nextToken()); // one point, e.g. "(2,3)"
            String[] list = outValue.replace("(", "").replace(")", "").split(",");
            String[] c = center[0].replace("(", "").replace(")", "").split(",");
            // the distance to the first center initializes the minimum
            float min = 0;
            int pos = 0;
            for (int i = 0; i < list.length; i++) {
                System.out.println(i + "list:" + list[i]);
                System.out.println(i + "c:" + c[i]);
                min += (float) Math.pow(Float.parseFloat(list[i]) - Float.parseFloat(c[i]), 2); // squared Euclidean distance (no square root taken)
            }
            // compare against every center and keep the nearest one
            for (int i = 0; i < center.length; i++) {
                String[] centerStrings = center[i].replace("(", "").replace(")", "").split(",");
                float distance = 0;
                for (int j = 0; j < list.length; j++)
                    distance += (float) Math.pow(Float.parseFloat(list[j]) - Float.parseFloat(centerStrings[j]), 2);
                if (min > distance) {
                    min = distance;
                    pos = i;
                }
            }
            context.write(new Text(center[pos]), new Text(outValue)); // emit <nearest center, point>
            System.out.println("Center " + center[pos] + " assigned point " + outValue);
            System.out.println("Mapper output: " + center[pos] + " " + outValue);
        }
    }
}
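The assignment logic can be sanity-checked without Hadoop. Here is a hedged standalone re-implementation (NearestCenterDemo is invented for illustration); with the initial centers (10,30) and (2,3) and the points from 2.txt, it reproduces exactly the assignments seen in the first map phase of the trace:

public class NearestCenterDemo {
    // Squared Euclidean distance, matching KMapper (no square root taken)
    static float dist2(float[] a, float[] b) {
        float d = 0;
        for (int i = 0; i < a.length; i++)
            d += (a[i] - b[i]) * (a[i] - b[i]);
        return d;
    }

    public static void main(String[] args) {
        float[][] centers = {{10, 30}, {2, 3}};
        float[][] points = {{1, 1}, {9, 9}, {2, 3}, {10, 30}, {4, 4}, {34, 40}, {5, 6}, {15, 20}};
        for (float[] p : points) {
            int pos = 0; // index of the nearest center found so far
            for (int i = 1; i < centers.length; i++)
                if (dist2(p, centers[i]) < dist2(p, centers[pos]))
                    pos = i;
            System.out.println("(" + p[0] + "," + p[1] + ") -> ("
                    + centers[pos][0] + "," + centers[pos][1] + ")");
        }
    }
}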
KReducer.java
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class KReducer extends Reducer<Text, Text, Text, Text> {
    // Input: <center, all points assigned to it>; output: those points plus their new mean center
    public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
        String outVal = "";
        int count = 0;
        String center = "";
        System.out.println("Reduce pass");
        System.out.println("Reduce key: " + key.toString());
        int length = key.toString().replace("(", "").replace(")", "").replace(":", "").split(",").length; // dimensionality of the points
        float[] ave = new float[length]; // per-dimension coordinate sums (the original allocated Float.SIZE*length slots, far more than needed)
        for (int i = 0; i < length; i++)
            ave[i] = 0;
        for (Text val : value) {
            System.out.println("val:" + val.toString());
            System.out.println("values:" + value.toString());
            outVal += val.toString() + " ";
            String[] tmp = val.toString().replace("(", "").replace(")", "").split(",");
            System.out.println("temlength:" + tmp.length);
            for (int i = 0; i < tmp.length; i++)
                ave[i] += Float.parseFloat(tmp[i]); // accumulate coordinate sums
            count++;
        }
        System.out.println("count:" + count);
        System.out.println("outVal:" + outVal + "/outVal");
        for (int i = 0; i < length; i++)
            System.out.println("ave[" + i + "]=" + ave[i]); // still the raw sums at this point
        // divide the sums by the count to get the new center, formatted as "(x,y)"
        for (int i = 0; i < length; i++) {
            ave[i] = ave[i] / count;
            if (i == 0)
                center += "(" + ave[i] + ",";
            else if (i == length - 1)
                center += ave[i] + ")";
            else
                center += ave[i] + ",";
        }
        System.out.println("Writing to part file: " + key + " " + outVal + " " + center);
        context.write(key, new Text(outVal + center));
    }
}
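As a check against the trace: for the cluster keyed by (2,3), the reducer receives (1,1) (9,9) (2,3) (4,4) (5,6) (2,3) (1,1), so the coordinate sums are ave[0] = 1+9+2+4+5+2+1 = 24 and ave[1] = 1+9+3+4+6+3+1 = 27 with count = 7, and the new center is (24/7, 27/7) = (3.4285715, 3.857143), exactly the values in the log above.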
NewCenter.java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class NewCenter {
    int k = 2;
    float shold = 0; // largest squared center shift seen so far (the original initialized this to Integer.MIN_VALUE; 0 gives the same result since shifts are non-negative)
    String[] line;
    String newcenter = new String("");

    public float run(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("hadoop.job.ugi", "hadoop,hadoop");
        FileSystem fs = FileSystem.get(URI.create(args[2] + "/part-r-00000"), conf);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(args[2] + "/part-r-00000"));
            IOUtils.copyBytes(in, out, 50, false);
            line = out.toString().split("\n"); // one reducer output line per cluster
        } finally {
            IOUtils.closeStream(in);
        }
        System.out.println("Previous MapReduce result, line 1: " + line[0]);
        System.out.println("line 2: " + line[1]);
        for (int i = 0; i < k; i++) {
            String[] l = line[i].replace("\t", " ").split(" "); // the key and value are tab-separated; normalize the tab to a space
            String[] startCenter = l[0].replace("(", "").replace(")", "").split(",");
            // l[0] is the old center the line was keyed by, e.g. (10,30)
            String[] finalCenter = l[l.length - 1].replace("(", "").replace(")", "").split(",");
            // the last field is the newly computed center, e.g. (20.6,32.0)
            float tmp = 0;
            for (int j = 0; j < startCenter.length; j++)
                tmp += Math.pow(Float.parseFloat(startCenter[j]) - Float.parseFloat(finalCenter[j]), 2);
            // squared Euclidean distance between the old and the new center
            newcenter = newcenter + l[l.length - 1].replace("\t", "") + " ";
            if (shold <= tmp)
                shold = tmp; // keep the largest shift
            System.out.println("Center " + i + " squared shift: " + tmp);
        }
        System.out.println("New centers: " + newcenter);
        // Write the new centers back to the same 3.txt the mapper reads in setup().
        // (The original wrote to args[1]+"/center/3.txt", a path the mapper never reads,
        // which would leave the centers stale; the paths must match for the iteration to work.)
        OutputStream out2 = fs.create(new Path(args[1] + "/input2/3.txt"));
        IOUtils.copyBytes(new ByteArrayInputStream(newcenter.getBytes()), out2, 4096, true);
        return shold;
    }
}
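A final check of the convergence test: between the old center (10,30) and the new one (20.6,32.0), the squared shift is (10-20.6)² + (30-32)² = 112.36 + 4 = 116.36, matching the 116.36001 in the trace up to float rounding; for the other cluster it is (2-3.4285715)² + (3-3.857143)² ≈ 2.0408 + 0.7347 ≈ 2.7755. In the second iteration the centers do not move, both shifts are 0.0 ≤ 0.1, so the driver loop stops after printing Iterator: 2.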