Custom-sorting delimiter-separated data by a chosen column.
Original data:
hadoop@sh-hadoop:/home/hadoop/blb$ more sourText.txt
hadoop|234|2346|sdfasdgadfgdfg
spark|534|65745|fhsdfghdfgh
hive|65|6585|shsfghfgh
hbase|98|456|jhgjdfghj
tachyon|345|567|sfhrtyhert
kafka|455|567|dghrtyh
storm|86|345|dgsdfg
redis|45|56|ergerg
sqoop|45|765|fghd
flume|34|67|sdfgrty
oozie|23|45|adfgdfg
pig|54|456|dfg
zookeeper|23|543|dfgd
solr|75|54|ertgergt
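Each record has four pipe-delimited fields; going by the variable names in the appendix code, they are a name, a uv count, a pv count, and a category string. One gotcha when parsing such lines in Java: String.split takes a regular expression, and | is a regex metacharacter, so it must be escaped. A minimal sketch (the class name ParseRecord is illustrative):

public class ParseRecord {
    public static void main(String[] args) {
        String line = "hadoop|234|2346|sdfasdgadfgdfg";
        // '|' is a regex alternation operator, so it has to be escaped for split().
        String[] f = line.split("\\|");
        System.out.println(f[0] + " uv=" + f[1] + " pv=" + f[2] + " category=" + f[3]);
    }
}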
1. Sorting with MapReduce, descending by the 2nd column (the full implementation is in the appendix):
hadoop@sh-hadoop:/home/hadoop/blb$ hdfs dfs -text /user/hadoop/libin/input/sourText.txt | wc -l
14
hadoop@sh-hadoop:/home/hadoop/blb$ hdfs dfs -text /user/hadoop/libin/Domain800_level2/merge1/out1/* | wc -l
14
hadoop@sh-hadoop:/home/hadoop/blb$ hdfs dfs -text /user/hadoop/libin/Domain800_level2/merge1/out1/* | more
spark|534|65745|fhsdfghdfgh
kafka|455|567|dghrtyh
tachyon|345|567|sfhrtyhert
hadoop|234|2346|sdfasdgadfgdfg
hbase|98|456|jhgjdfghj
storm|86|345|dgsdfg
solr|75|54|ertgergt
hive|65|6585|shsfghfgh
pig|54|456|dfg
redis|45|56|ergerg
sqoop|45|765|fghd
flume|34|67|sdfgrty
oozie|23|45|adfgdfg
zookeeper|23|543|dfgd
hadoop@sh-hadoop:/home/hadoop/blb$
2. Sorting with the shell sort command:
-r: sort orders ascending by default; add -r to reverse it to descending.
-n: compare by numeric value rather than character by character (lexicographically, "65" would sort after "534").
-t: set the field separator, here the pipe character.
-k: once the separator is set, -k selects the key column. Note that -k2 defines a key running from field 2 to the end of the line; to sort on field 2 alone, write -k2,2.
2.1. Descending sort by the 2nd column:
sort -t "|" -nrk2 sourText.txt
hadoop@sh-hadoop:/home/hadoop/blb$ sort -t "|" -nrk2 sourText.txt
spark|534|65745|fhsdfghdfgh
kafka|455|567|dghrtyh
tachyon|345|567|sfhrtyhert
hadoop|234|2346|sdfasdgadfgdfg
hbase|98|456|jhgjdfghj
storm|86|345|dgsdfg
solr|75|54|ertgergt
hive|65|6585|shsfghfgh
pig|54|456|dfg
sqoop|45|765|fghd
redis|45|56|ergerg
flume|34|67|sdfgrty
zookeeper|23|543|dfgd
oozie|23|45|adfgdfg
2.2. Descending sort by the 3rd column:
hadoop@sh-hadoop:/home/hadoop/blb$ sort -t "|" -nrk3 sourText.txt
spark|534|65745|fhsdfghdfgh
hive|65|6585|shsfghfgh
hadoop|234|2346|sdfasdgadfgdfg
sqoop|45|765|fghd
tachyon|345|567|sfhrtyhert
kafka|455|567|dghrtyh
zookeeper|23|543|dfgd
pig|54|456|dfg
hbase|98|456|jhgjdfghj
storm|86|345|dgsdfg
flume|34|67|sdfgrty
redis|45|56|ergerg
solr|75|54|ertgergt
oozie|23|45|adfgdfg
Redirect the sorted output into a new file (redirect directly; there is no need to page through more first):
sort -t "|" -nrk2 part-r-00000 > merge.txt
Appendix:
MapReduce implementation:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Domain_merge {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: Domain_merge <input> <output>");
            System.exit(2);
        }
        Job job4 = Job.getInstance(conf, Domain_merge.class.getSimpleName());
        job4.setJarByClass(Domain_merge.class);
        job4.setMapOutputKeyClass(Toptaobao500.class);
        job4.setMapOutputValueClass(Text.class);
        job4.setOutputKeyClass(Text.class);
        job4.setOutputValueClass(NullWritable.class);
        job4.setMapperClass(MyMapper2.class);
        // A single reducer yields one globally sorted output file.
        job4.setNumReduceTasks(1);
        job4.setReducerClass(MyReducer2.class);
        job4.setInputFormatClass(TextInputFormat.class);
        job4.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job4, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job4, new Path(otherArgs[1]));
        job4.waitForCompletion(true);
    }

    /**
     * Sorting job: the mapper wraps each line in a composite key whose
     * compareTo orders by the numeric value of column 2, descending,
     * so the shuffle itself performs the sort.
     */
    public static class MyMapper2 extends Mapper<LongWritable, Text, Toptaobao500, Text> {
        Toptaobao500 mw = new Toptaobao500();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Toptaobao500, Text>.Context context)
                throws IOException, InterruptedException {
            // '|' is a regex metacharacter, so it must be escaped for split().
            String[] spl = value.toString().split("\\|");
            String trait = spl[0].trim();
            String uv = spl[1].trim();
            String pv = spl[2].trim();
            String fenlei = spl[3].trim();
            mw.setKind(trait + "|" + uv + "|" + pv + "|" + fenlei);
            mw.setCount(Long.parseLong(uv));
            context.write(mw, new Text(value));
        }
    }

    /** The reducer just unwraps the values: keys arrive already sorted. */
    public static class MyReducer2 extends Reducer<Toptaobao500, Text, Text, NullWritable> {
        @Override
        protected void reduce(Toptaobao500 k4, Iterable<Text> v4s,
                Reducer<Toptaobao500, Text, Text, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // Keys with equal counts are grouped; write every original line back out.
            for (Text v4 : v4s) {
                context.write(v4, NullWritable.get());
            }
        }
    }

    /** Composite key: the original line plus the numeric sort column. */
    public static class Toptaobao500 implements WritableComparable<Toptaobao500> {
        String kind;
        Long count;

        public Toptaobao500() {
        }

        public Toptaobao500(String kind, Long count) {
            this.kind = kind;
            this.count = count;
        }

        public void setKind(String kind) {
            this.kind = kind;
        }

        public void setCount(Long l) {
            this.count = l;
        }

        public String getKind() {
            return this.kind;
        }

        public Long getCount() {
            return this.count;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(kind);
            out.writeLong(count);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.kind = in.readUTF();
            this.count = in.readLong();
        }

        @Override
        public int compareTo(Toptaobao500 o) {
            // Reverse the natural order so larger counts sort first;
            // Long.compare avoids the overflow risk of subtracting longs.
            return Long.compare(o.count, this.count);
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) return true;
            if (!(obj instanceof Toptaobao500)) return false;
            Toptaobao500 other = (Toptaobao500) obj;
            return Objects.equals(kind, other.kind) && Objects.equals(count, other.count);
        }

        @Override
        public int hashCode() {
            return Objects.hash(kind, count);
        }

        @Override
        public String toString() {
            return this.kind;
        }
    }
}
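As a quick sanity check that the key really inverts the order, a hypothetical standalone snippet (it assumes Domain_merge is on the classpath):

public class CompareCheck {
    public static void main(String[] args) {
        Domain_merge.Toptaobao500 a =
                new Domain_merge.Toptaobao500("spark|534|65745|fhsdfghdfgh", 534L);
        Domain_merge.Toptaobao500 b =
                new Domain_merge.Toptaobao500("hive|65|6585|shsfghfgh", 65L);
        // Prints a negative number: a (the larger count) orders before b.
        System.out.println(a.compareTo(b));
    }
}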