一、以下是测试数据:
"CITING","CITED"
3858241,956203
3858241,1324234
3858241,3398406
3858241,3557384
3858241,3634889
3858242,1515701
3858242,3319261
3858242,3668705
3858242,3707004
3858243,2949611
3858243,3146465
3858243,3156927
3858243,3221341
3858243,3574238
3858243,3681785
3858243,3684611
3858244,14040
3858244,17445
3858245,17445
注:第一列(CITING)是引用方的专利号,第二列(CITED)是被其引用的专利号。
二、Hadoop 代码如下:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * MapReduce job that inverts a patent citation list.
 *
 * <p>Input lines are read through {@link KeyValueTextInputFormat} with {@code ","}
 * configured as the key/value separator, so a line such as {@code 3858241,956203}
 * reaches the mapper as key {@code 3858241} and value {@code 956203}. The mapper swaps
 * the pair and the reducer joins every value received for a key into one
 * comma-separated string.
 *
 * <p>No header filtering is performed: a header line in the input is swapped and
 * emitted like any other record.
 *
 * <p>Usage: {@code PatentCitations [<input path> <output path>]}. When fewer than two
 * paths are given, the built-in default paths are used.
 */
public class PatentCitations extends Configured implements Tool {

    /** Input path used when no paths are supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/patent/test/input/file1.txt";

    /** Output path used when no paths are supplied on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "/patent/test/output";

    /** Name under which the job is submitted. */
    private static final String JOB_NAME = "patentcitations";

    /** Separator between the two columns of an input line, also used to join reducer output. */
    private static final String FIELD_SEPARATOR = ",";

    /**
     * Swaps each incoming pair so that the input value becomes the output key
     * and the input key becomes the output value.
     */
    public static class PatentCitationsMapper extends Mapper<Text, Text, Text, Text> {

        /**
         * Emits {@code (value, key)} for the incoming {@code (key, value)} record.
         *
         * @param key first column of the input line
         * @param value second column of the input line
         * @param context task context used to emit the swapped pair
         * @throws IOException if the framework fails to write the record
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, key);
        }
    }

    /**
     * Joins all values grouped under one key into a single comma-separated string.
     */
    public static class PatentCitationsReduces extends Reducer<Text, Text, Text, Text> {

        // Reused across reduce() calls to avoid allocating a Text per key. Kept as an
        // instance field (not static) so reducer instances never share mutable state.
        private final Text joined = new Text();

        /**
         * Writes {@code key} together with its values joined by {@code ","}, in the
         * order the iterable supplies them.
         *
         * @param key the grouping key
         * @param values all values emitted by the mappers for {@code key}
         * @param context task context used to emit the joined record
         * @throws IOException if the framework fails to write the record
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                if (sb.length() > 0) {
                    sb.append(FIELD_SEPARATOR);
                }
                sb.append(value.toString());
            }
            joined.set(sb.toString());
            context.write(key, joined);
        }
    }

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args optional {@code <input path> <output path>}; when fewer than two
     *     arguments are given the default paths are used
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        // Must be set before the Job is created so the job is built from a
        // configuration that already carries the separator.
        conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, FIELD_SEPARATOR);

        Job job = Job.getInstance(conf, JOB_NAME);
        job.setJarByClass(getClass());

        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(PatentCitationsMapper.class);
        job.setReducerClass(PatentCitationsReduces.class);

        // Prefer paths from the command line; fall back to the defaults so that
        // running without arguments behaves as before.
        boolean pathsGiven = args != null && args.length >= 2;
        String inputPath = pathsGiven ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = pathsGiven ? args[1] : DEFAULT_OUTPUT_PATH;
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    /**
     * Command-line entry point: runs the tool through {@link ToolRunner} and exits
     * the JVM with the code returned by {@link #run(String[])}.
     *
     * @param args command-line arguments forwarded to {@link ToolRunner}
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        int result = ToolRunner.run(new PatentCitations(), args);
        System.exit(result);
    }
}
三、执行结果如下:
"CITED" "CITING"
1324234 3858241
14040 3858244
1515701 3858242
17445 3858245,3858244
2949611 3858243
3146465 3858243
3156927 3858243
3221341 3858243
3319261 3858242
3398406 3858241
3557384 3858241
3574238 3858243
3634889 3858241
3668705 3858242
3681785 3858243
3684611 3858243
3707004 3858242
956203 3858241
注:17445 分别被 3858245,3858244 所引用。