Since the word segmentation has to run on Hadoop and relies on a custom dictionary, the dictionary file must be readable from within Hadoop.
But how can a map task segment text while processing its data, and how does it get at the dictionary?
Hadoop provides the DistributedCache, a Map/Reduce framework facility that caches files an application needs (text files, archives, JAR files, and so on). The MapReduce framework also predefines the relevant hooks, in particular the setup() and cleanup() methods of the Mapper class.
---- setup()
This method is executed by the MapReduce framework exactly once, before the map task runs, to centralize the initialization of variables and resources.
If that initialization were placed in map() instead, the Mapper would re-initialize the resources for every input line it parses; the duplicated work hurts performance.
---- cleanup()
This method is executed by the MapReduce framework exactly once, after the map task completes, to release variables and resources.
If the release work were placed in map(), the Mapper would release the resources after parsing each line of text and re-initialize them before the next one, again doing redundant work and hurting performance.
So put resource initialization in setup() and resource release in cleanup(), as the sketch below illustrates.
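To make the lifecycle concrete, here is a minimal Mapper skeleton (the class name, field name, and "dict.txt" path are illustrative, not part of the program below):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ResourceMapper extends Mapper<Object, Text, Text, IntWritable> {

    private BufferedReader dictReader;  // an example resource shared by all map() calls

    @Override
    protected void setup(Context context) throws IOException {
        // Runs exactly once, before the first map() call: initialize here.
        dictReader = new BufferedReader(new FileReader("dict.txt"));
    }

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Runs once per input record: only use the resource, never re-open it.
    }

    @Override
    protected void cleanup(Context context) throws IOException {
        // Runs exactly once, after the last map() call: release here.
        dictReader.close();
    }
}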
Following this principle, the dictionary loading in the Mapper is done in setup().
My Hadoop MapReduce program consists of three files:
// Main class
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class CacheMain {

    // Register an HDFS file with the DistributedCache; the "#label" fragment
    // makes it appear as a symlink named <label> in each task's working directory.
    private static void distributeCacheFile(Configuration conf, String path, String label)
            throws URISyntaxException {
        Path filePath = new Path(path);
        String uriWithLink = filePath.toUri().toString() + "#" + label;
        System.out.println("uriWithLink:" + uriWithLink);
        DistributedCache.addCacheFile(new URI(uriWithLink), conf);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        DistributedCache.createSymlink(conf);
        String pathStation = "/output/middle/station_label.txt";
        String labelStation = "station.txt";
        distributeCacheFile(conf, pathStation, labelStation);

        Job job = new Job(conf, "CacheDemo");
        job.setJarByClass(CacheMain.class);
        job.setMapperClass(CacheMapper.class);
        job.setReducerClass(CacheReducer.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        Path outDir = new Path(otherArgs[1]);
        FileSystem fstm = FileSystem.get(conf);
        fstm.delete(outDir, true);  // remove any previous output so the job can rerun
        FileOutputFormat.setOutputPath(job, outDir);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
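A side note: in Hadoop 2.x the DistributedCache class is deprecated; the equivalent calls hang off Job itself, and the symlink is created automatically from the "#label" fragment. A sketch of the replacement for the caching lines above:

Job job = Job.getInstance(conf, "CacheDemo");
// Same effect as createSymlink() + DistributedCache.addCacheFile():
// the file appears as "station.txt" in each task's working directory.
job.addCacheFile(new URI("/output/middle/station_label.txt#station.txt"));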
// Mapper
import java.io.IOException;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.ansj.domain.Term;
import org.ansj.library.UserDefineLibrary;
import org.ansj.splitWord.analysis.DicAnalysis;

public class CacheMapper extends Mapper<Object, Text, Text, IntWritable> {

    private static final Log LOG = LogFactory.getLog(CacheMapper.class);
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    // Load the custom dictionary through the "station.txt" symlink
    // created by the DistributedCache.
    public static void useDistributedCacheBySymbolicLink() throws Exception {
        UserDefineLibrary.loadLibrary(UserDefineLibrary.FOREST, "station.txt");
    }

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        LOG.info("Now, use the distributed cache and symlink");
        try {
            useDistributedCacheBySymbolicLink();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        try {
            // Assumed input layout (the original elided this parsing step):
            // mac \t text, where text is the string to segment.
            String[] linelist = line.split("\t");
            String mac = linelist[0];
            String text = linelist[1];
            // get TV
            List<Term> parse = DicAnalysis.parse(text).getTerms();  // segment the text
            for (int i = 0; i < parse.size(); i++) {
                if (parse.get(i).getNatureStr().contains("station")) {
                    StringBuilder sb = new StringBuilder();
                    sb.append(mac);
                    sb.append('\t').append("TV").append('\t').append("换台");
                    word.set(sb.toString());
                    context.write(word, one);
                }
            }
        } catch (Exception e) {
            LOG.warn("failed to parse line: " + line, e);
        }
    }
}
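Why does loadLibrary() succeed with the bare name "station.txt"? Because createSymlink() plus the "#station.txt" fragment makes the cached HDFS file visible as a local file of that name in each task's working directory. A small standalone probe (a hypothetical class, not part of the program) that reads the same symlink:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class CacheFileProbe {
    public static void main(String[] args) throws IOException {
        // "station.txt" resolves to the local symlink in the working directory.
        try (BufferedReader br = new BufferedReader(new FileReader("station.txt"))) {
            int entries = 0;
            while (br.readLine() != null) {
                entries++;  // each line is one custom dictionary entry
            }
            System.out.println("dictionary entries: " + entries);
        }
    }
}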
// Reducer
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CacheReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts emitted by the mappers for this key.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
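Because the reduce function is a pure associative sum, the same class can double as a combiner to shrink the shuffle; one optional line in main() (not in the original code):

// Optional: pre-aggregate map output locally before the shuffle.
job.setCombinerClass(CacheReducer.class);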
References:
http://dongxicheng.org/mapreduce-nextgen/hadoop-distributedcache-details/
http://blog.csdn.net/a_step_further/article/details/50333961
http://hpuxtbjvip0.blog.163.com/blog/static/3674131320132794940734/
http://www.cnblogs.com/quchunhui/articles/5460860.html