NLPIR+Hadoop

最新推荐文章于 2021-12-13 21:12:18 发布

monkey131499

最新推荐文章于 2021-12-13 21:12:18 发布

阅读量1.2k

点赞数 1

本文链接：https://blog.csdn.net/monkey131499/article/details/53081572

版权

Java 同时被 2 个专栏收录

7 篇文章 0 订阅

订阅专栏

Hadoop

3 篇文章 0 订阅

订阅专栏

最近在学习过程中，需要对文本进行分词，而且数据量比较大，在 Windows上使用NLPIR处理小文件基本上没有问题（可以看这里），看NLPIR的开发文档是支持分布式的，因而考虑在Linux上实现hadoop+NLPIR对大量文本数据进行分词和标注。这个过程让我经历了焦头烂额，所以记录下来，便于自己查看，也可以帮助有需要的伙伴~

1.下载NLPIR

NLPIR原名 ICTCLAS，下载点这里，我下载的是2016-10-9发布的NLPIR2016，同时支持Windows和Linux，且有Java/C/C++/C# 多种语言，我这里使用的是Java语言。项目中需要两个文件：一是Data文件夹下的所有内容，二是libNLPIR.so文件

2.Hadoop+NLPIR 代码

NLPIR配置

package com.katoa.segment;

import com.katoa.util.CLibrary;
import com.sun.jna.Native;

public class NLPIR {

	CLibrary Instance = (CLibrary) Native.loadLibrary("<span style="color:#FF0000;">/usr/local/workspace/NLPIR2015/libNLPIR.so</span>", CLibrary.class);

	private boolean initFlag = false;

	public boolean init() {
		System.out.println("jna.library.path");
		String argu = "/usr/local/workspace/NLPIR2015/";
		// String system_charset = "GBK";//GBK----0
		int charset_type = 1; //UTF-8

		int init_flag = Instance.NLPIR_Init(argu, charset_type, "0");
		String nativeBytes = null;

		if (0 == init_flag) {
			nativeBytes = Instance.NLPIR_GetLastErrorMsg();
			System.err.println("初始化失败！fail reason is " + nativeBytes);
			return false;
		}
initFlag = true;
		Instance.NLPIR_SetPOSmap(1); // 计算所一级标注
		return true;
	}
	public boolean unInit() {
		try {
			Instance.NLPIR_Exit();
		} catch (Exception e) {
			System.out.println(e);
			return false;
		}
		initFlag = false;
		return true;
	}
	public CLibrary getInstance() {
		return Instance;
	}
	public boolean isInitFlag() {
		return initFlag;
	}
	
        /**
	 * 分词，1表示标注，0表示不标注
	 * 
	 * @param context
	 *            字符串
	 * @param bPOSTagged
	 *            是否标注
	 * @return String
	 */
	public String segment(String context, int bPOSTagged) {
		// 分词和标注处理，1表示标注，0表示不标注
		String result = Instance.NLPIR_ParagraphProcess(context, bPOSTagged);
		return result;
	}

	/**
	 * 分词
	 * 
	 * @param context
	 *            字符串
	 * @param reduce
	 *            是否抽取
	 * @return String
	 */
	public String segment(String context, boolean reduce) {
		// 分词和标注处理，1表示标注，0表示不标注
		String result = Instance.NLPIR_ParagraphProcess(context, 1);
		String line = "";
		if (reduce) {
			String[] words = result.split(" ");
			for (String word : words) {
				word = word.replaceAll(" ", "");
				int index = word.lastIndexOf("/");
				if (word.substring(index + 1).equals("n") || word.substring(index + 1).equals("v")
						|| word.substring(index + 1).equals("a") || word.substring(index + 1).equals("ad")
						|| word.substring(index + 1).equals("d") || word.substring(index + 1).equals("o")
						|| word.substring(index + 1).equals("other") || word.substring(index + 1).equals("xm")) {

					if (!word.substring(0, index).equals("")) {
						line = line + word.substring(0, index) + " ";
					}
				}
			}
		}
		return line;
	}
}

Map 端

package com.katoa.tuple;

/**
 * @author 作者 : monkey
 * @version 创建时间：2016年10月9日 下午8:12:15
 *          类说明:语料预处理 mapper
 */
import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.katoa.segment.NLPIR;

public class FilterMapper extends Mapper<LongWritable, Text, Text, Text> {
	private static NLPIR nlpir = new NLPIR();

	<span style="color:#FF0000;">protected void setup(Context context) {
		nlpir.init();
	}</span>

	@Override
	public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		ArrayList<String> phraselist = new ArrayList<String>();
	
		String id = null;
		String v = null;

		String temp = value.toString();

		String[] contents = temp.split("\t");
		id= contents[0];
		v= contents[1];
		
		if (v!= null) {
		        String str = posTagging(v);

			context.write(new Text(id), new Text(str));
			}

		}
	}

	<span style="color:#FF0000;">protected void cleanup(Context context) {
		nlpir.unInit();
	}</span>

	/**
	 * POS Tagging
	 * 
	 * @param text
	 *            The word to be marked
	 * @return String
	 */
	public static String posTagging(String text) {
		// text = Main.getNlpir().segment(text, 1);
		text = nlpir.segment(text, 1);
		String[] words = text.split(" ");
		String temp = "";
		int i = 0;
		for (String word : words) {
			if (!word.equals("")) { // Remove the extra spaces
				if (i == words.length - 1) {
					temp = temp + word;
				} else {
					temp = temp + word + " ";
				}
			}

			i++;
		}
		return temp;
	}
}

将NLPIR的Data文件夹和libNLPIR.so文件放到/usr/local/workspace/NLPIR2015下（路径根据自己的设置），然后使用fatjar将程序打成jar包并放到/usr/local/workspace目录下，使用hadoop jar 命令运行程序。

常见错误：

java.lang.UnsatisfiedLinkError: Unable to load library 'win64/NLPIR': Native library (linux-x86-64/libwin64/NLPIR.so) not found in resource path

或

java.lang.UnsatisfiedLinkError: Unable to load library 'libNLPIR.so': Can't obtain InputStream for linux-x86-64/libNLPIR.so

原因：Native.loadLibrary 加载的路径不正确，或没有使用libNLPIR.so，而是Windows下的NLPIR文件。

修改后建议重启Hadoop集群。

Exception from container-launch: ExitCodeException exitCode=134: /bin/bash: line 1: 41729 已放弃

原因：MR程序问题，对于要加载其他配置文件的，如这里的libNLPIR.so文件，应该在MR中进行初始化，而不是在主程序中。这里用在mapper端，因此需要在mapper端使用setup()方法进行加载配置文件初始化，map做完任务后使用cleanup()方法结束。