（5）使用Lucene、LingPipe做实体链接（Entity Linking）——使用Lucene构建歧义实体映射index、歧义实体上下文index

最新推荐文章于 2024-04-05 17:33:44 发布

mmc2015

最新推荐文章于 2024-04-05 17:33:44 发布

阅读量1k

点赞数 1

分类专栏：实体链接（entity linking）文章标签： Lucene LingPipe 实体链接 entity linking

本文链接：https://blog.csdn.net/mmc2015/article/details/50382646

版权

实体链接（entity linking）专栏收录该内容

8 篇文章 4 订阅

订阅专栏

使用Lucene构建歧义实体映射index：

/*
 * We need to run this file as a "Java Application" before this system is built.
 */

//http://lucene.apache.org/core/5_3_1/demo/overview-summary.html#overview_description
/**
 * Builds the Lucene index that maps an entity name to its ambiguous
 * candidate entities.
 *
 * <p>Each input line after the header line has the form
 * {@code entity=>ambiguationEntitys}. Every such line becomes one Lucene
 * document with two stored, un-analyzed fields: {@code entity} and
 * {@code ambiguationEntitys}, both lower-cased.
 */
public class LuceneEntityAmbiguationIndex {

	/**
	 * Entry point: indexes the preprocessed disambiguation file into the
	 * index directory. Exits with status 1 when the input is not readable.
	 *
	 * @param args unused
	 * @throws Exception if the index cannot be opened or written
	 */
	public static void main(String[] args) throws Exception
	{
		boolean create=true;
		//String docsPath="E:\\LuceneDocument\\disambiguations_preprocessing(file_contents_examples).txt";
		String docsPath="E:\\LuceneDocument\\disambiguations_preprocessing.txt";
		String indexPath="E:\\LuceneEntityAmbiguationIndex";
		final Path docDir = Paths.get(docsPath);
		if (!Files.isReadable(docDir))
		{
			System.out.println("Document directory '" +docDir.toAbsolutePath()+ "' does not exist or is not readable");
			System.exit(1);
		}

		// The Analyzer tokenizes field content before it is indexed; it only
		// affects analyzed fields, the StringFields used below are indexed verbatim.
		Analyzer analyzer = new StandardAnalyzer();
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
		// The open mode must be set BEFORE the IndexWriter is constructed: the
		// writer opens (or recreates) the index in its constructor, so setting
		// the mode afterwards would leave an existing index in place.
		indexWriterConfig.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

		// FSDirectory keeps the index on the file system. The IndexWriter is the
		// core class that adds Document objects to the index. Both are closed
		// (and the index committed) when the try block exits, even on failure.
		try (Directory directory = FSDirectory.open(Paths.get(indexPath));
				IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig))
		{
			IndexDocs(indexWriter, docDir);
		}
	}

	/**
	 * Indexes a single file, or every regular file found below a directory.
	 *
	 * @param writer open index writer; it is not closed by this method
	 * @param path   file or directory to index
	 * @throws IOException if walking the directory fails, or if {@code path}
	 *                     is a single file that cannot be indexed
	 */
	static void IndexDocs(final IndexWriter writer, Path path) throws IOException
	{
		if(Files.isDirectory(path))
		{
			Files.walkFileTree(path, new SimpleFileVisitor<Path>(){
				@Override
				public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException
				{
					try
					{
						IndexDoc(writer, file);
					}
					catch(IOException e)
					{
						// Best effort: keep walking, but report the file that was skipped
						// instead of dropping the failure silently.
						System.err.println("skipping file that could not be indexed: "+file+" ("+e+")");
					}
					return FileVisitResult.CONTINUE;
				}
			});
		}
		else
		{
			IndexDoc(writer, path);
		}
	}

	/**
	 * Reads {@code file} line by line and adds one document per
	 * {@code entity=>ambiguationEntitys} line. The first line of the file is
	 * skipped. Lines without a {@code =>} separator, or with an empty entity
	 * or value, are reported on stderr and skipped.
	 *
	 * <p>The writer is left open so that the caller can index further files
	 * with it; the caller is responsible for closing it.
	 *
	 * @param writer open index writer
	 * @param file   text file to index
	 * @throws IOException if the file cannot be read or the index cannot be written
	 */
	public static void IndexDoc(IndexWriter writer, Path file) throws IOException
	{
		// NOTE(review): FileReader decodes with the platform default charset, as
		// before; confirm the encoding of the input file and make it explicit.
		try(BufferedReader br=new BufferedReader(new FileReader(file.toString())))
		{
			final boolean createMode=writer.getConfig().getOpenMode()==OpenMode.CREATE;
			int lineNo=1;
			String line=br.readLine(); //skip the first line
			while ((line=br.readLine())!=null) {
				lineNo++;
				// Limit 2: everything after the first "=>" belongs to the value.
				String[] parts=line.split("=>", 2);
				if(parts.length<2 || parts[0].isEmpty() || parts[1].isEmpty())
				{
					System.err.println("skipping malformed line "+lineNo+" in "+file);
					continue;
				}
				String entity=parts[0];
				String ambiguationEntitys=parts[1];
				// Index content is stored lower-cased; Locale.ROOT keeps the result
				// independent of the JVM's default locale.
				String entityKey=entity.toLowerCase(java.util.Locale.ROOT);

				// A Document is one record of the index and is made of Fields, much
				// like a database row is made of columns.
				Document document=new Document();
				document.add(new StringField("entity", entityKey, Field.Store.YES));
				document.add(new StringField("ambiguationEntitys", ambiguationEntitys.toLowerCase(java.util.Locale.ROOT), Field.Store.YES));

				if(createMode)
				{
					//New index, so we just add the document (no old document can be there):
					System.out.println("adding "+entity);
					writer.addDocument(document);
				}
				else
				{
					//Existing index (an old copy of this document may have been indexed) so
					//we use updateDocument instead to replace the old one, if present.
					System.out.println("updating "+entity);
					// A Term is (field name, exact value). The "entity" field is an
					// un-analyzed StringField holding the lower-cased name, so the
					// term must use the lower-cased name too or it never matches.
					writer.updateDocument(new Term("entity", entityKey), document);
				}
			}
			System.out.println("index ambiguationEntitys complete...");
		}
	}
}

使用Lucene构建歧义实体上下文index：

/*
 * We need to run this file as a "Java Application" before this system is built.
 */

/**
 * Builds the Lucene index that maps an entity name to its abstract (the
 * context text of the entity).
 *
 * <p>Each input line after the header line has the form
 * {@code entity=>entityAbstract}. Every such line becomes one Lucene
 * document with two stored, un-analyzed fields: {@code entity} and
 * {@code entityAbstract}, both lower-cased.
 */
public class LuceneEntityAbstractIndex {

	/**
	 * Entry point: indexes the preprocessed long-abstracts file into the
	 * index directory. Exits with status 1 when the input is not readable.
	 *
	 * @param args unused
	 * @throws Exception if the index cannot be opened or written
	 */
	public static void main(String[] args) throws Exception
	{
		boolean create=true;
		//String docsPath="E:\\LuceneDocument\\long_abstracts_preprocessing(file_contents_examples).txt";
		String docsPath="E:\\LuceneDocument\\long_abstracts_preprocessing.txt";
		String indexPath="E:\\LuceneEntityAbstractIndex";
		final Path docDir = Paths.get(docsPath);
		if (!Files.isReadable(docDir))
		{
			System.out.println("Document directory '" +docDir.toAbsolutePath()+ "' does not exist or is not readable");
			System.exit(1);
		}

		// The Analyzer tokenizes field content before it is indexed; it only
		// affects analyzed fields, the StringFields used below are indexed verbatim.
		Analyzer analyzer = new StandardAnalyzer();
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
		// The open mode must be set BEFORE the IndexWriter is constructed: the
		// writer opens (or recreates) the index in its constructor, so setting
		// the mode afterwards would leave an existing index in place.
		indexWriterConfig.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

		// FSDirectory keeps the index on the file system. The IndexWriter is the
		// core class that adds Document objects to the index. Both are closed
		// (and the index committed) when the try block exits, even on failure.
		try (Directory directory = FSDirectory.open(Paths.get(indexPath));
				IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig))
		{
			IndexDocs(indexWriter, docDir);
		}
	}

	/**
	 * Indexes a single file, or every regular file found below a directory.
	 *
	 * @param writer open index writer; it is not closed by this method
	 * @param path   file or directory to index
	 * @throws IOException if walking the directory fails, or if {@code path}
	 *                     is a single file that cannot be indexed
	 */
	static void IndexDocs(final IndexWriter writer, Path path) throws IOException
	{
		if(Files.isDirectory(path))
		{
			Files.walkFileTree(path, new SimpleFileVisitor<Path>(){
				@Override
				public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException
				{
					try
					{
						IndexDoc(writer, file);
					}
					catch(IOException e)
					{
						// Best effort: keep walking, but report the file that was skipped
						// instead of dropping the failure silently.
						System.err.println("skipping file that could not be indexed: "+file+" ("+e+")");
					}
					return FileVisitResult.CONTINUE;
				}
			});
		}
		else
		{
			IndexDoc(writer, path);
		}
	}

	/**
	 * Reads {@code file} line by line and adds one document per
	 * {@code entity=>entityAbstract} line. The first line of the file is
	 * skipped. Lines without a {@code =>} separator, or with an empty entity
	 * or abstract, are reported on stderr and skipped.
	 *
	 * <p>The writer is left open so that the caller can index further files
	 * with it; the caller is responsible for closing it.
	 *
	 * @param writer open index writer
	 * @param file   text file to index
	 * @throws IOException if the file cannot be read or the index cannot be written
	 */
	public static void IndexDoc(IndexWriter writer, Path file) throws IOException
	{
		// NOTE(review): FileReader decodes with the platform default charset, as
		// before; confirm the encoding of the input file and make it explicit.
		try(BufferedReader br=new BufferedReader(new FileReader(file.toString())))
		{
			final boolean createMode=writer.getConfig().getOpenMode()==OpenMode.CREATE;
			int lineNo=1;
			String line=br.readLine(); //skip the first line
			while ((line=br.readLine())!=null) {
				lineNo++;
				// Limit 2: an abstract may itself contain "=>", so everything after
				// the first separator belongs to the abstract.
				String[] parts=line.split("=>", 2);
				if(parts.length<2 || parts[0].isEmpty() || parts[1].isEmpty())
				{
					System.err.println("skipping malformed line "+lineNo+" in "+file);
					continue;
				}
				String entity=parts[0];
				String entityAbstract=parts[1];
				// Index content is stored lower-cased; Locale.ROOT keeps the result
				// independent of the JVM's default locale.
				String entityKey=entity.toLowerCase(java.util.Locale.ROOT);

				// A Document is one record of the index and is made of Fields, much
				// like a database row is made of columns.
				Document document=new Document();
				document.add(new StringField("entity", entityKey, Field.Store.YES));
				// NOTE(review): a StringField indexes the whole abstract as a single
				// term, and Lucene limits the byte length of an indexed term; confirm
				// that no abstract exceeds it, or consider an analyzed/stored-only field.
				document.add(new StringField("entityAbstract", entityAbstract.toLowerCase(java.util.Locale.ROOT), Field.Store.YES));

				if(createMode)
				{
					//New index, so we just add the document (no old document can be there):
					System.out.println("adding "+entity);
					writer.addDocument(document);
				}
				else
				{
					//Existing index (an old copy of this document may have been indexed) so
					//we use updateDocument instead to replace the old one, if present.
					System.out.println("updating "+entity);
					// A Term is (field name, exact value). The "entity" field is an
					// un-analyzed StringField holding the lower-cased name, so the
					// term must use the lower-cased name too or it never matches.
					writer.updateDocument(new Term("entity", entityKey), document);
				}
			}
			System.out.println("index entityAbstract complete...");
		}
	}
}

为什么没有entity本身的索引？？？

因为entity本身的索引是在系统run起来之时才构建的，它使用了LingPipe而不是lucene。上面两段代码算是一个pre-“training”的过程。

使用了LingPipe构建entity索引，并进一步recognize用户输入中的entity的代码参考下一篇。

参考文献：

[1] Mendes, Pablo N, Jakob, Max, García-Silva, Andrés, et al. DBpedia Spotlight: Shedding light on the web of documents[C]// Proceedings of the 7th International Conference on Semantic Systems. ACM, 2011:1-8.

[2] Han X, Sun L. A Generative Entity-Mention Model for Linking Entities with Knowledge Base[J]. Proceedings of ACL, 2011:945-954.

[3] http://lucene.apache.org/

[4] http://alias-i.com/lingpipe/demos/tutorial/ne/read-me.html

[5] http://wiki.dbpedia.org/Downloads2014

[6] http://www.oschina.net/p/jieba（结巴分词）