学习笔记:从0开始学习大数据-30. solr通过java导入doc，pdf文档建立全文检索

最新推荐文章于 2024-06-26 16:10:09 发布

领尚

最新推荐文章于 2024-06-26 16:10:09 发布

阅读量971

点赞数

分类专栏：系统集成 hadoop Hadoop 文章标签： solr java docpdf

本文链接：https://blog.csdn.net/oLinBSoft/article/details/85018895

版权

系统集成同时被 3 个专栏收录

149 篇文章 6 订阅

订阅专栏

Hadoop

46 篇文章 7 订阅

订阅专栏

hadoop

45 篇文章 0 订阅

订阅专栏

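1. 在 eclipse 中新建 maven 项目 solr，在 pom.xml 中加入 SolrJ 依赖（版本与 solr 服务端一致，本文为 7.5.0）：

<dependency>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-solrj</artifactId>
    <version>7.5.0</version>
</dependency>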

2. 在项目下新建类 updoctest，代码如下：

package com.linbin.solr;

import java.io.File;
import java.io.IOException;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest.ACTION;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

/**
 * Small SolrJ demo: query a core, delete documents, and upload a rich document
 * (doc, pdf, ...) through the {@code /update/extract} request handler so that
 * its text becomes searchable.
 */
public class updoctest {
	/** Base URL of the Solr core that every method in this class talks to. */
	public static String solrUrl = "http://centos7:8983/solr/mycore";

	/**
	 * Entry point: uploads one sample .doc file. The query and delete demos
	 * ({@link #findIndex1()} / {@link #deleteIndexById()}) can be called here instead.
	 */
	public static void main(String[] args) throws Exception {
		// The Solr document id is simply the file name of the uploaded document.
		String fileName = "/home/linbin/文档/能工巧匠进校园.doc";
		String solrId = "能工巧匠进校园.doc";
		indexFilesSolrCell(solrId, solrId, fileName);
	}

	/**
	 * Query demo: fetches up to 10 documents matching {@code *:*} and prints the
	 * total hit count followed by the id, author and text fields of each document.
	 *
	 * @throws IOException         on communication failure with the Solr server
	 * @throws SolrServerException if the Solr server reports an error
	 */
	public static void findIndex1() throws IOException, SolrServerException {
		// try-with-resources guarantees the HTTP client is released even if the query fails.
		try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrUrl).build()) {
			SolrQuery query = new SolrQuery();
			query.set("q", "*:*"); // match every document
			query.setRows(10);     // page size
			QueryResponse response = solrClient.query(query);
			SolrDocumentList docs = response.getResults();
			long cnt = docs.getNumFound(); // total number of hits, not just this page
			System.out.println("总条数为" + cnt + "条");
			for (SolrDocument doc : docs) {
				System.out.println("-------------\r\n");
				System.out.println("id:" + doc.get("id") + ",author:" + doc.get("author") + ",text:" + doc.get("text"));
			}
		}
	}

	/**
	 * Delete demo: removes the documents matching the query {@code id:solr-word.pdf}
	 * and commits. Alternatives: {@code deleteByQuery("*:*")} removes everything,
	 * {@code deleteById("1")} removes a single document by exact id.
	 *
	 * @throws IOException         on communication failure with the Solr server
	 * @throws SolrServerException if the Solr server reports an error
	 */
	public static void deleteIndexById() throws IOException, SolrServerException {
		// The original never closed the client; close it deterministically here.
		try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrUrl).build()) {
			solrClient.deleteByQuery("id:solr-word.pdf");
			solrClient.commit();
		}
	}

	/**
	 * Upload demo: sends one file to the {@code /update/extract} handler and commits.
	 *
	 * @param fileName file name used only to derive the content type from its extension
	 * @param solrId   value stored in the {@code id} field of the resulting Solr document
	 * @param path     location of the file on the local file system
	 * @throws IOException         if the file cannot be read or the server cannot be reached
	 * @throws SolrServerException if the Solr server reports an error
	 */
	public static void indexFilesSolrCell(String fileName, String solrId, String path)
			throws IOException, SolrServerException {
		try (SolrClient solr = new HttpSolrClient.Builder(solrUrl).build()) {
			ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
			String contentType = getFileContentType(fileName);
			up.addFile(new File(path), contentType);
			// Fix: the id comes from solrId (the original passed fileName and ignored solrId).
			up.setParam("literal.id", solrId);
			// Fields unknown to the schema get this prefix so a catch-all dynamic field can drop them.
			up.setParam("uprefix", "ignored_");
			// Store the extracted body text in the "text" field.
			up.setParam("fmap.content", "text");
			up.setAction(ACTION.COMMIT, true, true);
			solr.request(up);
		}
		System.out.println("upload ok! \r\n");
	}

	/**
	 * Maps a file name to a MIME type based on its extension (case-insensitive).
	 *
	 * @param filename file name or path; must not be {@code null}
	 * @return the MIME type for xlsx, pdf, doc, txt, xls, docx, ppt and pptx files;
	 *         {@code "othertype"} for any other or missing extension
	 */
	public static String getFileContentType(String filename) {
		int dot = filename.lastIndexOf('.');
		if (dot < 0) {
			// No extension at all: do not mistake the whole name for an extension.
			return "othertype";
		}
		// Locale.ROOT keeps lower-casing independent of the JVM default locale.
		String extension = filename.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
		switch (extension) {
			case "xlsx":
				return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
			case "pdf":
				return "application/pdf";
			case "doc":
				return "application/msword";
			case "txt":
				return "text/plain";
			case "xls":
				return "application/vnd.ms-excel";
			case "docx":
				return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
			case "ppt":
				return "application/vnd.ms-powerpoint";
			case "pptx":
				return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
			default:
				return "othertype";
		}
	}

}

2. 在solr的core目录下的solrconfig.xml增加如下内容：

<requestHandler name="/update/extract" startup="lazy" class="solr.extraction.ExtractingRequestHandler">
  <lst name="defaults">
    <str name="lowernames">true</str>
    <str name="uprefix">ignored_</str>
    <str name="fmap.content">text</str>
  </lst>
</requestHandler>

其中 <str name="uprefix">ignored_</str> 的作用是：解析文件时，凡是 schema 中没有定义的字段，字段名前都会加上 ignored_ 前缀，从而把这些不需要映射的字段忽略掉

<str name="fmap.content">text</str> 是把从文件中提取出的正文字段 content 映射为 solr 的 text 字段

3. 修改 managed-schema 文件，增加如下内容：

<dynamicField name="ignored_*" type="ignored" multiValued="true"/>

这个是生成一个动态字段，类型为ignored，承接忽略的那些字段

4. 检查 solrconfig.xml

检查以上路径是否匹配，是相对于建立的mycore路径

5. 在mycore目录下建立lib目录（如果没有）

复制 solr-7.5.0/contrib/extraction/lib下的所有文件到mycore/lib目录

复制 solr-7.5.0/dist/solr-cell-7.5.0.jar 到mycore/lib目录

6. 重新启动 solr，如正常启动，再在 eclipse 中运行前面建立的 java 程序 updoctest