最近项目需要做pdf,word等的检索,一路走来有许多坑,也有许多收获,特此做一下记录.
这篇主要是对数据库的操作:
1.solr分词器的安装,http://download.csdn.net/download/u011277123/9994225,下载后参照此篇文章 http://www.cnblogs.com/zuge/p/6001508.html
2.solr 中最主要的配置文件是 solr 主目录下的 schema.xml、solrconfig.xml、mysql-data-config.xml 和 dataimport.properties。
solrconfig.xml 中主要配置数据导入处理器(DataImportHandler)及其引用的数据库配置文件:
<!-- Registers DataImportHandler at /dataimport; its default "config"
     parameter names the import definition file mysql-data-config.xml. -->
<requestHandler name="/dataimport"
                class="org.apache.solr.handler.dataimport.DataImportHandler">
  <lst name="defaults">
    <str name="config">mysql-data-config.xml</str>
  </lst>
</requestHandler>
mysql-data-config.xml具体的数据库配置
<!-- Import definition: maps rows of the MySQL table act_id_user to Solr fields. -->
<dataConfig>
  <!-- NOTE(review): plaintext root credentials; a dedicated read-only
       account is presumably safer. Confirm before deploying. -->
  <dataSource driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://10.128.90.161:3306/test"
              user="root"
              password="root123" />
  <document>
    <!--
      query            : full import, reads every row.
      deltaQuery       : selects the ID_ of rows whose UpdateTime is later than
                         this entity's recorded last_index_time.
      deltaImportQuery : loads the full row for each ID_ found by deltaQuery.
    -->
    <entity name="medi_doctors" pk="ID_"
            query="select * from act_id_user"
            deltaImportQuery="SELECT * FROM act_id_user where ID_ ='${dih.delta.ID_}'"
            deltaQuery="SELECT ID_ FROM act_id_user where UpdateTime &gt; '${dataimporter.medi_doctors.last_index_time}'">
      <field column="ID_" name="id" />
      <field column="REV_" name="rev" />
      <field column="FIRST_" name="first" />
      <field column="LAST_" name="last" />
      <field column="EMAIL_" name="email" />
      <!-- NOTE(review): PWD_ looks like a password column; importing it makes
           it available to search clients. Confirm it is really needed. -->
      <field column="PWD_" name="pwd" />
      <field column="PICTURE_ID_" name="pic" />
      <field column="UpdateTime" name="updateTime" />
    </entity>
  </document>
</dataConfig>
schema.xml表对应字段的配置
<!-- Solr fields receiving the columns mapped in mysql-data-config.xml. -->
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<field name="rev" type="string" indexed="true" stored="true" multiValued="false" />
<field name="first" type="string" indexed="true" stored="true" multiValued="false" />
<field name="last" type="string" indexed="true" stored="true" multiValued="false" />
<field name="email" type="string" indexed="true" stored="true" multiValued="false" />
<!-- NOTE(review): "pwd" is both indexed and stored, so it is searchable and
     returned in results. Confirm this is intended. -->
<field name="pwd" type="string" indexed="true" stored="true" multiValued="false" />
<field name="pic" type="string" indexed="true" stored="true" multiValued="false" />
<field name="updateTime" type="date" indexed="true" stored="true" />

<!-- "id" is the unique document key. -->
<uniqueKey>id</uniqueKey>
dataimport.properties记录最后更改的时间,与mysql-data-config.xml的查询语句对应
# Timestamps of the last import; ':' is written escaped as '\:'.
# Entity-specific value, read by the deltaQuery as
# ${dataimporter.medi_doctors.last_index_time}.
medi_doctors.last_index_time=2017-09-01 08\:59\:17
# Value without an entity prefix.
last_index_time=2017-09-01 08\:59\:17
==================================
这篇主要是对PDF文件的操作:
1.solrconfig.xml中添加关于PDF的配置文件信息,
<!-- Load every jar under contrib/extraction/lib and the solr-cell jar from dist/. -->
<lib dir="${solr.install.dir:../../../..}/contrib/extraction/lib"
     regex=".*\.jar" />
<lib dir="${solr.install.dir:../../../..}/dist/"
     regex="solr-cell-\d.*\.jar" />

<!-- DataImportHandler endpoint whose default import definition is data-config.xml. -->
<requestHandler name="/dataimport"
                class="org.apache.solr.handler.dataimport.DataImportHandler">
  <lst name="defaults">
    <str name="config">data-config.xml</str>
  </lst>
</requestHandler>
2.创建data-config.xml:
<!-- Import definition: lists PDF files under baseDir and extracts their
     text and metadata with Tika. -->
<dataConfig>
  <script><![CDATA[
    // Counter used to hand out document ids, starting at 1.
    // NOTE(review): ids depend on processing order, so a re-import may assign
    // different ids to the same file - confirm this is acceptable.
    var id = 1;

    // Row transformer: stores the next counter value as the row's 'id'.
    function GenerateId(row) {
      row.put('id', (id ++).toFixed());
      return row;
    }

    // Row transformer: truncates the 'file' value at its first '.'.
    // Rows with no 'file' value, or with no '.' in it, are returned unchanged
    // (previously a value without '.' was replaced by an empty string).
    // Not referenced by any transformer attribute in this config.
    function WipOffHtml(row) {
      var file = row.get('file');
      if (file != null) {
        // Coerce to a JavaScript string so indexOf/substr behave as expected.
        var name = String(file);
        var dot = name.indexOf('.');
        if (dot >= 0) {
          row.put('file', name.substr(0, dot));
        }
      }
      return row;
    }
  ]]></script>

  <dataSource type="BinFileDataSource" />

  <document>
    <!-- Outer entity: enumerates files. baseDir is a placeholder meaning
         "directory containing the files to index"; replace it with a real path.
         fileName is a regular expression: the dot is escaped and the pattern
         is anchored so only names ending in ".pdf" match. -->
    <entity name="file_a" processor="FileListEntityProcessor" dataSource="null" rootEntity="false"
            baseDir="索引文件所在目录" fileName=".*\.pdf$">
      <field column="fileAbsolutePath" name="filePath" />
      <field column="fileSize" name="size" />
      <field column="fileLastModified" name="lastModified" />

      <!-- Inner entity: parses each listed file and assigns an id via GenerateId. -->
      <entity name="documentImport" processor="TikaEntityProcessor" url="${file_a.fileAbsolutePath}"
              format="text" transformer="HTMLStripTransformer,RegexTransformer,script:GenerateId">
        <field column="Author" name="author" meta="true"/>
        <field column="title" name="title" meta="true"/>
        <!-- Removes all whitespace from the text; \s already covers
             tab, carriage return and newline. -->
        <field column="text" name="text" stripHTML="true" regex="\s" replaceWith=""/>
      </entity>
    </entity>
  </document>
</dataConfig>
3.编辑 managed-schema,添加如下配置:
<!-- Fields filled by data-config.xml: extracted text, Tika metadata and file attributes. -->
<field name="text" type="text_ik" indexed="true" stored="true" omitNorms="true" multiValued="false"/>
<field name="author" type="string" indexed="true" stored="true"/>
<field name="title" type="string" indexed="true" stored="true"/>
<field name="filePath" type="string" stored="true" multiValued="false" />
<field name="size" type="long" indexed="true" stored="true" />
<field name="lastModified" type="date" indexed="true" stored="true" />

<!-- Text type analysed by IKAnalyzer, with separate index-time and
     query-time settings.
     NOTE(review): some IK Analyzer builds use "useSmart" instead of
     "isMaxWordLength"; confirm against the installed analyzer version. -->
<fieldType name="text_ik" class="solr.TextField">
  <analyzer type="index" isMaxWordLength="false" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
  <analyzer type="query" isMaxWordLength="true" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
</fieldType>
其中 fileName 是正则表达式,还可以配置为 fileName=".*\.(doc|pdf|xls|ppt|docx)$",以同时索引这些类型的文件。
同事杨小清的总结:
http://note.youdao.com/share/?id=cd683d660157b1dcfb8af75baebd5ece&type=note#/
我之前本地虚拟机是4.X,现在是5.X,所以分词器要选对应的版本,下面是我安装的分词器:
http://blog.csdn.net/jiangchao858/article/details/53153272