前面简单介绍了索引的主要组成部分,本篇主要是记录下导入数据构建索引的几种配置方法。
启动自带DIH(DataImportHandler)示例的命令:bin/solr -e dih(示例启动后,通过 /dataimport 请求处理器触发数据导入)
待索引数据主要分为三类:数据库数据、文件数据、网页数据。这三类数据对应的solrconfig.xml的配置是一致的,修改solrconfig.xml,增加所需lib包及以下配置:
<!-- Put the DataImportHandler jars found under dist/ on the core's classpath. -->
<lib dir="${solr.install.dir:../../../..}/dist/"
     regex="solr-dataimporthandler-.*\.jar" />

<!-- Expose the import endpoint at /dataimport; by default it reads its
     import definition from data-config.xml. -->
<requestHandler name="/dataimport" class="solr.DataImportHandler">
  <lst name="defaults">
    <str name="config">data-config.xml</str>
  </lst>
</requestHandler>
数据库数据:对应的data-config.xml配置如下:
<dataConfig>
  <!--
    Data source: an HSQLDB database reached over JDBC. The driver class, the
    JDBC URL and the login credentials are all given as attributes here.
    Further attributes are permitted, such as whether to autocommit to Solr,
    the batch size used on the JDBC connection, and a 'readOnly' flag.
    The password attribute may be omitted when the database has no password.
  -->
  <dataSource driver="org.hsqldb.jdbcDriver"
              url="jdbc:hsqldb:./example-DIH/hsqldb/ex"
              user="sa"
              password="secret"/>

  <!--
    The password may instead be stored encrypted, using the output of
        openssl enc -aes-128-cbc -a -salt -in pwd.txt
    for example
        password="U2FsdGVkX18QMjY0yfCqlfBMvAB4d3XkwY96L7gfO2o="
    When the password is encrypted, one extra attribute is required:
        encryptKeyFile="/location/of/encryptionkey"
    That file should be a text file with a single line holding the
    encrypt/decrypt password.
  -->

  <!--
    The 'document' element holds the 'entity' elements. Entities can be
    nested, which lets the relationships of the sample database be mirrored
    here and produces one denormalized Solr record per item (for instance an
    item together with all of its features).
  -->
  <document>

    <!--
      An entity may contain one or more 'field' elements. Each maps a data
      source column name to a Solr field name and may specify per-field
      transformations.
    -->

    <!-- Root entity: one row of the item table per Solr document. -->
    <entity name="item"
            query="select * from item"
            deltaQuery="select id from item where last_modified > '${dataimporter.last_index_time}'">

      <field column="NAME" name="name" />

      <!--
        Nested entity for the one-to-many relationship between an item and
        its features. ${item.ID} resolves to the value of column 'ID' of the
        current row of the entity named 'item'.
      -->
      <entity name="feature"
              query="select DESCRIPTION from FEATURE where ITEM_ID='${item.ID}'"
              deltaQuery="select ITEM_ID from FEATURE where last_modified > '${dataimporter.last_index_time}'"
              parentDeltaQuery="select ID from item where ID=${feature.ITEM_ID}">
        <field name="features" column="DESCRIPTION" />
      </entity>

      <!-- Link table between items and categories; it only feeds the nested 'category' entity. -->
      <entity name="item_category"
              query="select CATEGORY_ID from item_category where ITEM_ID='${item.ID}'"
              deltaQuery="select ITEM_ID, CATEGORY_ID from item_category where last_modified > '${dataimporter.last_index_time}'"
              parentDeltaQuery="select ID from item where ID=${item_category.ITEM_ID}">

        <!--
          Category description for the current link row, stored in Solr field 'cat'.
          NOTE(review): the query selects DESCRIPTION in upper case while the
          mapping below names the column 'description' in lower case; confirm
          that the column lookup is case-insensitive in this setup.
        -->
        <entity name="category"
                query="select DESCRIPTION from category where ID = '${item_category.CATEGORY_ID}'"
                deltaQuery="select ID from category where last_modified > '${dataimporter.last_index_time}'"
                parentDeltaQuery="select ITEM_ID, CATEGORY_ID from item_category where CATEGORY_ID=${category.ID}">
          <field column="description" name="cat" />
        </entity>

      </entity>
    </entity>
  </document>
</dataConfig>
文件数据:对应的data-config.xml配置如下:
<dataConfig>
  <!-- Binary file source; the entity below hands its content to Tika. -->
  <dataSource type="BinFileDataSource" />
  <document>
    <!-- Single sample document (a PDF from exampledocs), read with format="text". -->
    <entity name="tika-test"
            processor="TikaEntityProcessor"
            url="${solr.install.dir}/example/exampledocs/solr-word.pdf"
            format="text">
      <!-- Fields flagged meta="true"; the 'text' field below carries no meta flag. -->
      <field column="Author" name="author" meta="true"/>
      <field column="title"  name="title"  meta="true"/>
      <field column="text"   name="text"/>
    </entity>
  </document>
</dataConfig>
网页数据:对应的data-config.xml配置如下:
<dataConfig>
  <!-- URLDataSource fetches the feed over HTTP; it replaces the deprecated HttpDataSource name. -->
  <dataSource type="URLDataSource" />
  <document>
    <!--
      Imports the Slashdot RSS/RDF feed; 'link' is the primary key.
      forEach sets up a processing loop; here there are two expressions,
      one for the channel element and one for each item element.
      This comment sits above the element because XML does not permit a
      comment inside a start tag.
    -->
    <entity name="slashdot"
            pk="link"
            url="http://rss.slashdot.org/Slashdot/slashdot"
            processor="XPathEntityProcessor"
            forEach="/RDF/channel | /RDF/item"
            transformer="DateFormatTransformer">

      <!-- Channel-level values, flagged commonField="true". -->
      <field column="source"      xpath="/RDF/channel/title"   commonField="true" />
      <field column="source-link" xpath="/RDF/channel/link"    commonField="true" />
      <field column="subject"     xpath="/RDF/channel/subject" commonField="true" />

      <!-- Per-item values. -->
      <field column="title"        xpath="/RDF/item/title" />
      <field column="link"         xpath="/RDF/item/link" />
      <field column="description"  xpath="/RDF/item/description" />
      <field column="creator"      xpath="/RDF/item/creator" />
      <field column="item-subject" xpath="/RDF/item/subject" />

      <!-- HH is the 24-hour field (0-23). The 12-hour pattern 'hh' would
           read hour 12 as midnight when parsing 24-hour feed timestamps. -->
      <field column="date" xpath="/RDF/item/date"
             dateTimeFormat="yyyy-MM-dd'T'HH:mm:ss" />

      <field column="slash-department" xpath="/RDF/item/department" />
      <field column="slash-section"    xpath="/RDF/item/section" />
      <field column="slash-comments"   xpath="/RDF/item/comments" />
    </entity>
  </document>
</dataConfig>
从以上几个配置可以看出,所有的数据导入都是由DataSource(负责读取数据)和EntityProcessor(负责把数据处理成文档)这两类对象配合完成的,因此需要熟悉这两个类及其子类。
参考:https://cwiki.apache.org/confluence/display/solr/Uploading+Structured+Data+Store+Data+with+the+Data+Import+Handler