使用 Nutch 爬取网页,并通过 Solr 建立索引。Solr 的 schema.xml 配置如下:
<!-- Field definitions for the index built from the crawled pages. -->
<fields>
  <!-- Indexed and stored. -->
  <field name="url"     type="string" indexed="true"  stored="true"/>
  <field name="content" type="text"   indexed="true"  stored="true"/>

  <!-- Stored only (indexed="false"). -->
  <field name="segment" type="string" indexed="false" stored="true"/>
  <field name="digest"  type="string" indexed="false" stored="true"/>

  <!-- Indexed only (stored="false"). -->
  <field name="host"    type="string" indexed="true"  stored="false"/>
  <field name="site"    type="string" indexed="true"  stored="false"/>
  <field name="anchor"  type="string" indexed="true"  stored="false" multiValued="true"/>

  <field name="title"   type="text"   indexed="true"  stored="true"/>
  <field name="tstamp"  type="slong"  indexed="false" stored="true"/>

  <!-- Multi-valued destination of the copyField rules below; indexed, not stored. -->
  <field name="text"    type="text"   indexed="true"  stored="false" multiValued="true"/>
</fields>

<!-- The url field is declared as the unique key. -->
<uniqueKey>url</uniqueKey>

<!-- The default search field is "text". -->
<defaultSearchField>text</defaultSearchField>

<!-- The query parser's default operator is AND. -->
<solrQueryParser defaultOperator="AND"/>

<!-- Copy anchor, title and content into the "text" field. -->
<copyField source="anchor"  dest="text"/>
<copyField source="title"   dest="text"/>
<copyField source="content" dest="text"/>
分词采用 IKAnalyzer。
<!-- Field definitions for the index built from the crawled pages. -->
<fields>
  <!-- Indexed and stored. -->
  <field name="url"     type="string" indexed="true"  stored="true"/>
  <field name="content" type="text"   indexed="true"  stored="true"/>

  <!-- Stored only (indexed="false"). -->
  <field name="segment" type="string" indexed="false" stored="true"/>
  <field name="digest"  type="string" indexed="false" stored="true"/>

  <!-- Indexed only (stored="false"). -->
  <field name="host"    type="string" indexed="true"  stored="false"/>
  <field name="site"    type="string" indexed="true"  stored="false"/>
  <field name="anchor"  type="string" indexed="true"  stored="false" multiValued="true"/>

  <field name="title"   type="text"   indexed="true"  stored="true"/>
  <field name="tstamp"  type="slong"  indexed="false" stored="true"/>

  <!-- Multi-valued destination of the copyField rules below; indexed, not stored. -->
  <field name="text"    type="text"   indexed="true"  stored="false" multiValued="true"/>
</fields>

<!-- The url field is declared as the unique key. -->
<uniqueKey>url</uniqueKey>

<!-- The default search field is "text". -->
<defaultSearchField>text</defaultSearchField>

<!-- The query parser's default operator is AND. -->
<solrQueryParser defaultOperator="AND"/>

<!-- Copy anchor, title and content into the "text" field. -->
<copyField source="anchor"  dest="text"/>
<copyField source="title"   dest="text"/>
<copyField source="content" dest="text"/>
分词采用 IKAnalyzer。