今天更新了一下 Nutch,发现已经出了 1.3 版。下载时发现安装包只有 70 多 MB,而之前的版本有 100 多 MB,肯定是删掉了什么东西;下载完后才发现,Nutch 原来自带的搜索功能被移除了,现在只能用 Solr 来做索引和搜索。于是配了个 Solr 3.2 运行试试,结果在建立索引时出现了 “[was class java.io.CharConversionException] Invalid UTF-8 character 0xffff” 这个错误,看样子是内容中出现了 0xffff 这个非法的 UTF-8 字符。上网搜索后发现这是 Nutch 1.3 的一个 bug,详情见:https://issues.apache.org/jira/browse/NUTCH-1016
只需要把org.apache.nutch.indexer.solr.SolrWriter这个类改为:
/**
 * {@link NutchIndexWriter} that converts Nutch documents into Solr input
 * documents, buffers them, and sends them to a Solr server in batches.
 */
public class SolrWriter implements NutchIndexWriter {

  public static Log LOG = LogFactory.getLog(SolrWriter.class);

  private SolrServer solr;
  private SolrMappingReader solrMapping;

  /** Documents converted by {@link #write} that have not been sent to Solr yet. */
  private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>();

  /** Number of buffered documents at which {@link #write} sends the batch. */
  private int commitSize;

  /**
   * Configures this writer from the job: creates the Solr client for the
   * configured server URL, reads the batch size (1000 when not set) and
   * obtains the field mapping.
   *
   * @param job  job configuration to read the Solr settings from
   * @param name not used by this implementation
   * @throws IOException declared by the interface; propagated from the
   *                     helpers called here
   */
  public void open(JobConf job, String name) throws IOException {
    solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
    commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
    solrMapping = SolrMappingReader.getInstance(job);
  }

  /**
   * Converts {@code doc} to a Solr input document and buffers it; once the
   * buffer holds at least {@code commitSize} documents they are sent to Solr.
   *
   * @param doc the Nutch document to index
   * @throws IOException if sending the buffered documents to Solr fails
   */
  public void write(NutchDocument doc) throws IOException {
    final SolrInputDocument inputDoc = new SolrInputDocument();
    for (final Entry<String, NutchField> e : doc) {
      final String key = e.getKey();
      for (final Object val : e.getValue().getValues()) {
        Object val2 = val;
        // normalise the string representation for a Date
        if (val instanceof Date) {
          val2 = DateUtil.getThreadLocalDateFormat().format(val);
        }
        // Page content may contain code points that make the indexing request
        // fail ("Invalid UTF-8 character 0xffff"), so strip them. The
        // instanceof guard avoids a ClassCastException on non-String values.
        if (key.equals("content") && val instanceof String) {
          val2 = stripNonCharCodepoints((String) val);
        }
        inputDoc.addField(solrMapping.mapKey(key), val2, e.getValue().getWeight());

        // Compare by value, not by reference: a copy target that merely
        // equals the source key must not be treated as a distinct field.
        final String sCopy = solrMapping.mapCopyKey(key);
        if (!key.equals(sCopy)) {
          // Use the sanitised value here as well, otherwise a copied
          // "content" field would still carry the stripped characters.
          inputDoc.addField(sCopy, val2);
        }
      }
    }
    inputDoc.setDocumentBoost(doc.getWeight());
    inputDocs.add(inputDoc);
    if (inputDocs.size() >= commitSize) {
      flushPending();
    }
  }

  /**
   * Sends any documents still buffered to Solr.
   *
   * @throws IOException if sending the documents to Solr fails
   */
  public void close() throws IOException {
    flushPending();
    // No commit is issued here; the call below is left disabled as in the
    // original. NOTE(review): presumably the caller commits once indexing
    // has finished -- confirm.
    // solr.commit();
  }

  /**
   * Wraps a {@link SolrServerException} in an {@link IOException}, keeping the
   * original exception as the cause.
   *
   * @param e the Solr exception to wrap
   * @return a new IOException whose cause is {@code e}
   */
  public static IOException makeIOException(SolrServerException e) {
    final IOException ioe = new IOException();
    ioe.initCause(e);
    return ioe;
  }

  /**
   * Returns a copy of {@code input} without Unicode noncharacters and without
   * C0 control characters other than tab, line feed and carriage return.
   *
   * <p>The string is scanned by code point rather than by {@code char}, so
   * noncharacters outside the Basic Multilingual Plane (U+1FFFE, U+1FFFF, ...
   * U+10FFFE, U+10FFFF), which are stored as surrogate pairs, are removed too.
   *
   * @param input the text to clean; must not be {@code null}
   * @return the cleaned text
   */
  public static String stripNonCharCodepoints(String input) {
    final int length = input.length();
    final StringBuilder retval = new StringBuilder(length);
    int i = 0;
    while (i < length) {
      final int codePoint = input.codePointAt(i);
      i += Character.charCount(codePoint);
      if (!isStrippedCodePoint(codePoint)) {
        retval.appendCodePoint(codePoint);
      }
    }
    return retval.toString();
  }

  /** Tells whether {@link #stripNonCharCodepoints} removes the given code point. */
  private static boolean isStrippedCodePoint(int codePoint) {
    // U+nFFFE and U+nFFFF: the last two code points of every plane.
    if ((codePoint & 0xfffe) == 0xfffe) {
      return true;
    }
    // U+FDD0 - U+FDEF, both ends inclusive.
    if (codePoint >= 0xfdd0 && codePoint <= 0xfdef) {
      return true;
    }
    // Non-printable control characters except tabulator, new line and
    // carriage return.
    return codePoint <= 0x1f && codePoint != 0x9 && codePoint != 0xa && codePoint != 0xd;
  }

  /** Sends the buffered documents to Solr, if there are any, and empties the buffer. */
  private void flushPending() throws IOException {
    if (inputDocs.isEmpty()) {
      return;
    }
    try {
      LOG.info("Adding " + Integer.toString(inputDocs.size()) + " documents");
      solr.add(inputDocs);
    } catch (final SolrServerException e) {
      throw makeIOException(e);
    }
    // Only reached when the add succeeded; on failure the buffer is kept.
    inputDocs.clear();
  }
}