Nutch 1.3 和 Solr 3.x 集成时出现 Invalid UTF-8 character 问题（[was class java.io.CharConversionException] Invalid UTF-8 character 0xffff）- CSDN博客

本文链接：https://blog.csdn.net/laigood/article/details/6689611

今天更新了一下 Nutch，发现已经出了 1.3 版。下载时发现安装包只有 70 多 MB，而之前的版本有 100 多 MB，肯定是删掉了一些东西；下载完后才发现，原来 Nutch 自带的搜索功能都被删除了，现在只能用 Solr 来做索引和搜索。于是配了个 Solr 3.2 运行试试，结果在建立索引时出现了 [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff 这个错误。看样子是抓取的内容中出现了 0xffff 这个非法的 UTF-8 字符。上网搜索后发现这是 Nutch 1.3 的一个 bug，详情见：https://issues.apache.org/jira/browse/NUTCH-1016

只需要把org.apache.nutch.indexer.solr.SolrWriter这个类改为：

/**
 * {@link NutchIndexWriter} that converts {@link NutchDocument}s into
 * {@link SolrInputDocument}s and sends them to a Solr server in batches.
 *
 * <p>
 * Documents are buffered in memory and sent once the buffer holds
 * {@code commitSize} documents; {@link #close()} sends whatever is left.
 * The value of the {@code content} field is sanitized with
 * {@link #stripNonCharCodepoints(String)} before it is sent, because Solr
 * rejects Unicode noncharacters such as U+FFFF with
 * "Invalid UTF-8 character 0xffff".
 */
public class SolrWriter implements NutchIndexWriter {
	public static Log LOG = LogFactory.getLog(SolrWriter.class);

	/** Name of the document field whose text is sanitized before indexing. */
	private static final String CONTENT_FIELD = "content";

	private SolrServer solr;
	private SolrMappingReader solrMapping;

	/** Documents waiting to be sent to Solr in the next batch. */
	private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>();

	/** Number of buffered documents that triggers a batch being sent. */
	private int commitSize;

	/**
	 * Creates the Solr client and reads the batch size and field mapping from
	 * the job configuration.
	 *
	 * @param job  job configuration providing the Solr URL and commit size
	 * @param name not used by this implementation
	 * @throws IOException declared by the interface
	 */
	public void open(JobConf job, String name) throws IOException {
		solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
		commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
		solrMapping = SolrMappingReader.getInstance(job);
	}

	/**
	 * Converts {@code doc} into a Solr document, buffers it, and sends the
	 * buffer to Solr once it holds at least {@code commitSize} documents.
	 *
	 * @param doc the document to index
	 * @throws IOException if sending a batch to Solr fails
	 */
	public void write(NutchDocument doc) throws IOException {
		final SolrInputDocument inputDoc = new SolrInputDocument();
		for (final Entry<String, NutchField> e : doc) {
			final String key = e.getKey();
			for (final Object val : e.getValue().getValues()) {
				final Object normalized = normalizeValue(key, val);

				inputDoc.addField(solrMapping.mapKey(key), normalized, e
						.getValue().getWeight());

				// Compare by content, not by reference, and give the copy
				// field the normalized value too: otherwise a copy of
				// "content" would still carry the rejected characters.
				final String copyKey = solrMapping.mapCopyKey(key);
				if (copyKey != null && !copyKey.equals(key)) {
					inputDoc.addField(copyKey, normalized);
				}
			}
		}
		inputDoc.setDocumentBoost(doc.getWeight());
		inputDocs.add(inputDoc);
		if (inputDocs.size() >= commitSize) {
			flush();
		}
	}

	/**
	 * Sends any documents still buffered to Solr.
	 *
	 * <p>
	 * No commit is issued here; the commit call was disabled in the original
	 * code. NOTE(review): presumably the indexing job commits once at the
	 * end - confirm against the caller.
	 *
	 * @throws IOException if sending the remaining documents fails
	 */
	public void close() throws IOException {
		flush();
	}

	/**
	 * Wraps a {@link SolrServerException} in an {@link IOException} that
	 * keeps the original exception as its cause.
	 *
	 * @param e the Solr failure to wrap
	 * @return an {@code IOException} whose cause is {@code e}
	 */
	public static IOException makeIOException(SolrServerException e) {
		// Carry the cause's description so the failure is readable even when
		// only the outer exception message is logged.
		final IOException ioe = new IOException(String.valueOf(e));
		ioe.initCause(e);
		return ioe;
	}

	/**
	 * Removes code points that must not be sent to Solr: all Unicode
	 * noncharacters (U+FDD0..U+FDEF and the last two code points of every
	 * plane, U+nFFFE and U+nFFFF) and the control characters below U+0020
	 * other than tab, line feed and carriage return.
	 *
	 * <p>
	 * The input is processed by code point rather than by {@code char} so
	 * that supplementary-plane noncharacters (for example U+1FFFF) are
	 * removed as well.
	 *
	 * @param input the text to sanitize; may be {@code null}
	 * @return the sanitized text, or {@code null} if {@code input} is
	 *         {@code null}
	 */
	public static String stripNonCharCodepoints(String input) {
		if (input == null) {
			return null;
		}

		final int length = input.length();
		final StringBuilder retval = new StringBuilder(length);

		for (int i = 0; i < length;) {
			final int codePoint = input.codePointAt(i);
			i += Character.charCount(codePoint);

			if (isIndexableCodePoint(codePoint)) {
				retval.appendCodePoint(codePoint);
			}
		}

		return retval.toString();
	}

	/** Sends the buffered documents to Solr and empties the buffer. */
	private void flush() throws IOException {
		if (inputDocs.isEmpty()) {
			return;
		}
		try {
			LOG.info("Adding " + inputDocs.size() + " documents");
			solr.add(inputDocs);
		} catch (final SolrServerException e) {
			throw makeIOException(e);
		}
		// Only reached when the add succeeded.
		inputDocs.clear();
	}

	/** Returns the value to index: dates formatted, "content" text sanitized. */
	private static Object normalizeValue(String key, Object val) {
		if (val instanceof Date) {
			// normalise the string representation for a Date
			return DateUtil.getThreadLocalDateFormat().format(val);
		}
		if (CONTENT_FIELD.equals(key) && val instanceof String) {
			return stripNonCharCodepoints((String) val);
		}
		return val;
	}

	/** Tells whether a code point survives {@link #stripNonCharCodepoints}. */
	private static boolean isIndexableCodePoint(int codePoint) {
		// U+nFFFE and U+nFFFF in every plane: low 16 bits are FFFE or FFFF.
		if ((codePoint & 0xfffe) == 0xfffe) {
			return false;
		}
		// U+FDD0..U+FDEF, both ends inclusive.
		if (codePoint >= 0xfdd0 && codePoint <= 0xfdef) {
			return false;
		}
		// Control characters below space, except tab, LF and CR.
		if (codePoint <= 0x1f) {
			return codePoint == 0x9 || codePoint == 0xa || codePoint == 0xd;
		}
		return true;
	}
}