solr_对富文本(pdf等)建立索引

<p>solrconfig.xml中定义的</p>
<p>&lt;requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"&gt;<br> &lt;lst name="defaults"&gt;<br> &lt;str name="fmap.Last-Modified"&gt;last_modified&lt;/str&gt;<br> &lt;str name="uprefix"&gt;ignored_&lt;/str&gt;<br> &lt;/lst&gt;<br> &lt;!--Optional. Specify a path to a tika configuration file. See the Tika docs for details.--&gt;<br> &lt;str name="tika.config"&gt;/my/path/to/tika.config&lt;/str&gt;<br> &lt;!-- Optional. Specify one or more date formats to parse. See DateUtil.DEFAULT_DATE_FORMATS for default date formats --&gt;<br> &lt;lst name="date.formats"&gt;<br> &lt;str&gt;yyyy-MM-dd&lt;/str&gt;<br> &lt;/lst&gt;<br> &lt;/requestHandler&gt;</p>
<p></p>
<p></p>
<p>需要额外jar包支持:</p>
<p>apache-solr-cell-1.4.0.jar</p>
<p></p>
<p></p>
<p>请求的URL样例:</p>
<p><a href="http://16.158.149.182:8080/solr/update/extract?literal.id=22&literal.name=alern2&literal.fullname=alern2.txt&literal.type=file&literal.size=16&literal.datecreated=1281429683820&literal.datelastmodify=1281429683820&literal.userid=1&literal.location=My">http://16.158.149.182:8080/solr/update/extract?literal.id=22&literal.name=alern2&literal.fullname=alern2.txt&literal.type=file&literal.size=16&literal.datecreated=1281429683820&literal.datelastmodify=1281429683820&literal.userid=1&literal.location=My</a> Files&commit=true&stream.url=http://localhost:8080/kumulus_data/kumulus_local/uid1/1281429683820?AWSAccessKeyId=AKIAJB36UFEVFFJ3P4TQ&Expires=1281444093&Signature=zn2D1frVIv1aDV7drby5Iu7iLng%3D&oh=s3.amazonaws.com</p>
<p></p>
<p>/solr/update/extract 请求url</p>
<p>literal.id=22 相当于XML中定义的&lt;field name="id"&gt;22&lt;/field&gt;</p>
<p>commit=true 直接提交事务</p>
<p>stream.url 指向富文本文件的 URL。上面这个 URL 是有问题的:stream.url 的值里的 & 符号没有经过 URL 编码,会被当作外层请求的参数分隔符,提交会失败。</p>
<p>与之对应的还有 stream.file(指向服务器本地的文件路径)。</p>
<p></p>
<p>详细看WIKI</p>
<p><a href="http://wiki.apache.org/solr/ExtractingRequestHandler">http://wiki.apache.org/solr/ExtractingRequestHandler</a></p>
<p></p>
<p>简单样例:</p>
<p></p>
<p>代码中的 url 变量是可用的;其余的 URL(url2、url3)都没有经过 URL 编码(URLEncoder.encode),因此都有问题。</p>
<p></p>
<p>这里的例子都使用 HttpURLConnection 建立连接,抽象层次比较低,可以考虑使用 Apache 的 HttpClient,帮助解决多线程安全等问题,更安全可靠。</p>
<p><textarea cols="101" rows="15" name="code" class="java:collapse">public class simpleClassFotTest {
/**
 * Sends one request to the Solr extract handler ({@code /solr/update/extract})
 * and prints the request URL followed by the response body.
 *
 * <p>Only {@code url} is actually requested. {@code url2}, {@code url3} and
 * {@code param} are kept as illustrations of how the {@code stream.url} value
 * must be percent-encoded before it is appended to the outer query string.
 *
 * @param args unused
 * @throws IOException declared for compatibility; failures are caught and printed
 * @throws URISyntaxException declared for compatibility; no longer thrown
 */
public static void main(String[] args) throws IOException, URISyntaxException {
    HttpURLConnection urlc = null;
    try {
        // Working request: the stream.url value is percent-encoded, so its own
        // '?', '&' and '=' are not parsed as part of the outer query string.
        String url = "http://16.158.149.182:8080/solr/update/extract?literal.id=21" +
                "&literal.name=aldern&literal.fullname=aldern.txt&literal.type=file&literal.size=32" +
                "&literal.datecreated=1281433365403&literal.datelastmodify=1281433365403&literal.userid=1" +
                "&literal.location=MyFiles&commit=true" +
                "&stream.url=http%3A%2F%2Flocalhost%3A8080%2Fkumulus_data%2Fkumulus_local%2Fuid1%2F1281433365403%3FAWSAccessKeyId%3DAKIAJB36UFEVFFJ3P4TQ%26Expires%3D1281447779%26Signature%3DL6BlDV%252BviBuL78vF2V5QPq3HTDU%253D%26oh%3Ds3.amazonaws.com";

        // Counterexample (not requested): stream.url is appended without encoding.
        String url2 = "http://16.158.149.182:8080/solr/update/extract?literal.id=22&literal.name=alern2&literal.fullname=alern2.txt&literal.type=file&literal.size=16&literal.datecreated=1281429683820&literal.datelastmodify=1281429683820&literal.userid=1&literal.location=My Files&commit=true&stream.url=http://localhost:8080/kumulus_data/kumulus_local/uid1/1281429683820?AWSAccessKeyId=AKIAJB36UFEVFFJ3P4TQ&Expires=1281444093&Signature=zn2D1frVIv1aDV7drby5Iu7iLng%3D&oh=s3.amazonaws.com";

        // Illustration (not requested): request prefix plus the stream.url value,
        // which has to go through URLEncoder before the two are concatenated.
        String url3 = "http://16.158.149.182:8080/solr/update/extract?literal.id=22&literal.name=alern2&literal.fullname=alern2.txt&literal.type=file&literal.size=16&literal.datecreated=1281429683820&literal.datelastmodify=1281429683820&literal.userid=1&literal.location=My Files&commit=true&stream.url=";
        String param = "http://localhost:8080/kumulus_data/kumulus_local/uid1/1281429683820?AWSAccessKeyId=AKIAJB36UFEVFFJ3P4TQ&Expires=1281444093&Signature=zn2D1frVIv1aDV7drby5Iu7iLng%3D&oh=s3.amazonaws.com";
        // UTF-8 is the encoding recommended for URLEncoder; for this ASCII-only
        // value the result is the same as with "ASCII".
        param = URLEncoder.encode(param, "UTF-8");

        System.out.println(url);
        urlc = (HttpURLConnection) new URL(url).openConnection();
        urlc.setDoOutput(true);
        urlc.setDoInput(true);

        InputStream in = urlc.getInputStream();
        String result = convertStreamToString(in);
        System.out.println(result);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection whether or not the request succeeded.
        if (urlc != null) {
            urlc.disconnect();
        }
    }
}
/**
 * Reads the whole stream as UTF-8 text and returns it with every line
 * terminated by a single newline character.
 *
 * <p>An {@link IOException} while reading is printed and whatever was read so
 * far is returned. The stream is always closed before returning.
 *
 * @param is stream to drain; closed by this method
 * @return the decoded text, or an empty string when the stream has no lines
 */
public static String convertStreamToString(InputStream is) {
    StringBuilder sb = new StringBuilder();
    try {
        // Decode explicitly as UTF-8 instead of the platform default charset.
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            // Real line break; the original appended the two characters "/n".
            sb.append(line).append('\n');
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return sb.toString();
}
}
</textarea></p>
<p></p>
<p></p>
<p>使用传统XML提交索引例子:</p>
<p>/update</p>
<p><textarea cols="103" rows="15" name="code" class="java">public class simpleClassForTest2 {
public static void main(String[] args) throws MalformedURLException, IOException{
String url = "http://localhost:8080/solr/update";
Document dom = DocumentHelper.createDocument();
Element add =dom.addElement("add");
Element doc = add.addElement("doc");
Element field_id = doc.addElement("field");
field_id.addAttribute("name", "id");
field_id.setText("50");

//Element field_1 = doc.addElement("field");
//field_1.addAttribute("name", "name");
//field_1.setText("ddt");
Element field_3 = doc.addElement("field");
field_3.addAttribute("name", "size");
field_3.setText("11");
Element field_2 = doc.addElement("field");
field_2.addAttribute("name", "text");
field_2.setText("ddt");
String xml = "";
xml = dom.asXML();
xml = xml.replace("<?xml version=/"1.0/" encoding=/"UTF-8/"?>", "");
StringWriter sw = new StringWriter();

HttpURLConnection urlc = (HttpURLConnection) new URL(url).openConnection();

urlc.setRequestMethod("GET");
urlc.setDoOutput(true);
urlc.setDoInput(true);
urlc.setUseCaches(false);
urlc.setAllowUserInteraction(false);
urlc.setRequestProperty("Content-type", "text/html; charset=UTF-8");
ByteArrayInputStream stream = new ByteArrayInputStream(xml.getBytes());
postFile(stream, sw);

System.out.println("reuslt :" + sw.toString());
postData(new StringReader("<commit/>"), sw);
System.out.println("commit : "+sw.toString());

}
/**
 * Wraps the byte stream in a UTF-8 reader, hands it to {@code postData}, and
 * closes the reader afterwards.
 *
 * @param stream request body bytes
 * @param output receives whatever {@code postData} writes as the response
 * @throws FileNotFoundException declared by the signature; not thrown by this body
 * @throws UnsupportedEncodingException if the "UTF-8" charset name is rejected
 */
public static void postFile(InputStream stream, Writer output)
        throws FileNotFoundException, UnsupportedEncodingException {
    final Reader utf8Body = new InputStreamReader(stream, "UTF-8");
    try {
        postData(utf8Body, output);
    } finally {
        // A failure to close is reported but never propagated.
        try {
            utf8Body.close();
        } catch (IOException closeFailure) {
            closeFailure.printStackTrace();
        }
    }
}
/**
 * Reads the whole stream as UTF-8 text and returns it with every line
 * terminated by a single newline character.
 *
 * <p>An {@link IOException} while reading is printed and whatever was read so
 * far is returned. The stream is always closed before returning.
 *
 * @param is stream to drain; closed by this method
 * @return the decoded text, or an empty string when the stream has no lines
 */
public static String convertStreamToString(InputStream is) {
    StringBuilder sb = new StringBuilder();
    try {
        // Decode explicitly as UTF-8 instead of the platform default charset.
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            // Real line break; the original appended the two characters "/n".
            sb.append(line).append('\n');
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    return sb.toString();
}
/**
 * POSTs the characters from {@code data} as UTF-8 {@code text/xml} to
 * {@code http://localhost:8080/solr/update} and copies the response body
 * into {@code output}.
 *
 * <p>Every {@link IOException} is printed rather than propagated. The
 * connection is disconnected before returning.
 *
 * @param data   request body; read to the end but not closed here
 * @param output receives the response text; flushed but not closed here
 */
public static void postData(Reader data, Writer output)
{
    HttpURLConnection urlc = null;
    try
    {
        URL u = new URL("http://localhost:8080/solr/update");
        urlc = (HttpURLConnection)u.openConnection();
        // ProtocolException is an IOException, so the outer catch reports it.
        urlc.setRequestMethod("POST");
        urlc.setDoOutput(true);  // a request body will be written
        urlc.setDoInput(true);   // the response will be read
        urlc.setUseCaches(false);
        urlc.setAllowUserInteraction(false);
        urlc.setRequestProperty("Content-type", "text/xml; charset=UTF-8");

        OutputStream out = urlc.getOutputStream();
        try
        {
            Writer writer = new OutputStreamWriter(out, "UTF-8");
            pipe(data, writer);
            writer.close();
        }
        catch(IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            out.close();
        }

        InputStream in = urlc.getInputStream();
        try
        {
            // Decode the response as UTF-8, matching the charset of the request,
            // instead of relying on the platform default charset.
            Reader reader = new InputStreamReader(in, "UTF-8");
            pipe(reader, output);
            reader.close();
        }
        catch(IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            in.close();
        }
    }
    catch(IOException e)
    {
        e.printStackTrace();
    }
    finally
    {
        if(urlc != null)
            urlc.disconnect();
    }
}
/**
 * Copies every character from {@code reader} to {@code writer} in chunks of
 * up to 1024 characters, then flushes the writer. Neither argument is closed.
 *
 * @throws IOException if reading, writing or flushing fails
 */
private static void pipe(Reader reader, Writer writer) throws IOException {
    final char[] chunk = new char[1024];
    int count = reader.read(chunk);
    while (count >= 0) {
        writer.write(chunk, 0, count);
        count = reader.read(chunk);
    }
    writer.flush();
}
}
</textarea></p>
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值