nutch 2.0 search accumulo solr

1. http://www.covert.io/post/18414889381/accumulo-nutch-and-gora
2. http://blog.packetloop.com/2012/03/packetpig-open-source-big-data-security.html
3. http://blog.csdn.net/lengyue365/article/details/7874003
4. http://nlp.solutions.asia/?p=232
5. http://wiki.apache.org/nutch/NewScoringIndexingExample

环境说明:
accumulo 1.5
nutch 2.0 nutchgora git-branch
hadoop 0.20.2
zookeeper 3.4.3
gora
solr 3.6.1

webpage数据说明:

<gora-orm>

<table name="webpage">
<family name="p" /> <!-- This can also have params like compression, bloom filters -->
<family name="f" />
<family name="s" />
<family name="il" />
<family name="ol" />
<family name="h" />
<family name="mtdt" />
<family name="mk" />
<config key="table.file.compress.blocksize" value="32K"/>
</table>
<class table="webpage" keyClass="java.lang.String" name="org.apache.nutch.storage.WebPage">

<!-- fetch fields -->
<field name="baseUrl" family="f" qualifier="bas"/>
<field name="status" family="f" qualifier="st"/>
<field name="prevFetchTime" family="f" qualifier="pts"/>
<field name="fetchTime" family="f" qualifier="ts"/>
<field name="fetchInterval" family="f" qualifier="fi"/>
<field name="retriesSinceFetch" family="f" qualifier="rsf"/>
<field name="reprUrl" family="f" qualifier="rpr"/>
<field name="content" family="f" qualifier="cnt"/>
<field name="contentType" family="f" qualifier="typ"/>
<field name="protocolStatus" family="f" qualifier="prot"/>
<field name="modifiedTime" family="f" qualifier="mod"/>

<!-- parse fields -->
<field name="title" family="p" qualifier="t"/>
<field name="text" family="p" qualifier="c"/>
<field name="parseStatus" family="p" qualifier="st"/>
<field name="signature" family="p" qualifier="sig"/>
<field name="prevSignature" family="p" qualifier="psig"/>

<!-- score fields -->
<field name="score" family="s" qualifier="s"/>
<field name="headers" family="h"/>
<field name="inlinks" family="il"/>
<field name="outlinks" family="ol"/>
<field name="metadata" family="mtdt"/>
<field name="markers" family="mk"/>
</class>

<table name="host">
<family name="mtdt" />
<family name="il" />
<family name="ol" />
</table>

<class table="host" keyClass="java.lang.String" name="org.apache.nutch.storage.Host">
<field name="metadata" family="mtdt"/>
<field name="inlinks" family="il"/>
<field name="outlinks" family="ol"/>
</class>

</gora-orm>

登录accumulo,查看webpage表结构:

./accumulo shell -u xxx -p xxx
root@inst> table webpage

1.在用户目录下创建名为urls的文件,加入一行:http://www.360buy.com/
执行./nutch inject ~/urls

root@inst webpage> scan -r com.360buy.www:http/
com.360buy.www:http/ f:fi [] \x00'\x8D\x00
com.360buy.www:http/ f:ts [] \x00\x00\x01:':\xA6\xE2
com.360buy.www:http/ mk:_injmrk_ [] y
com.360buy.www:http/ mtdt:_csh_ [] ?\x80\x00\x00
com.360buy.www:http/ s:s [] ?\x80\x00\x00

2. ./nutch generate

root@inst webpage> scan -r com.360buy.www:http/
com.360buy.www:http/ f:fi [] \x00'\x8D\x00
com.360buy.www:http/ f:ts [] \x00\x00\x01:':\xA6\xE2
com.360buy.www:http/ mk:_gnmrk_ [] 1349277947-925721513
com.360buy.www:http/ mk:_injmrk_ [] y
com.360buy.www:http/ mtdt:_csh_ [] ?\x80\x00\x00
com.360buy.www:http/ s:s [] ?\x80\x00\x00

3. ./nutch fetch 1349277947-925721513

root@inst webpage> scan -r com.360buy.www:http/ -f 50
com.360buy.www:http/ f:bas [] http://www.360buy.com/
com.360buy.www:http/ f:cnt [] <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Trans
com.360buy.www:http/ f:fi [] \x00'\x8D\x00
com.360buy.www:http/ f:prot [] \x02\x00\x00
com.360buy.www:http/ f:pts [] \x00\x00\x01:':\xA6\xE2
com.360buy.www:http/ f:st [] \x00\x00\x00\x02
com.360buy.www:http/ f:ts [] \x00\x00\x01:'<\x8C}
com.360buy.www:http/ f:typ [] application/xhtml+xml
com.360buy.www:http/ h:Cache-Control [] max-age=120
com.360buy.www:http/ h:Connection [] close
com.360buy.www:http/ h:Content-Encoding [] gzip
com.360buy.www:http/ h:Content-Location [] http://www.360buy.com/index.htm
com.360buy.www:http/ h:Content-Type [] text/html; charset=gb2312
com.360buy.www:http/ h:Date [] Wed, 03 Oct 2012 15:27:02 GMT
com.360buy.www:http/ h:Last-Modified [] Wed, 03 Oct 2012 15:25:57 GMT
com.360buy.www:http/ h:Server [] JDWS
com.360buy.www:http/ h:Vary [] Accept-Encoding
com.360buy.www:http/ h:X-Cache [] MISS from TJ-HY-CNC-CDN-55.360buy.com
com.360buy.www:http/ h:_ip [] 125.39.96.182
com.360buy.www:http/ mk:_ftcmrk_ [] 1349277947-925721513
com.360buy.www:http/ mk:_gnmrk_ [] 1349277947-925721513
com.360buy.www:http/ mk:_injmrk_ [] y
com.360buy.www:http/ mtdt:_csh_ [] ?\x80\x00\x00
com.360buy.www:http/ s:s [] ?\x80\x00\x00


4. ./nutch parse 1349277947-925721513

root@inst webpage> scan -r com.360buy.www:http/ -f 50
com.360buy.www:http/ f:bas [] http://www.360buy.com/
com.360buy.www:http/ f:cnt [] <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Trans
com.360buy.www:http/ f:fi [] \x00'\x8D\x00
com.360buy.www:http/ f:prot [] \x02\x00\x00
com.360buy.www:http/ f:pts [] \x00\x00\x01:':\xA6\xE2
com.360buy.www:http/ f:st [] \x00\x00\x00\x02
com.360buy.www:http/ f:ts [] \x00\x00\x01:'<\x8C}
com.360buy.www:http/ f:typ [] application/xhtml+xml
com.360buy.www:http/ h:Cache-Control [] max-age=120
com.360buy.www:http/ h:Connection [] close
com.360buy.www:http/ h:Content-Encoding [] gzip
com.360buy.www:http/ h:Content-Location [] http://www.360buy.com/index.htm
com.360buy.www:http/ h:Content-Type [] text/html; charset=gb2312
com.360buy.www:http/ h:Date [] Wed, 03 Oct 2012 15:27:02 GMT
com.360buy.www:http/ h:Last-Modified [] Wed, 03 Oct 2012 15:25:57 GMT
com.360buy.www:http/ h:Server [] JDWS
com.360buy.www:http/ h:Vary [] Accept-Encoding
com.360buy.www:http/ h:X-Cache [] MISS from TJ-HY-CNC-CDN-55.360buy.com
com.360buy.www:http/ h:_ip [] 125.39.96.182
com.360buy.www:http/ mk:__prsmrk__ [] 1349277947-925721513
com.360buy.www:http/ mk:_ftcmrk_ [] 1349277947-925721513
com.360buy.www:http/ mk:_gnmrk_ [] 1349277947-925721513
com.360buy.www:http/ mk:_injmrk_ [] y
com.360buy.www:http/ mtdt:_csh_ [] ?\x80\x00\x00
com.360buy.www:http/ ol:http://app.360buy.com/ [] \xE6\x89\x8B\xE6\x9C\xBA\xE4\xBA\xAC\xE4\xB8\x9C
---------------------------------------------- hit any key to continue or 'q' to quit ----------------------------------------------
com.360buy.www:http/ ol:http://book.360buy.com/ [] \xE5\x9B\xBE\xE4\xB9\xA6
com.360buy.www:http/ ol:http://caipiao.360buy.com/ [] \xE5\xBD\xA9\xE7\xA5\xA8
com.360buy.www:http/ ol:http://chat.360buy.com/jdchat/custom.action [] \xE5\x9C\xA8\xE7\xBA\xBF\xE5\xAE\xA2\xE6\x9C\x8D
com.360buy.www:http/ ol:http://chongzhi.360buy.com/ [] \xE5\x85\x85\xE5\x80\xBC
com.360buy.www:http/ ol:http://diy.360buy.com/ [] \xE8\xA3\x85\xE6\x9C\xBA\xE5\xA4\xA7\xE5\xB8\x88
com.360buy.www:http/ ol:http://e.360buy.com/index.html [] \xE7\x94\xB5\xE5\xAD\x90\xE4\xB9\xA6\xE5\x88\x8A
com.360buy.www:http/ ol:http://game.360buy.com/ [] \xE6\xB8\xB8\xE6\x88\x8F
com.360buy.www:http/ ol:http://help.360buy.com/ [] \xE5\xAE\xA2\xE6\x88\xB7\xE6\x9C\x8D\xE5\x8A\xA1
com.360buy.www:http/ ol:http://help.360buy.com/help/question-61.html [] \xE5\xB8\xB8\xE8\xA7\x81\xE9\x97\xAE\xE9\xA2\x98
com.360buy.www:http/ ol:http://home.360buy.com/ [] \xE6\x88\x91\xE7\x9A\x84\xE4\xBA\xAC\xE4\xB8\x9C
com.360buy.www:http/ ol:http://jd2008.360buy.com/JdHome/OrderList.aspx [] \xE6\x88\x91\xE7\x9A\x84\xE8\xAE\xA2\xE5\x8D\x95
com.360buy.www:http/ ol:http://jd2008.360buy.com/purchase/ShoppingCart.asp [] \xE5\x8E\xBB\xE8\xB4\xAD\xE7\x89\xA9\xE8\xBD\xA6\xE7\xBB\x93\xE7\xAE\x97
com.360buy.www:http/ ol:http://market.360buy.com/giftcard/ [] \xE7\xA4\xBC\xE5\x93\x81\xE5\x8D\xA1
com.360buy.www:http/ ol:http://market.360buy.com/giftcard/company/default. [] \xE4\xBC\x81\xE4\xB8\x9A\xE5\xAE\xA2\xE6\x88\xB7
com.360buy.www:http/ ol:http://mvd.360buy.com/ [] \xE9\x9F\xB3\xE5\x83\x8F
com.360buy.www:http/ ol:http://myjd.360buy.com/opinion/list.action [] \xE6\x8A\x95\xE8\xAF\x89\xE4\xB8\xAD\xE5\xBF\x83
com.360buy.www:http/ ol:http://myjd.360buy.com/repair/orderlist.action [] \xE5\x94\xAE\xE5\x90\x8E\xE6\x9C\x8D\xE5\x8A\xA1
com.360buy.www:http/ ol:http://read.360buy.com/ [] \xE5\x9C\xA8\xE7\xBA\xBF\xE8\xAF\xBB\xE4\xB9\xA6
com.360buy.www:http/ ol:http://sale.360buy.com/p10997.html [] \xE5\x8A\x9E\xE5\x85\xAC\xE7\x9B\xB4\xE9\x80\x9A\xE8\xBD\xA6
com.360buy.www:http/ ol:http://trip.360buy.com/ [] \xE6\x97\x85\xE8\xA1\x8C
com.360buy.www:http/ ol:http://www.360buy.com/ [] \xE9\xA6\x96\xE9\xA1\xB5
com.360buy.www:http/ ol:http://www.360buy.com/auto.html [] \xE6\xB1\xBD\xE8\xBD\xA6\xE7\x94\xA8\xE5\x93\x81
com.360buy.www:http/ ol:http://www.360buy.com/baby.html [] \xE6\xAF\x8D\xE5\xA9\xB4
com.360buy.www:http/ ol:http://www.360buy.com/bag.html [] \xE7\xA4\xBC\xE5\x93\x81\xE7\xAE\xB1\xE5\x8C\x85
---------------------------------------------- hit any key to continue or 'q' to quit ----------------------------------------------
com.360buy.www:http/ ol:http://www.360buy.com/beauty.html [] \xE4\xB8\xAA\xE6\x8A\xA4\xE5\x8C\x96\xE5\xA6\x86
com.360buy.www:http/ ol:http://www.360buy.com/clothing.html [] \xE6\x9C\x8D\xE9\xA5\xB0\xE9\x9E\x8B\xE5\xB8\xBD
com.360buy.www:http/ ol:http://www.360buy.com/computer.html [] \xE7\x94\xB5\xE8\x84\x91\xE3\x80\x81\xE5\x8A\x9E\xE5\x85\xAC
com.360buy.www:http/ ol:http://www.360buy.com/contact/service.html [] \xE5\xAE\xA2\xE6\x9C\x8D\xE9\x82\xAE\xE7\xAE\xB1
com.360buy.www:http/ ol:http://www.360buy.com/digital.html [] \xE6\x89\x8B\xE6\x9C\xBA\xE6\x95\xB0\xE7\xA0\x81
com.360buy.www:http/ ol:http://www.360buy.com/electronic.html [] \xE5\xAE\xB6\xE7\x94\xA8\xE7\x94\xB5\xE5\x99\xA8
com.360buy.www:http/ ol:http://www.360buy.com/food.html [] \xE9\xA3\x9F\xE5\x93\x81\xE9\xA5\xAE\xE6\x96\x99\xE3\x80\x81\xE4\xBF\x9D\xE5\x81\xA5\xE9\xA3\x9F\xE5\x93\x81
com.360buy.www:http/ ol:http://www.360buy.com/home.html [] \xE5\xAE\xB6\xE5\xB1\x85\xE5\xAE\xB6\xE8\xA3\x85
com.360buy.www:http/ ol:http://www.360buy.com/jewellery.html [] \xE7\x8F\xA0\xE5\xAE\x9D
com.360buy.www:http/ ol:http://www.360buy.com/kitchenware.html [] \xE5\x8E\xA8\xE5\x85\xB7
com.360buy.www:http/ ol:http://www.360buy.com/sports.html [] \xE8\xBF\x90\xE5\x8A\xA8\xE5\x81\xA5\xE5\xBA\xB7
com.360buy.www:http/ ol:http://www.360buy.com/toys.html [] \xE7\x8E\xA9\xE5\x85\xB7\xE4\xB9\x90\xE5\x99\xA8
com.360buy.www:http/ ol:http://www.360buy.com/watch.html [] \xE9\x92\x9F\xE8\xA1\xA8
com.360buy.www:http/ ol:http://www.360top.com/ [] 360TOP \xE5\xA5\xA2\xE4\xBE\x88\xE5\x93\x81
com.360buy.www:http/ ol:http://www.ehaoyao.com/ [] \xE4\xBA\xAC\xE4\xB8\x9C \xE5\xA5\xBD\xE8\x8D\xAF\xE5\xB8\x88
com.360buy.www:http/ ol:http://www.minitiao.com/ [] \xE8\xBF\xB7\xE4\xBD\xA0\xE6\x8C\x91
com.360buy.www:http/ ol:http://xiaoyuan.360buy.com/ [] \xE6\xA0\xA1\xE5\x9B\xAD\xE9\xA2\x91\xE9\x81\x93
com.360buy.www:http/ p:c [] \xE4\xBA\xAC\xE4\xB8\x9C\xE7\xBD\x91\xE4\xB8\x8A\xE5\x95\x86\xE5\x9F\x8E-\xE7\xBB\xBC\xE5\x90\x88\xE7\xBD\x91\xE8\xB4\xAD\xE9\xA6\x96\xE9\x80\x89\xEF\xBC\x8C\xE6\xAD\xA3\xE5\x93\x81\xE8\xA1\x8C\xE8
com.360buy.www:http/ p:sig [] HNC\xF3\x87\xEF\x8E\xD1mB\xE4\xE3\xA2\xA3\x1D\xEA
com.360buy.www:http/ p:st [] \x02\x00\x00
com.360buy.www:http/ p:t [] \xE4\xBA\xAC\xE4\xB8\x9C\xE7\xBD\x91\xE4\xB8\x8A\xE5\x95\x86\xE5\x9F\x8E-\xE7\xBB\xBC\xE5\x90\x88\xE7\xBD\x91\xE8\xB4\xAD\xE9\xA6\x96\xE9\x80\x89\xEF\xBC\x8C\xE6\xAD\xA3\xE5\x93\x81\xE8\xA1\x8C\xE8
com.360buy.www:http/ s:s [] ?\x80\x00\x00


5. ./nutch updatedb

6. ./nutch solrindex http://localhost:8983/solr/ 1349277947-925721513
./nutch solrindex http://localhost:8983/solr/ -reindex
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值