Analysis of the CSDN blog page structure
====== search criteria
<div class="interact">
<a href="http://my.csdn.net/my/letter/send/cping1982" class="letter" title="[发私信]"></a>
<!--<a href="#" class="attented" title="已关注"></a>-->
<a href="#" class="attent" id="span_add_follow" title="[加关注]"></a>
</div>
<div id="blog_medal">
</div>
<ul id="blog_rank">
<li>访问:<span>1103783次</span></li>
<li>积分:<span>16734分</span></li>
<li>排名:<span>第72名</span></li>
</ul>
<ul id="blog_statistics">
<li>原创:<span>310篇</span></li>
<li>转载:<span>65篇</span></li>
<li>译文:<span>3篇</span></li>
<li>评论:<span>4894条</span></li>
</ul>
</ul>
</div>
====
<div id="panel_Category" class="panel">
<ul class="panel_head"><span>文章分类</span></ul>
<ul class="panel_body">
<li>
<a href="http://blog.csdn.net/cping1982/article/category/447055">JAVA应用</a><span>(79)</span>
</li>
<li>
<a href="http://blog.csdn.net/cping1982/article/category/455610">JAVA游戏开发</a><span>(58)</span>
</li>
<li>
<a href="http://blog.csdn.net/cping1982/article/category/403546">原创小说</a><span>(46)</span>
</li>
<li>
<a href="http://blog.csdn.net/cping1982/article/category/496919">技术文章</a><span>(61)</span>
</li>
<li>
<a href="http://blog.csdn.net/cping1982/article/category/634934">杂文杂记</a><span>(50)</span>
</li>
<li>
<a href="http://blog.csdn.net/cping1982/article/category/530863">转载文章</a><span>(7)</span>
</li>
<li>
<a href="http://blog.csdn.net/cping1982/article/category/865921">Android移植</a><span>(3)</span>
</li>
</ul>
</div>
===
<div id="hotarticls" class="panel">
<ul class="panel_head"><span>阅读排行</span></ul>
<ul class="panel_body">
<li>
<a href="/cping1982/article/details/2166968" title="ExtJS2.0开发与实践笔记[0]——初识ExtJS">ExtJS2.0开发与实践笔记[0]——...</a> (31772)
</li>
<li>
<a href="/cping1982/article/details/6176191" title="Android游戏框架Libgdx使用入门">Android游戏框架Libgdx使用入...</a> (27603)
</li>
<li>
<a href="/cping1982/article/details/6460357" title="浅谈2011年上半年Java游戏领域动态">浅谈2011年上半年Java游戏领域动态</a> (25388)
</li>
<li>
<a href="/cping1982/article/details/5186072" title="未睹棺椁先哭君——谷歌墓志铭">未睹棺椁先哭君——谷歌墓志铭</a> (22452)
</li>
<li>
<a href="/cping1982/article/details/6227775" title="Android游戏框架AndEngine使用入门">Android游戏框架AndEngine...</a> (21613)
</li>
<li>
<a href="/cping1982/article/details/1931539" title="浅谈java.util.concurrent包的并发处理">浅谈java.util.concurre...</a> (17942)
</li>
<li>
<a href="/cping1982/article/details/6072188" title="为什么没有好用的Android游戏引擎?">为什么没有好用的Android游戏引擎?</a> (17522)
</li>
<li>
<a href="/cping1982/article/details/2806598" title="Java&.Net虚拟机精简(GreenJVM&GreenDotNet发布)">Java&.Net虚拟机精简(Green...</a> (16850)
</li>
<li>
<a href="/cping1982/article/details/6006760" title="Android游戏开发示例——弹幕+战棋">Android游戏开发示例——弹幕+战棋</a> (16316)
</li>
<li>
<a href="/cping1982/article/details/1869430" title="中国本土化编程(汉语编程)之我见">中国本土化编程(汉语编程)之我见</a> (15610)
</li>
</ul>
</div>
= one item
<div class="list_item article_item">
<div class="article_title">
<span class="ico ico_type_Repost"></span>
<h3>
<span class="link_title"><a href="/garyyding/article/details/7063265">
Learn JOGL
</a></span>
</h3>
</div>
<div class="article_description">
wei495715356 has some introduce for Nehe
http://www.iteye.com/topic/671095
HCQmaker has some course for OpenGL
http://hcqmaker.iteye.com/blog/241320
wjyjimy has some course for OpenGL
h... </div>
<div class="article_manage">
<span class="link_postdate">2011-12-12 13:29</span>
<span class="link_view" title="阅读次数"><a href="/garyyding/article/details/7063265" title="阅读次数">阅读</a>(27)</span>
<span class="link_comments" title="评论次数"><a href="/garyyding/article/details/7063265#comments" title="评论次数">评论</a>(0)</span>
<span class="link_edit"><a href="http://write.blog.csdn.net/postedit/7063265" title="编辑">编辑</a></span>
<span class="link_delete"><a href="javascript:void(0);" onclick="javascript:deleteArticle(7063265);return false;" title="删除">删除</a></span>
</div>
</div>
Web-Harvest configuration
<?xml version="1.0" encoding="UTF-8"?>
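<!-- Expects the following initial variable: searchResult - wrapper whose getWrappedObject() supplies "user" and "rootWebSite" and receives the scraped profile, categories, archives and articles -->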
<config charset="UTF-8">
<include path="functions.xml" />
<!-- Seed variables read from the injected searchResult object: the wrapped
     result holder itself, the blog user, and the URLs to scrape.
     NOTE(review): overwrite="false" presumably keeps a value the caller has
     already defined; confirm against the var-def reference. -->
<var-def name="searchResultObject" overwrite="false">
  <template>${searchResult.getWrappedObject()}</template>
</var-def>
<var-def name="currentUser" overwrite="false">
  <template>${searchResult.getWrappedObject().user}</template>
</var-def>
<!-- Blog home page, built as rootWebSite + "/" + user. The template stays on a
     single line so the resulting URL carries no trailing line break. -->
<var-def name="targetWebsite" overwrite="false">
  <template>${searchResult.getWrappedObject().rootWebSite}/${searchResult.getWrappedObject().user}</template>
</var-def>
<!-- Site root; prepended to the relative article links found on listing pages. -->
<var-def name="rootWebsite" overwrite="false">
  <template>${searchResult.getWrappedObject().rootWebSite}</template>
</var-def>
<!-- Download the blog home page and convert its HTML to XML so the XPath
     queries below can run against it. -->
<var-def name="doc">
  <html-to-xml>
    <http url="${targetWebsite}"/>
  </html-to-xml>
</var-def>

<!-- Profile panel; the blog_rank and blog_statistics lists are read from it. -->
<var-def name="interact">
  <xpath expression="//ul[@class='panel_body profile']">
    <var name="doc"/>
  </xpath>
</var-def>

<!-- One <li> per entry of the category side panel. -->
<var-def name="categories">
  <xpath expression="//div[@id='panel_Category']/ul[@class='panel_body']/li">
    <var name="doc"/>
  </xpath>
</var-def>

<!-- One <li> per entry of the archive side panel. -->
<var-def name="fileArchive">
  <xpath expression="//div[@id='panel_Archive']/ul[@class='panel_body']/div[@id='archive_list']/li">
    <var name="doc"/>
  </xpath>
</var-def>
<!-- Profile counters. Each var-def calls the "regexp" function from
     functions.xml with:
       rule     - [\d]+, i.e. the run of digits inside the list item
       content  - the profile panel held in "interact"
       itemPath - XPath of the list item carrying the figure
     All itemPath values are written on a single line so no XPath string
     contains a line break. -->

<!-- blog_rank item 1: visit count (访问). -->
<var-def name="interact_fangwen">
  <call name="regexp">
    <call-param name="rule">[\d]+</call-param>
    <call-param name="content">
      <var name="interact"/>
    </call-param>
    <call-param name="itemPath">//ul[@id='blog_rank']/li[1]</call-param>
  </call>
</var-def>

<!-- blog_rank item 2: points (积分). -->
<var-def name="interact_jifen">
  <call name="regexp">
    <call-param name="rule">[\d]+</call-param>
    <call-param name="content">
      <var name="interact"/>
    </call-param>
    <call-param name="itemPath">//ul[@id='blog_rank']/li[2]</call-param>
  </call>
</var-def>

<!-- blog_rank item 3: ranking (排名). -->
<var-def name="interact_paiming">
  <call name="regexp">
    <call-param name="rule">[\d]+</call-param>
    <call-param name="content">
      <var name="interact"/>
    </call-param>
    <call-param name="itemPath">//ul[@id='blog_rank']/li[3]</call-param>
  </call>
</var-def>

<!-- blog_statistics item 1: original articles (原创). -->
<var-def name="interact_original">
  <call name="regexp">
    <call-param name="rule">[\d]+</call-param>
    <call-param name="content">
      <var name="interact"/>
    </call-param>
    <call-param name="itemPath">//ul[@id='blog_statistics']/li[1]</call-param>
  </call>
</var-def>

<!-- blog_statistics item 2: reposted articles (转载). -->
<var-def name="interact_get">
  <call name="regexp">
    <call-param name="rule">[\d]+</call-param>
    <call-param name="content">
      <var name="interact"/>
    </call-param>
    <call-param name="itemPath">//ul[@id='blog_statistics']/li[2]</call-param>
  </call>
</var-def>

<!-- blog_statistics item 3: translated articles (译文). -->
<var-def name="interact_translate">
  <call name="regexp">
    <call-param name="rule">[\d]+</call-param>
    <call-param name="content">
      <var name="interact"/>
    </call-param>
    <call-param name="itemPath">//ul[@id='blog_statistics']/li[3]</call-param>
  </call>
</var-def>

<!-- blog_statistics item 4: comments (评论). -->
<var-def name="interact_comment">
  <call name="regexp">
    <call-param name="rule">[\d]+</call-param>
    <call-param name="content">
      <var name="interact"/>
    </call-param>
    <call-param name="itemPath">//ul[@id='blog_statistics']/li[4]</call-param>
  </call>
</var-def>
<!-- Pass the seven profile counters to the result holder. Argument order is
     points, rank, visits, originals, reposts, translations, comments. -->
<script><![CDATA[
  Object o = searchResult.getWrappedObject();
  o.addProfile(
      interact_jifen.toString(),
      interact_paiming.toString(),
      interact_fangwen.toString(),
      interact_original.toString(),
      interact_get.toString(),
      interact_translate.toString(),
      interact_comment.toString());
]]></script>
<!-- Category pass: register every category from the side panel, download its
     listing page, and record each article entry found there. -->
<loop item="category" index="i" filter="unique">
  <list>
    <var name="categories"/>
  </list>
  <body>
    <!-- Category name is the link text; "u" is the link target. -->
    <var-def name="title">
      <xpath expression="//a/text()">
        <var name="category"/>
      </xpath>
    </var-def>
    <var-def name="u">
      <xpath expression="//a/@href[1]">
        <var name="category"/>
      </xpath>
    </var-def>
    <script><![CDATA[
      Object o = searchResult.getWrappedObject();
      o.addFileCategory(title.toString(), u.toString());
    ]]></script>

    <!-- Fetch the category listing and keep one node per article entry. -->
    <var-def name="category_doc">
      <xpath expression="//div[@class='list_item article_item']">
        <html-to-xml>
          <http url="${u}"/>
        </html-to-xml>
      </xpath>
    </var-def>

    <loop item="onecategory" index="j" filter="unique">
      <list>
        <var name="category_doc"/>
      </list>
      <body>
        <!-- Posting timestamp text. -->
        <var-def name="f_datetime">
          <xpath expression="//div[@class='article_manage']/span[@class='link_postdate']/text()">
            <var name="onecategory"/>
          </xpath>
        </var-def>
        <!-- Digits taken from the "read" counter span. -->
        <var-def name="f_link_view">
          <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content"><var name="onecategory"/></call-param>
            <call-param name="itemPath">//div[@class='article_manage']/span[@class='link_view']/text()</call-param>
          </call>
        </var-def>
        <!-- Digits taken from the "comments" counter span. -->
        <var-def name="f_link_comments">
          <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content"><var name="onecategory"/></call-param>
            <call-param name="itemPath">//div[@class='article_manage']/span[@class='link_comments']/text()</call-param>
          </call>
        </var-def>
        <!-- Article title text and its href. -->
        <var-def name="f_name">
          <xpath expression="//div[@class='article_title']/h3/span[@class='link_title']/a/text()">
            <var name="onecategory"/>
          </xpath>
        </var-def>
        <var-def name="f_url">
          <xpath expression="//div[@class='article_title']/h3/span[@class='link_title']/a/@href">
            <var name="onecategory"/>
          </xpath>
        </var-def>
        <!-- Summary text shown under the title. -->
        <var-def name="f_description">
          <xpath expression="//div[@class='article_description']/text()">
            <var name="onecategory"/>
          </xpath>
        </var-def>
        <!-- The article URL is rootWebsite + href. -->
        <script><![CDATA[
          Object o = searchResult.getWrappedObject();
          o.addFileIntoCategory(
              f_name.toString(),
              rootWebsite.toString() + f_url.toString(),
              title.toString(),
              f_datetime.toString(),
              f_link_view.toString(),
              f_link_comments.toString(),
              f_description.toString());
        ]]></script>
      </body>
    </loop>
  </body>
</loop>
<!-- Archive pass: same procedure as the category pass, but driven by the
     archive side panel and stored through addFileArchive /
     addFileIntoFileArchives. The loop item keeps the name "category". -->
<loop item="category" index="i" filter="unique">
  <list>
    <var name="fileArchive"/>
  </list>
  <body>
    <!-- Archive label is the link text; "u" is the link target. -->
    <var-def name="title">
      <xpath expression="//a/text()">
        <var name="category"/>
      </xpath>
    </var-def>
    <var-def name="u" overwrite="true">
      <xpath expression="//a/@href[1]">
        <var name="category"/>
      </xpath>
    </var-def>
    <script><![CDATA[
      Object o = searchResult.getWrappedObject();
      o.addFileArchive(title.toString(), u.toString());
    ]]></script>

    <!-- Fetch the archive listing and keep one node per article entry. -->
    <var-def name="category_doc">
      <xpath expression="//div[@class='list_item article_item']">
        <html-to-xml>
          <http url="${u}"/>
        </html-to-xml>
      </xpath>
    </var-def>

    <loop item="onecategory" index="j" filter="unique">
      <list>
        <var name="category_doc"/>
      </list>
      <body>
        <!-- Posting timestamp text. -->
        <var-def name="f_datetime">
          <xpath expression="//div[@class='article_manage']/span[@class='link_postdate']/text()">
            <var name="onecategory"/>
          </xpath>
        </var-def>
        <!-- Digits taken from the "read" counter span. -->
        <var-def name="f_link_view">
          <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content"><var name="onecategory"/></call-param>
            <call-param name="itemPath">//div[@class='article_manage']/span[@class='link_view']/text()</call-param>
          </call>
        </var-def>
        <!-- Digits taken from the "comments" counter span. -->
        <var-def name="f_link_comments">
          <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content"><var name="onecategory"/></call-param>
            <call-param name="itemPath">//div[@class='article_manage']/span[@class='link_comments']/text()</call-param>
          </call>
        </var-def>
        <!-- Article title text and its href. -->
        <var-def name="f_name">
          <xpath expression="//div[@class='article_title']/h3/span[@class='link_title']/a/text()">
            <var name="onecategory"/>
          </xpath>
        </var-def>
        <var-def name="f_url">
          <xpath expression="//div[@class='article_title']/h3/span[@class='link_title']/a/@href">
            <var name="onecategory"/>
          </xpath>
        </var-def>
        <!-- Summary text shown under the title. -->
        <var-def name="f_description">
          <xpath expression="//div[@class='article_description']/text()">
            <var name="onecategory"/>
          </xpath>
        </var-def>
        <!-- The article URL is rootWebsite + href. -->
        <script><![CDATA[
          Object o = searchResult.getWrappedObject();
          o.addFileIntoFileArchives(
              f_name.toString(),
              rootWebsite.toString() + f_url.toString(),
              title.toString(),
              f_datetime.toString(),
              f_link_view.toString(),
              f_link_comments.toString(),
              f_description.toString());
        ]]></script>
      </body>
    </loop>
  </body>
</loop>
<!-- Export step. First replace the "categories" and "fileArchive" context
     variables with the collections accumulated on the result holder, then
     write them to csdn_<user>.xml. -->
<script><![CDATA[
  SetContextVar("categories", searchResult.getWrappedObject().getCategories());
  SetContextVar("fileArchive", searchResult.getWrappedObject().getFileArchives());
]]></script>
<!-- Generated layout:
       user > categories > category > article > description
       user > archives   > archive  > article
     The CDATA payloads are left unindented because their text is copied into
     the output file.
     NOTE(review): values are interpolated as-is; unless the getters already
     escape them, a title containing & or a double quote would make the
     generated file malformed. -->
<file action="write" path="csdn_${currentUser}.xml" charset="UTF-8">
  <template>
    <![CDATA[ <user name="${currentUser}"> <categories>]]>
  </template>
  <loop item="category" index="i" filter="unique">
    <list>
      <var name="categories"/>
    </list>
    <body>
      <template><![CDATA[
<category name="${category.getWrappedObject().getName()}" url="${category.getWrappedObject().getUrl()}"> ]]>
      </template>
      <!-- Publish this category's articles as "files" for the inner loop. -->
      <script><![CDATA[
        SetContextVar("files", category.getWrappedObject().getFiles());
      ]]></script>
      <!-- Inner index is "j" so it no longer reuses the outer loop's "i". -->
      <loop item="file" index="j" filter="unique">
        <list>
          <var name="files"/>
        </list>
        <body>
          <template><![CDATA[
<article name="${file.getWrappedObject().getFileName()}" url="${file.getWrappedObject().getFileUrl()}" dateTime="${file.getWrappedObject().getFileDateTime()}" read="${file.getWrappedObject().getCount_read()}" comments="${file.getWrappedObject().getCount_comment()}">
<description>
${file.getWrappedObject().getDescription()}
</description>
</article>
]]></template>
        </body>
      </loop>
      <template><![CDATA[ </category> ]]></template>
    </body>
  </loop>
  <!-- <archives> is opened exactly once here; it used to be emitted twice
       while being closed only once, leaving the output not well-formed. -->
  <![CDATA[ </categories><archives> ]]>
  <loop item="category" index="i" filter="unique">
    <list>
      <var name="fileArchive"/>
    </list>
    <body>
      <template><![CDATA[
<archive name="${category.getWrappedObject().getName()}" url="${category.getWrappedObject().getUrl()}"> ]]>
      </template>
      <!-- Publish this archive's articles as "files" for the inner loop. -->
      <script><![CDATA[
        SetContextVar("files", category.getWrappedObject().getFiles());
      ]]></script>
      <loop item="file" index="j" filter="unique">
        <list>
          <var name="files"/>
        </list>
        <body>
          <template><![CDATA[
<article name="${file.getWrappedObject().getFileName()}" url="${file.getWrappedObject().getFileUrl()}" dateTime="${file.getWrappedObject().getFileDateTime()}" read="${file.getWrappedObject().getCount_read()}" comments="${file.getWrappedObject().getCount_comment()}"/>
]]></template>
        </body>
      </loop>
      <template><![CDATA[ </archive> ]]></template>
    </body>
  </loop>
  <![CDATA[ </archives></user> ]]>
</file>
</config>
functions.xml
<?xml version="1.0" encoding="UTF-8"?>
<config>
  <!--
    Download multi-page list of items.
    @param pageUrl   - URL of starting page
    @param itemXPath - XPath expression to obtain single item in the list
    @param nextXPath - XPath expression to URL for the next page
    @param maxloops  - maximum number of pages downloaded
    @return list of all downloaded items
  -->
  <function name="download-multipage-list">
    <return>
      <while condition="${pageUrl.toString().length() != 0}" maxloops="${maxloops}" index="i">
        <!-- Fetch the current page and work out the next page URL; this part
             is wrapped in <empty> and the item XPath below follows it. -->
        <empty>
          <var-def name="content">
            <html-to-xml>
              <http url="${pageUrl}"/>
            </html-to-xml>
          </var-def>
          <var-def name="nextLinkUrl">
            <xpath expression="${nextXPath}">
              <var name="content"/>
            </xpath>
          </var-def>
          <var-def name="pageUrl">
            <template>${sys.fullUrl(pageUrl.toString(), nextLinkUrl.toString())}</template>
          </var-def>
        </empty>
        <xpath expression="${itemXPath}">
          <var name="content"/>
        </xpath>
      </while>
    </return>
  </function>

  <!--
    Apply a regular expression to the nodes selected by an XPath expression.
    @param rule     - regular expression pattern
    @param content  - XML content the XPath is evaluated against
    @param itemPath - XPath expression selecting the text to search
    @return the ${_0} value of the regexp result
            (NOTE(review): presumably the whole match; confirm against the
            regexp processor reference)
    The empty script element that used to precede <return> was removed.
  -->
  <function name="regexp">
    <return>
      <regexp>
        <regexp-pattern><var name="rule"/></regexp-pattern>
        <regexp-source>
          <xpath expression="${itemPath}">
            <var name="content"/>
          </xpath>
        </regexp-source>
        <regexp-result>
          <template>${_0}</template>
        </regexp-result>
      </regexp>
    </return>
  </function>
</config>
output
<user name="garyyding"> <categories> <category name="OpenGL" url="http://blog.csdn.net/garyyding/article/category/950136"> <article name="Learn JOGL" url="http://blog.csdn.net/garyyding/article/details/7063265" dateTime="2011-12-12 13:29" read="30" comments="0"> <description> wei495715356 has some introduce for Nehe http://www.iteye.com/topic/671095 HCQmaker has some course for OpenGL http://hcqmaker.iteye.com/blog/241320 wjyjimy has some course for OpenGL h... </description> </article> <article name="Learn Java OpenGL from NeHe ( Jogl 1.1.2)" url="http://blog.csdn.net/garyyding/article/details/7063205" dateTime="2011-12-12 13:15" read="53" comments="0"> <description> It is a good place to learn OpenGL (Java ) NeHe http://nehe.gamedev.net/ NeHe demo explaination Lessons 01 - 05Lessons 06 - 10Lessons 11 - 15Lessons 16 - 20Lessons 21 - 25Lessons 26 - 30... </description> </article> </category> <category name="Game" url="http://blog.csdn.net/garyyding/article/category/950137"> <article name="Learn Java OpenGL from NeHe ( Jogl 1.1.2)" url="http://blog.csdn.net/garyyding/article/details/7063205" dateTime="2011-12-12 13:15" read="53" comments="0"> <description> It is a good place to learn OpenGL (Java ) NeHe http://nehe.gamedev.net/ NeHe demo explaination Lessons 01 - 05Lessons 06 - 10Lessons 11 - 15Lessons 16 - 20Lessons 21 - 25Lessons 26 - 30... </description> </article> </category> <category name="Other" url="http://blog.csdn.net/garyyding/article/category/951538"> <article name="free svn repository -- www.assembla.com" url="http://blog.csdn.net/garyyding/article/details/7162293" dateTime="2011-12-29 15:43" read="22" comments="0"> <description> A article has introduced some free svn repositories. https://www.assembla.com/user/one_page_signup/software_developers_integrated?space_type=catalog I have tried assembla. It is good. My repo... 
</description> </article> <article name="My PMP" url="http://blog.csdn.net/garyyding/article/details/7069177" dateTime="2011-12-14 09:37" read="22" comments="0"> <description> https://my.pmi.org/ login as gary.ding report PDU https://ccrs.pmi.org/Certificants/ClaimPDU.aspx Input provider code, you can select course, for example 2858,2854... </description> </article> </category> </categories><archives> <archives> <archive name="2012年03月" url="http://blog.csdn.net/garyyding/article/month/2012/03"> <article name="Web-Harvest(1)" url="http://blog.csdn.net/garyyding/article/details/7409845" dateTime="2012-03-30 09:14" read="8" comments="0"/> <article name="Use Web-Harvest to data-extract from www.vdisk.cn" url="http://blog.csdn.net/garyyding/article/details/7361178" dateTime="2012-03-16 15:34" read="24" comments="0"/> </archive> <archive name="2011年12月" url="http://blog.csdn.net/garyyding/article/month/2011/12"> <article name="free svn repository -- www.assembla.com" url="http://blog.csdn.net/garyyding/article/details/7162293" dateTime="2011-12-29 15:43" read="22" comments="0"/> <article name="My PMP" url="http://blog.csdn.net/garyyding/article/details/7069177" dateTime="2011-12-14 09:37" read="22" comments="0"/> <article name="Learn JOGL" url="http://blog.csdn.net/garyyding/article/details/7063265" dateTime="2011-12-12 13:29" read="30" comments="0"/> <article name="Learn Java OpenGL from NeHe ( Jogl 1.1.2)" url="http://blog.csdn.net/garyyding/article/details/7063205" dateTime="2011-12-12 13:15" read="53" comments="0"/> </archive> <archive name="2011年10月" url="http://blog.csdn.net/garyyding/article/month/2011/10"> <article name="Install Android4.0" url="http://blog.csdn.net/garyyding/article/details/6890177" dateTime="2011-10-20 14:24" read="87" comments="0"/> </archive> <archive name="2010年04月" url="http://blog.csdn.net/garyyding/article/month/2010/04"> <article name="Mobile phone development" url="http://blog.csdn.net/garyyding/article/details/5508757" 
dateTime="2010-04-20 22:39" read="23" comments="0"/> </archive> <archive name="2009年11月" url="http://blog.csdn.net/garyyding/article/month/2009/11"> <article name="google app engine" url="http://blog.csdn.net/garyyding/article/details/4866348" dateTime="2009-11-24 20:41" read="49" comments="0"/> <article name="JbossServer5.1GA isolated for 多个 EAR" url="http://blog.csdn.net/garyyding/article/details/4828094" dateTime="2009-11-18 13:50" read="39" comments="0"/> </archive> <archive name="2008年12月" url="http://blog.csdn.net/garyyding/article/month/2008/12"> <article name="关于appfuse" url="http://blog.csdn.net/garyyding/article/details/3514216" dateTime="2008-12-14 09:54" read="40" comments="0"/> </archive> </archives></user>