Web-Harvest(1)

Pages on www.vdisk.cn (for example, http://www.vdisk.cn/msdiaoxian) can be scraped with the following XPath expressions.


======link of ALLFILES
<div class='tag'><a href='?tag=ALLFILES&p=1' title='ALLFILES(339)'>ALLFILES(339)</a></div>
"//a[@href[contains(., '?tag=ALLFILES')]]/@href[1]"

======each page
<a href='?tag=ALLFILES&p=2' title='see'>2</a>
"//a[@href[contains(., '?tag=ALLFILES')]]/@href[1]"

======each category
<div class='tag'><a href='?tag=%E9%98%85%E8%AF%BB&p=1' title='read(2)'>read(2)</a></div>

===== each file
<table width="100%" class="ft"><tbody>
<td width="*">
<a href="/down/index/9405067" target="_blank">s1.0.2.apk</a>
</td>
<td width="130" align="right"><font style="color:#ccc">1.08 MB</font></td>

"//table[@class='ft']/tbody/tr"
for filename="tr/td[1]/a/text()"
for file url="tr/td[1]/a/@href[1]"
for file size="tr/td[2]//font/text()"

===== get file name/url only, not including the file size
<a href="/down/index/9405067" target="_blank">s1.0.2.apk</a>
"//a[@href[contains(., '/down/index/')]]"


So prepare the following Web-Harvest configuration XML:

<?xml version="1.0" encoding="UTF-8"?>

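<!-- Expects the following initial variable: searchResult - a wrapper whose wrapped object provides the user and rootWebSite properties and the addCategory, addFile and getCategories methods called in this configuration -->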

<config charset="UTF-8">
	<!-- 
	<include path="functions.xml" />
	 -->
	 
	<!-- Derives the working variables from the searchResult object.
	     Every definition uses overwrite="false"; presumably a value that is
	     already present in the context is kept (confirm against the
	     Web-Harvest var-def documentation). -->

	<!-- search and currentUser are both set to the wrapped object's user property. -->
	<var-def name="search" overwrite="false">
		<template>${searchResult.getWrappedObject().user}</template>
	</var-def>
	<var-def name="searchResultObject" overwrite="false">
		<template>${searchResult.getWrappedObject()}</template>
	</var-def>
	<var-def name="currentUser" overwrite="false">
		<template>${searchResult.getWrappedObject().user}</template>
	</var-def>

	<!-- Start URL: rootWebSite + "/" + user. The template is kept on a single
	     line so that no line break or indentation can end up inside the URL. -->
	<var-def name="targetWebsite" overwrite="false">
		<template>${searchResult.getWrappedObject().rootWebSite}/${searchResult.getWrappedObject().user}</template>
	</var-def>
	<var-def name="rootWebsite" overwrite="false">
		<template>${searchResult.getWrappedObject().rootWebSite}</template>
	</var-def>

	<!-- Download the start page and collect every category link
	     (anchors directly inside <div class='tag'>). -->
	<var-def name="urls">
		<xpath expression="//div[@class='tag']/a">
			<html-to-xml>
				<http url="${targetWebsite}" />
			</html-to-xml>
		</xpath>
	</var-def>

	<!-- For each distinct category link: register the category on the
	     searchResult object, download the category page and register every
	     file row found in it. -->
	<loop item="url" index="i" filter="unique">
		<list>
			<var name="urls" />
		</list>
		<body>
			<!-- Category title: the part of the link's title attribute that
			     precedes the last "(", e.g. "read(2)" gives "read". -->
			<var-def name="title">
				<regexp>
					<regexp-pattern>(.*)\(.*</regexp-pattern>
					<regexp-source>
						<xpath expression="//@title">
							<var name="url" />
						</xpath>
					</regexp-source>
					<regexp-result>
						<template>${_1}</template>
					</regexp-result>
				</regexp>
			</var-def>

			<!-- Category href exactly as it appears in the page, e.g. "?tag=...&p=1". -->
			<var-def name="u">
				<xpath expression="//@href">
					<var name="url" />
				</xpath>
			</var-def>

			<script><![CDATA[
                 Object o=searchResult.getWrappedObject();
                 o.addCategory(title.toString(),u.toString());
             ]]></script>

			<!-- Download the category page and select its file rows.
			     The URL is built the same way as the category URL written to
			     the output file; nothing is appended after the query string,
			     so the last query parameter is not altered.
			     NOTE(review): only the page named by the link itself is
			     fetched; further result pages (p=2 and up) are not followed. -->
			<var-def name="urls1">
				<xpath expression="//table[@class='ft']/tbody/tr">
					<html-to-xml>
						<http url="${targetWebsite}/${u}" />
					</html-to-xml>
				</xpath>
			</var-def>

			<!-- For each distinct row: f1 = file name, f2 = download href,
			     f3 = file size text. -->
			<loop item="url1" index="j" filter="unique">
				<list>
					<var name="urls1" />
				</list>
				<body>
					<var-def name="f1">
						<xpath expression="tr/td[1]/a/text()">
							<var name="url1" />
						</xpath>
					</var-def>
					<var-def name="f2">
						<xpath expression="tr/td[1]/a/@href[1]">
							<var name="url1" />
						</xpath>
					</var-def>
					<var-def name="f3">
						<xpath expression="tr/td[2]//font/text()">
							<var name="url1" />
						</xpath>
					</var-def>

					<!-- Argument order of addFile: name, size, url, category title. -->
					<script><![CDATA[
             Object o=searchResult.getWrappedObject();
             o.addFile(f1.toString(),f3.toString(),f2.toString(),title.toString());
             ]]></script>
				</body>
			</loop>

		</body>
	</loop>

	<!-- Expose the collected categories to the configuration as the
	     "categories" variable so the loop below can iterate over them. -->
	<script><![CDATA[   
    SetContextVar("categories", searchResult.getWrappedObject().getCategories());                                            
  ]]></script>

	<!-- Write <currentUser>.xml: one <user> element containing a <category>
	     per collected category and a <file> per file of that category.
	     Attribute values are passed through sys.escapeXml so that characters
	     such as "&" (present in every category URL) do not make the output
	     ill-formed.
	     NOTE(review): assumes the getters return strings, as the values were
	     stored via toString() when they were added; confirm. -->
	<file action="write" path="${currentUser}.xml" charset="UTF-8">
		<template>
            <![CDATA[ <user name="${sys.escapeXml(currentUser.toString())}"> ]]>
		</template>
		<loop item="category" index="i" filter="unique">
			<list>
				<var name="categories" />
			</list>
			<body>
				<template>
            <![CDATA[ <category name="${sys.escapeXml(category.getWrappedObject().getName())}" url="${sys.escapeXml(targetWebsite.toString())}/${sys.escapeXml(category.getWrappedObject().getUrl())}"> ]]>
				</template>

				<!-- Expose the files of the current category as the "files" variable. -->
				<script><![CDATA[   
           SetContextVar("files", category.getWrappedObject().getFiles());                                            
        ]]></script>

				<!-- The inner index has its own name so it does not shadow
				     the outer loop's index. -->
				<loop item="file" index="k" filter="unique">
					<list>
						<var name="files" />
					</list>
					<body>
						<template>
            <![CDATA[ <file name="${sys.escapeXml(file.getWrappedObject().getFileName())}" url="${sys.escapeXml(rootWebsite.toString())}${sys.escapeXml(file.getWrappedObject().getFileUrl())}" size="${sys.escapeXml(file.getWrappedObject().getFileSize())}"/> ]]>
						</template>

					</body>
				</loop>

				<template>
            <![CDATA[ </category> ]]>
				</template>

			</body>
		</loop>
		<!-- The closing tag is emitted through a template like every other
		     fragment, instead of as bare text inside the file element. -->
		<template>
            <![CDATA[ </user> ]]>
		</template>
	</file>

</config>




The resulting output file:

<user name="msdiaoxian">
<category name="阅读" url="http://www.vdisk.cn/msdiaoxian/?tag=%E9%98%85%E8%AF%BB&p=1">
<file name="福昕PDF阅读4.3.1.218.7z" url="http://www.vdisk.cn/down/index/6831851" size="4.50 MB"/>
<file name="PDFReader7.7z" url="http://www.vdisk.cn/down/index/5739663" size="5.54 MB"/>
</category>
<category name="输入工具" url="http://www.vdisk.cn/msdiaoxian/?tag=%E8%BE%93%E5%85%A5%E5%B7%A5%E5%85%B7&p=1">
<file name="QQ五笔0.1倍QQ升级加速_14_303.exe" url="http://www.vdisk.cn/down/index/6108556" size="10.37 MB"/>
<file name="极点五笔扩展牛津英汉辞典.rar" url="http://www.vdisk.cn/down/index/5992907" size="4.98 MB"/>
<file name="QQ五笔QQ0.1倍加速QQ升级Wubi_Setup_13_283.exe" url="http://www.vdisk.cn/down/index/5971172" size="10.72 MB"/>
<file name="小鸭五笔 V3.2.1016官方安装版.EXE" url="http://www.vdisk.cn/down/index/5971149" size="2.27 MB"/>
<file name="搜狗拼音输入法 V5.1.5272 官方版.EXE" url="http://www.vdisk.cn/down/index/5246711" size="17.41 MB"/>
</category>
<category name="软件工程" url="http://www.vdisk.cn/msdiaoxian/?tag=%E8%BD%AF%E4%BB%B6%E5%B7%A5%E7%A8%8B&p=1">
<file name="现代软件工程 张家浩-ppt.rar" url="http://www.vdisk.cn/down/index/5442871" size="13.67 MB"/>
<file name="实用软件工程教程资源.zip" url="http://www.vdisk.cn/down/index/5442868" size="5.21 MB"/>
<file name="软件工程――原理、方法和工具资源.rar" url="http://www.vdisk.cn/down/index/5442862" size="5.88 MB"/>
<file name="软件工程2010.rar" url="http://www.vdisk.cn/down/index/5442852" size="5.53 MB"/>
</category>
<category name="课程作业" url="http://www.vdisk.cn/msdiaoxian/?tag=%E8%AF%BE%E7%A8%8B%E4%BD%9C%E4%B8%9A&p=1">
<file name="数据库习题.7z" url="http://www.vdisk.cn/down/index/8416008" size="40.16 KB"/>
<file name="概率论复习.7z" url="http://www.vdisk.cn/down/index/8189342" size="275.92 KB"/>
<file name="软件工程复习题.7z" url="http://www.vdisk.cn/down/index/8154440" size="141.97 KB"/>
<file name="CPU报告书.doc" url="http://www.vdisk.cn/down/index/6079883" size="0.70 MB"/>
<file name="quartusII设计简单CPU.rar" url="http://www.vdisk.cn/down/index/6063094" size="3.43 MB"/>
<file name="HHH_Work.rar" url="http://www.vdisk.cn/down/index/5654202" size="2.24 MB"/>
<file name="EClock.7z" url="http://www.vdisk.cn/down/index/5651336" size="346.20 KB"/>
<file name="复件 msdx.rar" url="http://www.vdisk.cn/down/index/5555606" size="3.08 MB"/>
<file name="HHH_Work.7z" url="http://www.vdisk.cn/down/index/5542889" size="469.93 KB"/>
<file name="HHH_Work.7z" url="http://www.vdisk.cn/down/index/5513882" size="351.92 KB"/>
<file name="add.7z" url="http://www.vdisk.cn/down/index/5513718" size="204.45 KB"/>
<file name="录像2仿真.7z" url="http://www.vdisk.cn/down/index/5512301" size="1.01 MB"/>
<file name="录像1.7z" url="http://www.vdisk.cn/down/index/5511173" size="1.12 MB"/>
<file name="adder.7z" url="http://www.vdisk.cn/down/index/5511170" size="134.93 KB"/>
<file name="half-add.7z" url="http://www.vdisk.cn/down/index/5441566" size="143.06 KB"/>
<file name="Chess.rar" url="http://www.vdisk.cn/down/index/5418558" size="0.86 MB"/>
<file name="TestLogin.java" url="http://www.vdisk.cn/down/index/5418281" size="1142 B"/>
</category>






  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值