$KCODE = "e"
require "watir"
# Start page the browser is pointed at before searching.
$path = "http://www.baidu.com/"

# Search keywords, one per line of the input list. Every "<#>" marker and
# newline is replaced by a space, then surrounding whitespace is stripped.
# File.readlines closes the input file as soon as it has been read; the
# previous File.open handle was never closed.
$ary = File.readlines("D:/20130409-专家信息系统-资源采集/专家清单.txt").map do |line|
  line.gsub(/<#>|\n/, " ").strip
end

# Output file for collected results, opened in append mode.
# NOTE(review): `start` reopens this same path and reassigns $fw. This early
# handle is kept so that $fw is never nil when the restart handler closes it.
$fw = File.new("D:/专家清单-续断.txt", "a")
# Opens a new Internet Explorer window through Watir and navigates it to
# the URL stored in $path. The browser handle is published in $ie.
def ie_open
  browser = Watir::IE.new
  # Publish the handle before navigating so $ie is set even if goto raises.
  $ie = browser
  browser.goto($path)
end
# Splits the text of one result cell into its fields.
#
# The first line is the title, the last line carries the URL (its first
# whitespace-separated token) and any yyyy-m-d dates, and every line in
# between is concatenated into the abstract.
#
# Returns [title, abs, url, date]; title and url are nil when the text is
# empty, abs and date are then empty strings.
def parse_result_text(text)
  lines = text.to_s.split(/\r\n/)
  title = lines.first
  # lines[1...-1] is nil for an empty array, hence the fallback.
  abs = (lines[1...-1] || []).join
  last_line = lines.last.to_s
  url = last_line.split(/\s/).first
  # Scan the line itself and join the matches explicitly: Array#to_s only
  # concatenates on Ruby 1.8 and yields inspect output on later versions.
  date = last_line.scan(/\d{4}-\d{1,2}-\d{1,2}/).join
  [title, abs, url, date]
end

# Collects search results for every keyword in $ary, starting at index +id+.
#
# For each keyword the search form is submitted (text field named "wd",
# button with id "su"), at most the first two result tables are parsed and
# appended to $fw, and the keyword's index is then appended to $note_file
# as the resume checkpoint.
#
# id - index of the first keyword to process (converted with to_i).
def ie_write(id)
  (id.to_i...$ary.size).each do |j|
    keyword = $ary[j]
    $ie.text_field(:name, "wd").set keyword
    $ie.button(:id, "su").click
    # NOTE(review): presumably two tables on the page are not search
    # results, hence the "- 2" -- confirm against the live page.
    result_count = $ie.tables.to_a.size - 2
    $fw.puts keyword
    $fw.puts "----------------------------------"
    # Only the first two results are recorded.
    (1..[result_count, 2].min).each do |n|
      # Original author's note: same table as $ie.tables[n].
      table = $ie.table(:id, "#{n}")
      title, abs, url, date = parse_result_text(table[1][1].text)
      # Interpolation prints nil as an empty string. Concatenation raised
      # TypeError on an empty result, and because the checkpoint below was
      # never reached, every restart retried the same keyword.
      $fw.puts "[title]#{title}"
      $fw.puts "[abs]#{abs}"
      $fw.puts "[url]#{url}"
      $fw.puts "[date]#{date}"
      $fw.puts "===================================="
    end
    # Flush the results before the checkpoint so a recorded index always
    # has its output on disk.
    $fw.flush
    $note_file.puts j.to_s
    $note_file.flush
  end
end
# Initialises one collection run.
#
# Opens the browser, opens the result file and the checkpoint log in append
# mode, reads the last checkpoint from the log and resumes collecting at the
# keyword after it (or at index 0 when the log holds no checkpoint).
def start
  ie_open
  $fw = File.new("D:/专家清单-续断.txt", "a")
  $note_file = File.new("日志文件.txt", "a")
  # File.readlines closes its handle after reading; the previous
  # File.new(...).readlines leaked one handle per restart.
  last_entry = File.readlines("日志文件.txt").last.to_s
  # First run of digits on the last line is the last completed index.
  last_done = last_entry[/\d+/]
  resume_from = last_done ? last_done.to_i + 1 : 0
  ie_write(resume_from)
end
# Best-effort cleanup of the result file, the checkpoint log and the browser.
# Each resource is closed independently so that one failing close (nil or
# already-closed handle, dead browser) does not leave the others open.
def close_session_resources
  [$fw, $note_file, $ie].each do |resource|
    begin
      resource.close if resource
    rescue StandardError => e
      warn "cleanup skipped: #{e.class}: #{e.message}"
    end
  end
end

# Automatic restart: runs `start` in a worker thread and, when it fails with
# a StandardError, cleans up, waits 10 seconds and starts over. `start`
# resumes from the last checkpoint in the log file.
#
# Changes from the previous polling loop:
# * Its counter was reset to 1 on every poll, so its "abc > 10" exit could
#   never fire; the loop effectively waited for the worker, which join now
#   does directly.
# * When `start` finished normally, $a_run stayed true and the script polled
#   forever; it now cleans up and exits.
# * An exception that is not a StandardError used to kill the worker
#   silently; join re-raises it in the main thread.
loop do
  $a_run = true
  a = Thread.new do
    begin
      start
    rescue
      puts "=====重新启动====="
      puts Time.now
      $a_run = false
    ensure
      # Clean up in the worker thread, where the browser was created.
      close_session_resources
    end
  end
  a.join
  # All keywords processed without error: nothing left to restart.
  break if $a_run
  puts "已经关闭全部IE进程!等待重新启动···"
  sleep(10)
end
要求:
1.利用ruby+watir进行互联网海量资源采集;
2.模拟人工方式,动态输入关键字在百度中搜索相关文档内容;
3.解析网页内容并抽取相关信息,采集样例:
扩展:
程序实现断点续传问题(提示:添加日志文件,记录断点的位置,利用线程自动重启网页)