# Fetch one page of campus news from bit.edu and store the results in Redis.
require 'redis'
require 'httpclient'
require 'hpricot'
require 'json'
class String
  # Returns a copy of the string reduced to bare text:
  #   1. carriage returns, tabs and line feeds are removed,
  #   2. everything matched by the tag pattern (<...>) is removed,
  #   3. every remaining space character is removed.
  # The receiver itself is not modified.
  def gsub_html
    single_line = gsub(/\r|\t|\n/, "")
    untagged = single_line.gsub(/<(\S*?)[^>]*>.*?|<.*? \/>/, "")
    untagged.delete(" ")
  end
end
# HTTP client used by get_list and get_one to download pages. Being a
# top-level instance variable, it is stored on the main object, which is also
# `self` inside the top-level methods when they are invoked from this script.
@client = HTTPClient.new
# Redis connection shared by every method in this script. Redis.new is called
# without arguments, so the client library's default connection settings apply.
$redis = Redis.new
# Prints every article record stored in Redis, one line per record.
#
# The ids are read from the 'bit_edu' list, their JSON payloads are fetched
# with a single MGET, and id / title / contribute / photography / edit are
# written to standard output.
#
# @return [Array<Hash>] the parsed records (empty when nothing is stored)
def run
  # Scraping is currently disabled; uncomment to (re)populate Redis first.
  # get_list("http://www.bit.edu.cn/xww/rwfc/index.htm")
  keys = $redis.lrange('bit_edu', 0, -1)
  # MGET needs at least one key, so an empty list is short-circuited here.
  return [] if keys.empty?

  # MGET yields nil for ids whose value no longer exists; drop those instead
  # of letting JSON.parse(nil) raise.
  records = $redis.mget(*keys).compact.map { |value| JSON.parse(value) }
  records.each do |record|
    puts "id = #{record['id']} | #{record['title']} | #{record['contribute']} | #{record['photography']} | #{record['edit']}"
  end
end
# Downloads the listing page at +url+, collects every <a> found inside the
# elements matching `.new_con`, and passes each linked article to get_one.
# The link's position in the listing is used as the article id, and each href
# is appended to the fixed http://www.bit.edu.cn/xww/rwfc/ prefix.
def get_list(url)
  listing = Hpricot(@client.get(url).body)
  anchors = listing.search('.new_con').search('a')
  anchors.each_with_index do |anchor, position|
    get_one("http://www.bit.edu.cn/xww/rwfc/#{anchor['href']}", position)
  end
end
# Downloads one article page, extracts its fields and stores them in Redis.
#
# The record is serialised as JSON under the key +id+, and +id+ is then
# appended to the 'bit_edu' list that run reads back.
#
# @param url [String] address of the article page to download
# @param id [Object] value used both as the Redis key and as the record's id
# @return the result of the final RPUSH call
def get_one(url, id)
  # Every field lives under `.article`, so search for it only once.
  article = Hpricot(@client.get(url).body).search('.article')
  # `p` prints the title while also assigning it.
  p title = article.search('h2').to_s.gsub_html
  # Credits are taken from the 2nd, 4th and 6th <font> inside `.fbt`; a missing
  # element becomes "" because nil.to_s is "".
  fonts = article.search('.fbt').search('font')
  contribute = fonts[1].to_s.gsub_html
  photography = fonts[3].to_s.gsub_html
  edit = fonts[5].to_s.gsub_html
  content = article.search('.article_con').to_s.gsub_html
  tar = "http://www.bit.edu.cn/"
  # Skip <img> tags without a src attribute instead of crashing on nil.gsub.
  sources = article.search('img').map { |image| image['src'] }.compact
  img = sources.map { |src| tar + src.gsub("../", "") }
  res = {id: id, title: title, contribute: contribute, photography: photography, edit: edit, content: content, img: img}
  $redis.set(id, res.to_json)
  # NOTE(review): the id is appended on every call, so scraping the same
  # listing twice leaves duplicate ids in 'bit_edu' — confirm this is intended.
  $redis.rpush('bit_edu', id)
end
# Script entry point: run is executed as soon as this file is loaded.
run