想研究一下各大网站首页有多少个链接,于是用Ruby写了一个脚本,用Beanstalk消息队列,把任务放入Beanstalk中,同时开启多个子进程,如20个,并行运行,同时抓取网页,先抓取完成的进程再次读取Beanstalk任务,继续执行,直到没有任务后,进程退出,主进程等待所有子进程退出后,打印抓取的消息。
Ruby的多线程实际是只能跑在单cpu上,并且同一时刻cpu只处理一个线程,所以采用多进程抓取,消息队列采用最简单的Beanstalk,需要安装Beanstalkd服务。
示例代码主要抓取电商网站测试。
代码可以直接运行,需要ruby 1.9版本,1.9以下要稍微修改才能运行。
[代码] [Ruby]代码
001 | #!/usr/bin/env ruby |
002 | #encoding: UTF-8 |
003 |
004 | # 抓取每一个站点的首页链接数量 |
005 | # require 'rubygems' # 1.8.7 |
006 | require 'ap' # gem install awesome_print |
007 | require 'json' |
008 | require 'net/http' |
009 | require 'nokogiri' # gem install nokogiri |
010 | require 'forkmanager' # gem install parallel-forkmanager |
011 | require 'beanstalk-client' # gem install beanstalk-client |
012 |
# Crawls the home page of each site in parallel (multi-process, since MRI
# threads share one CPU) and reports, per site: open time, parse time,
# number of <a href> links, and the Server header. Work items are
# distributed through a Beanstalk queue; results come back to the parent
# over an IO.pipe as one JSON line per site.
class MultipleCrawler

  # Fetches a single site's home page, following redirects up to a limit.
  class Crawler
    # user_agent:     User-Agent header sent with each request.
    # redirect_limit: maximum number of redirects to follow (default 1).
    def initialize(user_agent, redirect_limit = 1)
      @user_agent = user_agent
      @redirect_limit = redirect_limit
      @timeout = 20 # per-request read timeout, seconds
    end
    attr_accessor :user_agent, :redirect_limit, :timeout

    # Fetches +website+ and returns
    #   [open_time, parse_time, link_count, server_header]
    # on success, or the exception message (a String) on failure.
    def fetch(website)
      print "Pid:#{Process.pid}, fetch: #{website}\n"
      redirect, url = @redirect_limit, website
      start_time = Time.now
      begin
        begin
          # BUGFIX: reset on every iteration; previously it stayed true
          # after the first redirect, re-fetching the same URL until the
          # redirect budget was exhausted.
          redirecting = false
          uri = URI.parse(url)
          # BUGFIX: request_uri instead of path — path is "" for URLs like
          # "http://host" (common in Location headers) and Get.new("") raises;
          # request_uri also preserves any query string.
          req = Net::HTTP::Get.new(uri.request_uri)
          req.add_field('User-Agent', @user_agent)
          res = Net::HTTP.start(uri.host, uri.port) do |http|
            http.read_timeout = @timeout
            http.request(req)
          end
          if res.header['location'] # redirect: retry with the new location
            url = res.header['location']
            redirecting = true
          end
          redirect -= 1
        end while redirecting and redirect >= 0
        opened_time = (Time.now - start_time).round(4) # time to fetch the page
        # Sniff the charset from the <meta> tag; default to GB18030 (superset
        # of GBK/GB2312, the common case for these Chinese e-commerce sites).
        encoding = res.body.scan(/<meta.+?charset=["'\s]*([\w-]+)/i)[0]
        encoding = encoding ? encoding[0].upcase : 'GB18030'
        html = 'UTF-8' == encoding ? res.body : res.body.force_encoding('GB2312' == encoding || 'GBK' == encoding ? 'GB18030' : encoding).encode('UTF-8')
        doc = Nokogiri::HTML(html)
        # Link-parsing time; on 1.8.7 replace round(4) with ('%.4f' % f).to_f.
        processed_time = (Time.now - start_time - opened_time).round(4)
        [opened_time, processed_time, doc.css('a[@href]').size, res.header['server']]
      rescue => e
        e.message # deliberate: a failed site yields its error string as the result
      end
    end
  end

  # websites:       array of URLs to crawl.
  # beanstalk_jobs: [server_list, tube_name] splatted into Beanstalk::Pool.new.
  # pm_max:         maximum number of parallel worker processes.
  # user_agent:     User-Agent header for all requests.
  # redirect_limit: maximum redirects per fetch.
  def initialize(websites, beanstalk_jobs, pm_max = 1, user_agent = '', redirect_limit = 1)
    @websites = websites
    @beanstalk_jobs = beanstalk_jobs
    @pm_max = pm_max
    @user_agent = user_agent
    @redirect_limit = redirect_limit

    @ipc_reader, @ipc_writer = IO.pipe # children write JSON results, parent reads
  end

  attr_accessor :user_agent, :redirect_limit

  # Clears any stale jobs left in the tube, then enqueues one job per
  # website (the job body is the site's index). Exits the process if the
  # Beanstalk server is unreachable.
  def init_beanstalk_jobs
    beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
    begin
      # Drain leftovers from a previous run; reserve(0.1) raises TimedOut
      # once the tube is empty.
      while job = beanstalk.reserve(0.1)
        job.delete
      end
    rescue Beanstalk::TimedOut
      print "Beanstalk queues cleared!\n"
    end
    @websites.size.times { |i| beanstalk.put(i) } # enqueue every site index
    beanstalk.close
  rescue => e
    puts e
    exit
  end

  # Forks up to @pm_max workers; each worker pulls indices from Beanstalk,
  # crawls the matching site, and writes a {url => result} JSON line to the
  # pipe. The parent waits for all children, then prints the collected
  # results and a summary line.
  def process_jobs
    start_time = Time.now
    pm = Parallel::ForkManager.new(@pm_max)
    @pm_max.times do |i|
      pm.start(i) and next # parent continues the loop; child runs the body below
      beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
      @ipc_reader.close # child only writes results
      loop {
        begin
          job = beanstalk.reserve(0.1) # short timeout: all jobs were pre-queued
          index = job.body
          job.delete
          website = @websites[index.to_i]
          # BUGFIX: forward @redirect_limit — it was previously dropped, so
          # the configured limit never reached the Crawler.
          result = Crawler.new(@user_agent, @redirect_limit).fetch(website)
          @ipc_writer.puts(({website => result}).to_json)
        rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
          break # queue empty (or interrupted): this worker is done
        end
      }
      @ipc_writer.close
      pm.finish(0)
    end
    @ipc_writer.close # parent must close its writer or read_results never sees EOF
    begin
      pm.wait_all_children # wait for every worker to finish
    rescue SystemExit, Interrupt
      print "Interrupt wait all children!\n"
    ensure
      results = read_results
      ap results, :indent => -4, :index => false # pretty-print collected results
      print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
    end
  end

  # Reads JSON lines written by the workers and merges them into one Hash
  # of url => result. Blocks until every writer end of the pipe is closed.
  def read_results
    results = {}
    while result = @ipc_reader.gets
      results.merge! JSON.parse(result)
    end
    @ipc_reader.close
    results
  end

  # Entry point: queue the work, then fork workers and report.
  def run
    init_beanstalk_jobs
    process_jobs
  end
end
133 |
# --- Script configuration and launch ---------------------------------------
# Sites to survey (home pages of large e-commerce portals).
websites = %w(
  http://www.51buy.com/ http://www.360buy.com/ http://www.tmall.com/ http://www.taobao.com/
  http://china.alibaba.com/ http://www.paipai.com/ http://shop.qq.com/ http://www.lightinthebox.com/
  http://www.amazon.cn/ http://www.newegg.com.cn/ http://www.vancl.com/ http://www.yihaodian.com/
  http://www.dangdang.com/ http://www.m18.com/ http://www.suning.com/ http://www.hstyle.com/
)

# Beanstalkd endpoint and tube name, splatted into Beanstalk::Pool.new.
beanstalk_jobs = [['localhost:11300'], 'crawler-jobs']
# Pretend to be a desktop Firefox so sites serve their normal home page.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
# Number of worker processes to fork.
pm_max = 10

crawler = MultipleCrawler.new(websites, beanstalk_jobs, pm_max, user_agent)
crawler.run