Ruby多进程并行抓取网页

最新推荐文章于 2019-04-24 15:14:16 发布

changandaxue

最新推荐文章于 2019-04-24 15:14:16 发布

阅读量158

点赞数

分类专栏： ruby on rails 文章标签： Ruby 多进程并行抓取网页

ruby on rails 专栏收录该内容

33 篇文章 0 订阅

订阅专栏

想研究一下各大网站首页有多少个链接，于是用Ruby写了一个脚本：先把任务放入Beanstalk消息队列，再同时开启多个子进程（如20个）并行抓取网页；先完成抓取的进程会再次从Beanstalk读取任务继续执行，直到没有任务后退出。主进程等待所有子进程退出后，打印抓取结果。
Ruby的多线程实际是只能跑在单cpu上，并且同一时刻cpu只处理一个线程，所以采用多进程抓取，消息队列采用最简单的Beanstalk，需要安装Beanstalkd服务。
示例代码主要抓取电商网站测试。
代码可以直接运行，需要ruby 1.9版本，1.9以下要稍微修改才能运行。

[代码] [Ruby]代码

 
    #!/usr/bin/env ruby
    # encoding: UTF-8

    # Count the number of links on each site's home page.
    # require 'rubygems'            # 1.8.7
 
require 'ap'                # gem install awesome_print 
 
require 'json' 
 
require 'net/http' 
 
require 'nokogiri'          # gem install nokogiri 
 
require 'forkmanager'       # gem install parallel-forkmanager 
 
require 'beanstalk-client'  # gem install beanstalk-client 
 
    012 
 
# Crawls a list of websites with a pool of forked worker processes.
#
# The index of every website is queued in Beanstalk; each worker reserves an
# index, fetches the matching site and reports the outcome to the parent
# process as one JSON line written to a shared pipe.
class MultipleCrawler

    # Fetches a single page and collects timing and link statistics for it.
    class Crawler
        attr_accessor :user_agent, :redirect_limit, :timeout

        # user_agent     - String sent as the User-Agent header (skipped when empty).
        # redirect_limit - maximum number of redirects to follow.
        def initialize(user_agent, redirect_limit=1)
            @user_agent = user_agent
            @redirect_limit = redirect_limit
            @timeout = 20 # seconds, applied to both connecting and reading
        end

        # Downloads +website+ and counts its links.
        #
        # Returns [opened_time, processed_time, link_count, server_header] on
        # success, or the exception message (a String) when anything fails.
        def fetch(website)
            print "Pid:#{Process.pid}, fetch: #{website}\n"
            start_time = Time.now
            res = request_following_redirects(website)
            opened_time = (Time.now - start_time).round(4) # time spent downloading
            doc = Nokogiri::HTML(to_utf8(res.body))
            processed_time = (Time.now - start_time - opened_time).round(4) # time spent parsing
            [opened_time, processed_time, doc.css('a[@href]').size, res['server']]
        rescue => e
            e.message
        end

        private

        # Requests +website+, following at most @redirect_limit redirects, and
        # returns the last response received. The loop stops as soon as a
        # response is not a redirect, so the final page is requested only once.
        def request_following_redirects(website)
            url = website
            redirects_left = @redirect_limit
            loop do
                res = request(URI.parse(url))
                location = res['location']
                return res unless res.is_a?(Net::HTTPRedirection) && location && redirects_left > 0
                url = URI.join(url, location).to_s # Location may be relative
                redirects_left -= 1
            end
        end

        # Performs one GET request and returns the Net::HTTPResponse.
        def request(uri)
            # request_uri keeps the query string and is "/" for an empty path.
            req = Net::HTTP::Get.new(uri.request_uri)
            # []= replaces any default agent; add_field would append to it.
            req['User-Agent'] = @user_agent unless @user_agent.to_s.empty?
            http = Net::HTTP.new(uri.host, uri.port)
            http.use_ssl = true if uri.scheme == 'https'
            http.open_timeout = @timeout
            http.read_timeout = @timeout
            http.start { |conn| conn.request(req) }
        end

        # Converts a raw response body to UTF-8 using the charset declared in
        # its <meta> tag. Pages without a declaration are treated as GB18030,
        # and GB2312/GBK are decoded as GB18030.
        def to_utf8(body)
            raw = body.to_s.dup.force_encoding(Encoding::BINARY)
            declared = raw[/<meta.+?charset=["'\s]*([\w-]+)/i, 1]
            encoding = declared ? declared.upcase : 'GB18030'
            return raw.force_encoding('UTF-8') if 'UTF-8' == encoding
            encoding = 'GB18030' if 'GB2312' == encoding || 'GBK' == encoding
            # Replace broken bytes instead of failing the whole page on one of them.
            raw.force_encoding(encoding).encode('UTF-8', :invalid => :replace, :undef => :replace)
        end
    end

    attr_accessor :user_agent, :redirect_limit

    # websites       - Array of URL strings to crawl.
    # beanstalk_jobs - arguments passed on to Beanstalk::Pool.new.
    # pm_max         - number of worker processes to fork.
    # user_agent     - User-Agent header used by the workers.
    # redirect_limit - maximum number of redirects a worker follows.
    def initialize(websites, beanstalk_jobs, pm_max=1, user_agent='', redirect_limit=1)
        @websites = websites
        @beanstalk_jobs = beanstalk_jobs
        @pm_max = pm_max
        @user_agent = user_agent
        @redirect_limit = redirect_limit

        @ipc_reader, @ipc_writer = IO.pipe # carries results from the workers to the parent
    end

    # Empties the Beanstalk queue of leftover jobs, then queues one job per
    # website; the job body is the website's index in @websites.
    # Prints the error and exits with status 1 when Beanstalk cannot be used.
    def init_beanstalk_jobs
        beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
        begin
            # reserve raises Beanstalk::TimedOut once nothing is left to delete.
            while job = beanstalk.reserve(0.1)
                job.delete
            end
        rescue Beanstalk::TimedOut
            print "Beanstalk queues cleared!\n"
        end
        @websites.size.times { |i| beanstalk.put(i) }
        beanstalk.close
    rescue => e
        puts e
        exit(1)
    end

    # Forks @pm_max workers, waits for them and prints the collected results.
    def process_jobs
        start_time = Time.now
        pm = Parallel::ForkManager.new(@pm_max)
        @pm_max.times do |i|
            next if pm.start(i) # parent: fork the next worker without waiting for this one
            @ipc_reader.close   # workers only write results
            work_off_queue
            @ipc_writer.close
            pm.finish(0)
        end
        @ipc_writer.close # the parent only reads results
        begin
            # NOTE(review): results are drained only after every child has
            # exited; if the workers ever produce more output than the pipe
            # can buffer they would block here - confirm for large site lists.
            pm.wait_all_children
        rescue SystemExit, Interrupt
            print "Interrupt wait all children!\n"
        ensure
            results = read_results
            ap results, :indent => -4 , :index=>false
            print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
        end
    end

    # Reads the JSON lines written by the workers until end of file, closes
    # the reader and returns the merged Hash of website => result.
    def read_results
        results = {}
        @ipc_reader.each_line { |line| results.merge!(JSON.parse(line)) }
        @ipc_reader.close
        results
    end

    # Entry point: queue the jobs, then crawl them.
    def run
        init_beanstalk_jobs
        process_jobs
    end

    private

    # Worker loop: reserve jobs until the queue times out (or the process is
    # interrupted), crawl each website and report the result to the parent.
    def work_off_queue
        beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
        crawler = Crawler.new(@user_agent, @redirect_limit)
        loop do
            begin
                job = beanstalk.reserve(0.1) # short timeout: all jobs were queued up front
                index = job.body
                job.delete
                website = @websites[index.to_i]
                result = crawler.fetch(website)
                # One write call per record so lines from different workers
                # cannot be split apart inside the shared pipe.
                @ipc_writer.write(({website=>result}).to_json + "\n")
            rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
                break
            end
        end
    end
end
 
    133 
 
    # Sample crawl targets: home pages of e-commerce sites.
    websites = %w(
        http://www.51buy.com/ http://www.360buy.com/ http://www.tmall.com/ http://www.taobao.com/
        http://china.alibaba.com/ http://www.paipai.com/ http://shop.qq.com/ http://www.lightinthebox.com/
        http://www.amazon.cn/ http://www.newegg.com.cn/ http://www.vancl.com/ http://www.yihaodian.com/
        http://www.dangdang.com/ http://www.m18.com/ http://www.suning.com/ http://www.hstyle.com/
    )

    # Arguments for Beanstalk::Pool.new: the server address list and the tube name.
    beanstalk_jobs = [['localhost:11300'],'crawler-jobs']

    # Browser-like User-Agent sent with every request.
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'

    # Number of worker processes to fork.
    pm_max = 10

    MultipleCrawler.new(websites, beanstalk_jobs, pm_max, user_agent).run

changandaxue

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Ruby多进程并行抓取网页

想研究一下各大网站首页有多少个链接，于是用Ruby写了一个脚本，用Beanstalk消息队列，把任务放入Beanstalk中，同时开启多个子进程，如20个，并行运行，同时抓取网页，先抓取完成的进程再次读取Beanstalk任务，继续执行，直到没有任务后，进程退出，主进程等待所有子进程退出后，打印抓取的消息。 Ruby的多线程实际是只能跑在单cpu上，并且同一时刻cpu只处理一个线程，所以...
复制链接

扫一扫