想研究一下各大网站首页有多少个链接,于是用Ruby写了一个脚本,用Beanstalk消息队列,把任务放入Beanstalk中,同时开启多个子进程,如20个,并行运行,同时抓取网页,先抓取完成的进程再次读取Beanstalk任务,继续执行,直到没有任务后,进程退出,主进程等待所有子进程退出后,打印抓取的消息。
Ruby的多线程实际是只能跑在单cpu上,并且同一时刻cpu只处理一个线程,所以采用多进程抓取,消息队列采用最简单的Beanstalk,需要安装Beanstalkd服务。
示例代码主要抓取电商网站测试。
代码可以直接运行,需要ruby 1.9版本,1.9以下要稍微修改才能运行。
[代码] [Ruby]代码
001 | #!/usr/bin/env ruby |
002 | #encoding: UTF-8 |
003 |
004 | # 抓取每一个站点的首页链接数量 |
005 | # require 'rubygems' # 1.8.7 |
006 | require 'ap' # gem install awesome_print |
007 | require 'json' |
008 | require 'net/http' |
009 | require 'nokogiri' # gem install nokogiri |
010 | require 'forkmanager' # gem install parallel-forkmanager |
011 | require 'beanstalk-client' # gem install beanstalk-client |
012 |
# Crawls the home page of each site in parallel (multi-process, since MRI
# threads share one CPU) and reports, per site: open time, parse time,
# number of <a href> links, and the Server header. Work items are
# distributed through a Beanstalk queue; results come back to the parent
# over an IO.pipe as one JSON line per site.
class MultipleCrawler

  # Fetches a single site's home page, following redirects up to a limit.
  class Crawler
    # user_agent:     User-Agent header sent with each request.
    # redirect_limit: maximum number of redirects to follow (default 1).
    def initialize(user_agent, redirect_limit = 1)
      @user_agent = user_agent
      @redirect_limit = redirect_limit
      @timeout = 20 # per-request read timeout, seconds
    end
    attr_accessor :user_agent, :redirect_limit, :timeout

    # Fetches +website+ and returns
    #   [open_time, parse_time, link_count, server_header]
    # on success, or the exception message (a String) on failure.
    def fetch(website)
      print "Pid:#{Process.pid}, fetch: #{website}\n"
      redirect, url = @redirect_limit, website
      start_time = Time.now
      begin
        begin
          # BUGFIX: reset on every iteration; previously it stayed true
          # after the first redirect, re-fetching the same URL until the
          # redirect budget was exhausted.
          redirecting = false
          uri = URI.parse(url)
          # BUGFIX: request_uri instead of path — path is "" for URLs like
          # "http://host" (common in Location headers) and Get.new("") raises;
          # request_uri also preserves any query string.
          req = Net::HTTP::Get.new(uri.request_uri)
          req.add_field('User-Agent', @user_agent)
          res = Net::HTTP.start(uri.host, uri.port) do |http|
            http.read_timeout = @timeout
            http.request(req)
          end
          if res.header['location'] # redirect: retry with the new location
            url = res.header['location']
            redirecting = true
          end
          redirect -= 1
        end while redirecting and redirect >= 0
        opened_time = (Time.now - start_time).round(4) # time to fetch the page
        # Sniff the charset from the <meta> tag; default to GB18030 (superset
        # of GBK/GB2312, the common case for these Chinese e-commerce sites).
        encoding = res.body.scan(/<meta.+?charset=["'\s]*([\w-]+)/i)[0]
        encoding = encoding ? encoding[0].upcase : 'GB18030'
        html = 'UTF-8' == encoding ? res.body : res.body.force_encoding('GB2312' == encoding || 'GBK' == encoding ? 'GB18030' : encoding).encode('UTF-8')
        doc = Nokogiri::HTML(html)
        # Link-parsing time; on 1.8.7 replace round(4) with ('%.4f' % f).to_f.
        processed_time = (Time.now - start_time - opened_time).round(4)
        [opened_time, processed_time, doc.css('a[@href]').size, res.header['server']]
      rescue => e
        e.message # deliberate: a failed site yields its error string as the result
      end
    end
  end

  # websites:       array of URLs to crawl.
  # beanstalk_jobs: [server_list, tube_name] splatted into Beanstalk::Pool.new.
  # pm_max:         maximum number of parallel worker processes.
  # user_agent:     User-Agent header for all requests.
  # redirect_limit: maximum redirects per fetch.
  def initialize(websites, beanstalk_jobs, pm_max = 1, user_agent = '', redirect_limit = 1)
    @websites = websites
    @beanstalk_jobs = beanstalk_jobs
    @pm_max = pm_max
    @user_agent = user_agent
    @redirect_limit = redirect_limit

    @ipc_reader, @ipc_writer = IO.pipe # children write JSON results, parent reads
  end

  attr_accessor :user_agent, :redirect_limit

  # Clears any stale jobs left in the tube, then enqueues one job per
  # website (the job body is the site's index). Exits the process if the
  # Beanstalk server is unreachable.
  def init_beanstalk_jobs
    beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
    begin
      # Drain leftovers from a previous run; reserve(0.1) raises TimedOut
      # once the tube is empty.
      while job = beanstalk.reserve(0.1)
        job.delete
      end
    rescue Beanstalk::TimedOut
      print "Beanstalk queues cleared!\n"
    end
    @websites.size.times { |i| beanstalk.put(i) } # enqueue every site index
    beanstalk.close
  rescue => e
    puts e
    exit
  end

  # Forks up to @pm_max workers; each worker pulls indices from Beanstalk,
  # crawls the matching site, and writes a {url => result} JSON line to the
  # pipe. The parent waits for all children, then prints the collected
  # results and a summary line.
  def process_jobs
    start_time = Time.now
    pm = Parallel::ForkManager.new(@pm_max)
    @pm_max.times do |i|
      pm.start(i) and next # parent continues the loop; child runs the body below
      beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
      @ipc_reader.close # child only writes results
      loop {
        begin
          job = beanstalk.reserve(0.1) # short timeout: all jobs were pre-queued
          index = job.body
          job.delete
          website = @websites[index.to_i]
          # BUGFIX: forward @redirect_limit — it was previously dropped, so
          # the configured limit never reached the Crawler.
          result = Crawler.new(@user_agent, @redirect_limit).fetch(website)
          @ipc_writer.puts(({website => result}).to_json)
        rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
          break # queue empty (or interrupted): this worker is done
        end
      }
      @ipc_writer.close
      pm.finish(0)
    end
    @ipc_writer.close # parent must close its writer or read_results never sees EOF
    begin
      pm.wait_all_children # wait for every worker to finish
    rescue SystemExit, Interrupt
      print "Interrupt wait all children!\n"
    ensure
      results = read_results
      ap results, :indent => -4, :index => false # pretty-print collected results
      print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
    end
  end

  # Reads JSON lines written by the workers and merges them into one Hash
  # of url => result. Blocks until every writer end of the pipe is closed.
  def read_results
    results = {}
    while result = @ipc_reader.gets
      results.merge! JSON.parse(result)
    end
    @ipc_reader.close
    results
  end

  # Entry point: queue the work, then fork workers and report.
  def run
    init_beanstalk_jobs
    process_jobs
  end
end
133 |
# --- Script configuration and launch ---------------------------------------
# Sites to survey (home pages of large e-commerce portals).
websites = %w(
  http://www.51buy.com/ http://www.360buy.com/ http://www.tmall.com/ http://www.taobao.com/
  http://china.alibaba.com/ http://www.paipai.com/ http://shop.qq.com/ http://www.lightinthebox.com/
  http://www.amazon.cn/ http://www.newegg.com.cn/ http://www.vancl.com/ http://www.yihaodian.com/
  http://www.dangdang.com/ http://www.m18.com/ http://www.suning.com/ http://www.hstyle.com/
)

# Beanstalkd endpoint and tube name, splatted into Beanstalk::Pool.new.
beanstalk_jobs = [['localhost:11300'], 'crawler-jobs']
# Pretend to be a desktop Firefox so sites serve their normal home page.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
# Number of worker processes to fork.
pm_max = 10

crawler = MultipleCrawler.new(websites, beanstalk_jobs, pm_max, user_agent)
crawler.run