这个程序写得很简单,是刚接触 Ruby 第二天(2009 年 12 月)写的,主要完成的功能是到斯坦福大学的网站上去收集 email 地址。默认是 10 个线程,策略是广度优先,$debug=true 时开启调试信息。附件中包含代码和批处理文件。
运行命令为:
ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People
其中参数:2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->URL
运行结果输出为文档文件emails_md[max_depth]_mp[max_pages]_[URL].txt
require 'open-uri'
require 'thread'

# Run it like this:
#   ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People

# Matches a whole href attribute, e.g. href="/People/faculty".
$link_regexp = /href\=\"[^\"]*\"/
# Matches mailto links: mailto:xx@xxxx" (trailing quote stripped later).
$email_regexp_1 = /mailto\:[^\@]*\@[^\"]*[\"]/
# Matches addresses appearing as element text: >xx@xx< (brackets stripped later).
$email_regexp_2 = /[\>][^\<]*\@[^\>]*[\<]/
# Local part of an address (note: at least two chars; '?' and '_' are literal
# class members here, kept as in the original).
$before_at = /[a-zA-Z0-9]+[_?a-zA-Z0-9]+/
# Domain part. Fix: the original /[a-zA-Z]+[-?a-zA-Z]*\.+[a-zA-Z]+/ could only
# match a single dot-separated label pair, so multi-label domains such as
# cs.stanford.edu never matched in full and were dropped by format_email.
$after_at = /[a-zA-Z]+([\-\.]?[a-zA-Z]+)*\.[a-zA-Z]+/
# Full validator xx@xx.xx. Fix: the pasted original read / # {...}/ — the
# space between '#' and '{' disabled interpolation; it must be #{...}.
$email_regexp = /#{$before_at}\@#{$after_at}/
# ---- command-line arguments ----
# Usage: ruby Crawl.rb <max_depth> <max_pages> <url>
# Fix: dropped the `ARGV==nil` test — ARGV is always an Array in Ruby,
# so that branch could never fire; the length check is sufficient.
if ARGV.length < 3
  puts '--Command--'
  puts 'ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People'
  puts 'help: 2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->url'
  exit(0)
end
$url = ARGV[2]
$max_depth = ARGV[0].to_i
$max_pages = ARGV[1].to_i
# Output file names derived from the parameters; '/' and ':' in the url are
# replaced so the result is a legal file name.
$fname = 'emails_md' + String($max_depth) + '_mp' + String($max_pages) + '_' + $url.gsub(/[\/\:]/, '_') + '.txt'
$fname_links = 'links_md' + String($max_depth) + '_mp' + String($max_pages) + '_' + $url.gsub(/[\/\:]/, '_') + '.txt'
$thread_num = 10 # number of worker threads
$debug = true    # print crawl progress to stdout

# ---- shared crawl state (read and written by all worker threads) ----
$links_stack = []   # FIFO queue of [depth, url] pairs awaiting a visit
$links_crawled = [] # urls already fetched
$emails = []        # unique e-mail addresses found so far
# Fetches one page and extracts links / e-mail addresses from its HTML.
class Crawl
  # url   -- page to fetch (trailing '/' stripped so later joins stay clean)
  # depth -- depth recorded for links discovered on this page
  def initialize url, depth
    @url = url
    while @url[-1, 1] == '/'
      @url = @url.slice(0, @url.length - 1)
    end
    @depth = depth
    begin
      # NOTE(review): Kernel#open stopped handling http(s) URLs in Ruby 3.0;
      # on modern Rubies this should be URI.open. Any fetch error degrades
      # to an empty page rather than aborting the crawl.
      @html = open(@url).read
    rescue
      @html = ''
    end
  end

  # Push every new link found on the page onto $links_stack as [depth, url].
  def get_links
    @html.scan($link_regexp) do |match|
      u = Util.format_url(match, @url)
      # Fix: deduplicate on the *formatted* url `u`. The original compared the
      # raw attribute text (`match`, e.g. href="...") against collections that
      # only ever contain formatted urls, so duplicates were never detected.
      if u != nil && !$links_crawled.include?(u) && $links_stack.rassoc(u) == nil
        $links_stack.push [@depth, u]
      end
    end
  end

  # Collect addresses from both mailto: attributes and element text; each new
  # one is printed and appended to $fname as "email,source-url".
  def get_emails
    # Fix: the original repeated this loop body verbatim for each regexp.
    [$email_regexp_1, $email_regexp_2].each do |re|
      @html.scan(re) do |match|
        match = Util.format_email(match)
        if match != nil && !$emails.include?(match)
          $emails.push match
          msg = match + ',' + @url
          puts msg
          Util.write($fname, msg + "\r\n")
        end
      end
    end
  end
end
74.
# Stateless helper routines for the crawler.
class Util
  # Turn a raw href attribute (href="...") into an absolute url, or nil when
  # the target should be skipped (asset files, fragment links).
  def Util.format_url url, f_url
    # drop the 'www-' host prefix so www-cs.stanford.edu == cs.stanford.edu
    f_url = f_url.gsub(/www\-/, '')
    # strip the surrounding  href=" ... "  (6 leading chars, 1 trailing quote)
    url = url[6, url.length - 7]
    # skip css/js/pdf/jpg and fragments (e.g. .../faculty#Regular%20Faculty)
    if Util.exclude(url) == nil || url.include?('#')
      return nil
    end
    # relative path -> absolute, joined onto the page's url
    if url[0, 4] != 'http'
      while url.index('/') == 0
        url = url.slice(1, url.length - 1)
      end
      url = f_url + '/' + url
    end
    return url
  end

  # Normalize a regexp hit ('mailto:xx@yy"' or '>xx@yy<') into a bare
  # lower-case address; nil unless the whole cleaned string is a valid
  # address per $email_regexp.
  def Util.format_email email
    # Fix: String#delete removes *characters*, not substrings — the original
    # delete('mailto:') stripped every m/a/i/l/t/o/: from the address itself
    # (e.g. 'ada@stanford.edu' became 'd@snfrd.edu'), so nearly every hit was
    # mangled and then rejected. sub removes the one literal prefix; the
    # remaining delete targets are single characters, which is what delete is for.
    email = email.sub('mailto:', '').delete('><"').strip
    if String($email_regexp.match(email)) == email
      return email.downcase
    else
      return nil
    end
  end

  # Append msg to fname. Fix: block form of File.open guarantees the handle
  # is closed even if the write raises (the original used File.new + close).
  def Util.write fname, msg
    File.open(fname, 'a') { |file| file << msg }
  end

  # Return nil when str ends with an excluded asset extension, else str.
  def Util.exclude str
    ex = ['css', 'js', 'pdf', 'jpg']
    ex.each do |e|
      index = e.length + 1
      if str.length > index && str[-index, index] == '.' + e
        return nil
      end
    end
    return str
  end
end
# ---- main: breadth-first crawl, at most $max_pages pages, $max_depth levels ----
$count = 1
# Fix: one mutex shared by all workers. The original called
# Mutex.new.synchronize inside each thread, creating a fresh mutex per thread
# and therefore synchronizing nothing.
mutex = Mutex.new
0.upto($max_depth) do |depth|
  # Fix: the original spawned workers with ts.times { |i| ... }, shadowing this
  # depth variable with the thread index, so every depth comparison inside the
  # threads used the wrong value.
  if $debug
    puts '~~depth->' + String(depth)
  end
  if depth == 0
    # seed: crawl the start page itself
    c = Crawl.new($url, depth + 1)
    c.get_links
    c.get_emails
    $links_crawled.push $url
  end
  # breadth first: process links queued at depth+1; deeper ones wait
  while $links_stack.length != 0
    # When the page budget is spent (or we are at max depth), remaining pages
    # are drained for e-mails only, without following their links.
    email_only = ($links_crawled.length + $links_stack.length) > $max_pages || depth == $max_depth
    # FIFO order keeps the shallowest entries at the head; once only deeper
    # entries remain, leave them for the next depth iteration. (The original
    # `break` inside the thread silently dropped such links instead.)
    break if !email_only && $links_stack.first[0] != depth + 1
    if $debug
      puts '~~count->' + String($count) + ',stack->' + String($links_stack.length) + ',crawled->' + String($links_crawled.length) + ',total->' + String($links_crawled.length + $links_stack.length)
      $count = $count + 1
    end
    threads = []
    # up to $thread_num workers, never more than the queue holds
    ts = [$links_stack.length, $thread_num].min
    ts.times do
      threads << Thread.new do
        # hold the lock only for queue operations so fetches run concurrently
        link = mutex.synchronize { $links_stack.shift }
        next if link == nil
        if !email_only && link[0] == depth + 1
          # follow links and collect e-mails
          c = Crawl.new(link[1], depth + 2)
          c.get_links
          c.get_emails
          mutex.synchronize { $links_crawled.push link[1] }
        elsif email_only
          # only read e-mails from the page
          c = Crawl.new(link[1], depth + 2)
          c.get_emails
          mutex.synchronize { $links_crawled.push link[1] }
        else
          # a deeper entry surfaced mid-round: put it back for the next level
          mutex.synchronize { $links_stack.unshift link }
        end
      end
    end
    threads.each { |t| t.join }
  end
end
# NOTE(review): the remainder of the original paste repeated the entire script
# verbatim (requires, regexps, class Crawl, class Util, and the main loop) —
# an accidental copy-paste artifact that would re-open the classes and re-run
# the whole crawl a second time. The duplicate has been removed; the
# definitions above are the single authoritative copy.