这个程序写得很简单,是刚接触 Ruby 第二天(2009 年 12 月)写的,主要完成的功能是到斯坦福大学的网站上去收集 email 地址。默认是 10 个线程,策略是广度优先,$debug=true 时开启调试信息。附件中包含代码和批处理文件。
运行命令为:
ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People
其中参数:2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->URL
运行结果输出为文档文件emails_md[max_depth]_mp[max_pages]_[URL].txt
require 'open-uri'
require 'thread'

# Run it like this:
#   ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People

# Matches a whole href attribute, e.g. href="/People/faculty".
$link_regexp = /href\=\"[^\"]*\"/
# Matches mailto links: mailto:xx@xxxx" (trailing quote stripped later).
$email_regexp_1 = /mailto\:[^\@]*\@[^\"]*[\"]/
# Matches addresses appearing as element text: >xx@xx< (brackets stripped later).
$email_regexp_2 = /[\>][^\<]*\@[^\>]*[\<]/
# Local part of an address (note: at least two chars; '?' and '_' are literal
# class members here, kept as in the original).
$before_at = /[a-zA-Z0-9]+[_?a-zA-Z0-9]+/
# Domain part. Fix: the original /[a-zA-Z]+[-?a-zA-Z]*\.+[a-zA-Z]+/ could only
# match a single dot-separated label pair, so multi-label domains such as
# cs.stanford.edu never matched in full and were dropped by format_email.
$after_at = /[a-zA-Z]+([\-\.]?[a-zA-Z]+)*\.[a-zA-Z]+/
# Full validator xx@xx.xx. Fix: the pasted original read / # {...}/ — the
# space between '#' and '{' disabled interpolation; it must be #{...}.
$email_regexp = /#{$before_at}\@#{$after_at}/
# ---- command-line arguments ----
# Usage: ruby Crawl.rb <max_depth> <max_pages> <url>
# Fix: dropped the `ARGV==nil` test — ARGV is always an Array in Ruby,
# so that branch could never fire; the length check is sufficient.
if ARGV.length < 3
  puts '--Command--'
  puts 'ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People'
  puts 'help: 2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->url'
  exit(0)
end
$url = ARGV[2]
$max_depth = ARGV[0].to_i
$max_pages = ARGV[1].to_i
# Output file names derived from the parameters; '/' and ':' in the url are
# replaced so the result is a legal file name.
$fname = 'emails_md' + String($max_depth) + '_mp' + String($max_pages) + '_' + $url.gsub(/[\/\:]/, '_') + '.txt'
$fname_links = 'links_md' + String($max_depth) + '_mp' + String($max_pages) + '_' + $url.gsub(/[\/\:]/, '_') + '.txt'
$thread_num = 10 # number of worker threads
$debug = true    # print crawl progress to stdout

# ---- shared crawl state (read and written by all worker threads) ----
$links_stack = []   # FIFO queue of [depth, url] pairs awaiting a visit
$links_crawled = [] # urls already fetched
$emails = []        # unique e-mail addresses found so far
# Fetches one page and extracts links / e-mail addresses from its HTML.
class Crawl
  # url   -- page to fetch (trailing '/' stripped so later joins stay clean)
  # depth -- depth recorded for links discovered on this page
  def initialize url, depth
    @url = url
    while @url[-1, 1] == '/'
      @url = @url.slice(0, @url.length - 1)
    end
    @depth = depth
    begin
      # NOTE(review): Kernel#open stopped handling http(s) URLs in Ruby 3.0;
      # on modern Rubies this should be URI.open. Any fetch error degrades
      # to an empty page rather than aborting the crawl.
      @html = open(@url).read
    rescue
      @html = ''
    end
  end

  # Push every new link found on the page onto $links_stack as [depth, url].
  def get_links
    @html.scan($link_regexp) do |match|
      u = Util.format_url(match, @url)
      # Fix: deduplicate on the *formatted* url `u`. The original compared the
      # raw attribute text (`match`, e.g. href="...") against collections that
      # only ever contain formatted urls, so duplicates were never detected.
      if u != nil && !$links_crawled.include?(u) && $links_stack.rassoc(u) == nil
        $links_stack.push [@depth, u]
      end
    end
  end

  # Collect addresses from both mailto: attributes and element text; each new
  # one is printed and appended to $fname as "email,source-url".
  def get_emails
    # Fix: the original repeated this loop body verbatim for each regexp.
    [$email_regexp_1, $email_regexp_2].each do |re|
      @html.scan(re) do |match|
        match = Util.format_email(match)
        if match != nil && !$emails.include?(match)
          $emails.push match
          msg = match + ',' + @url
          puts msg
          Util.write($fname, msg + "\r\n")
        end
      end
    end
  end
end
74.
# Stateless helper routines for the crawler.
class Util
  # Turn a raw href attribute (href="...") into an absolute url, or nil when
  # the target should be skipped (asset files, fragment links).
  def Util.format_url url, f_url
    # drop the 'www-' host prefix so www-cs.stanford.edu == cs.stanford.edu
    f_url = f_url.gsub(/www\-/, '')
    # strip the surrounding  href=" ... "  (6 leading chars, 1 trailing quote)
    url = url[6, url.length - 7]
    # skip css/js/pdf/jpg and fragments (e.g. .../faculty#Regular%20Faculty)
    if Util.exclude(url) == nil || url.include?('#')
      return nil
    end
    # relative path -> absolute, joined onto the page's url
    if url[0, 4] != 'http'
      while url.index('/') == 0
        url = url.slice(1, url.length - 1)
      end
      url = f_url + '/' + url
    end
    return url
  end

  # Normalize a regexp hit ('mailto:xx@yy"' or '>xx@yy<') into a bare
  # lower-case address; nil unless the whole cleaned string is a valid
  # address per $email_regexp.
  def Util.format_email email
    # Fix: String#delete removes *characters*, not substrings — the original
    # delete('mailto:') stripped every m/a/i/l/t/o/: from the address itself
    # (e.g. 'ada@stanford.edu' became 'd@snfrd.edu'), so nearly every hit was
    # mangled and then rejected. sub removes the one literal prefix; the
    # remaining delete targets are single characters, which is what delete is for.
    email = email.sub('mailto:', '').delete('><"').strip
    if String($email_regexp.match(email)) == email
      return email.downcase
    else
      return nil
    end
  end

  # Append msg to fname. Fix: block form of File.open guarantees the handle
  # is closed even if the write raises (the original used File.new + close).
  def Util.write fname, msg
    File.open(fname, 'a') { |file| file << msg }
  end

  # Return nil when str ends with an excluded asset extension, else str.
  def Util.exclude str
    ex = ['css', 'js', 'pdf', 'jpg']
    ex.each do |e|
      index = e.length + 1
      if str.length > index && str[-index, index] == '.' + e
        return nil
      end
    end
    return str
  end
end
# ---- main: breadth-first crawl, at most $max_pages pages, $max_depth levels ----
$count = 1
# Fix: one mutex shared by all workers. The original called
# Mutex.new.synchronize inside each thread, creating a fresh mutex per thread
# and therefore synchronizing nothing.
mutex = Mutex.new
0.upto($max_depth) do |depth|
  # Fix: the original spawned workers with ts.times { |i| ... }, shadowing this
  # depth variable with the thread index, so every depth comparison inside the
  # threads used the wrong value.
  if $debug
    puts '~~depth->' + String(depth)
  end
  if depth == 0
    # seed: crawl the start page itself
    c = Crawl.new($url, depth + 1)
    c.get_links
    c.get_emails
    $links_crawled.push $url
  end
  # breadth first: process links queued at depth+1; deeper ones wait
  while $links_stack.length != 0
    # When the page budget is spent (or we are at max depth), remaining pages
    # are drained for e-mails only, without following their links.
    email_only = ($links_crawled.length + $links_stack.length) > $max_pages || depth == $max_depth
    # FIFO order keeps the shallowest entries at the head; once only deeper
    # entries remain, leave them for the next depth iteration. (The original
    # `break` inside the thread silently dropped such links instead.)
    break if !email_only && $links_stack.first[0] != depth + 1
    if $debug
      puts '~~count->' + String($count) + ',stack->' + String($links_stack.length) + ',crawled->' + String($links_crawled.length) + ',total->' + String($links_crawled.length + $links_stack.length)
      $count = $count + 1
    end
    threads = []
    # up to $thread_num workers, never more than the queue holds
    ts = [$links_stack.length, $thread_num].min
    ts.times do
      threads << Thread.new do
        # hold the lock only for queue operations so fetches run concurrently
        link = mutex.synchronize { $links_stack.shift }
        next if link == nil
        if !email_only && link[0] == depth + 1
          # follow links and collect e-mails
          c = Crawl.new(link[1], depth + 2)
          c.get_links
          c.get_emails
          mutex.synchronize { $links_crawled.push link[1] }
        elsif email_only
          # only read e-mails from the page
          c = Crawl.new(link[1], depth + 2)
          c.get_emails
          mutex.synchronize { $links_crawled.push link[1] }
        else
          # a deeper entry surfaced mid-round: put it back for the next level
          mutex.synchronize { $links_stack.unshift link }
        end
      end
    end
    threads.each { |t| t.join }
  end
end
# NOTE(review): the remainder of the original paste repeated the entire script
# verbatim (requires, regexps, class Crawl, class Util, and the main loop) —
# an accidental copy-paste artifact that would re-open the classes and re-run
# the whole crawl a second time. The duplicate has been removed; the
# definitions above are the single authoritative copy.