I'm currently working on a Perl page-crawler module and came across some code worth a detailed read-through. I'm annotating it here so I can reuse the good parts myself.
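In short: the script seeds a URL queue, crawls breadth-first, fetches only text/html pages through a proxy, saves each page zlib-compressed into numbered PageBucket files, and appends everything it sees to plain-text logs (urlhash.txt, urlprocessed.txt, urlredirect.txt, pagelist.txt, imglist.txt) so an interrupted run can be resumed.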
#!/usr/bin/perl -w

use strict;

use HTTP::Request;
use HTTP::Status;
use HTML::LinkExtor;
use URI::URL;
use LWP::UserAgent;
#use Digest::MD5 qw(md5_hex);

use Compress::Zlib;

####################################################################
# Parameter settings
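# $StartUrl        - crawl seed
# $bRestrict       - when true, follow only URLs matching @restrictSite
# @restrictSite    - regex fragments a URL must match (see ValidUrl below)
# $bContinueBefore - when true, resume from the logs of a previous run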
our $StartUrl = "http://xxx";
our $bRestrict = 1;
our @restrictSite = ('cxxx','context:');
our $bContinueBefore = 1;


####################################################################

print __FILE__,"\n";

our %img_seen = ();
our %url_seen = ();
our @url_queue = ();
our %url_processed = ();

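# Declared for robots.txt support, but never used: get_robotstxt() below is
# defined and never called.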
our %RobotDisallow = ();
our %RobotAllow = ();
our %site_seen = ();


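# Either resume from the logs of a previous run, or seed the queue.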
if($bContinueBefore){
    &LoadBefore();
}else{
    $url_seen{$StartUrl} = 1;
    push @url_queue, $StartUrl;
}

our $pageNum = 0;
our $BucketNum = 0;

&OpenOutFile();

open(URLHASH,">>urlhash.txt") or die;
open(URLPROCESSED,">>urlprocessed.txt") or die;
open(URLREDIRECT,">>urlredirect.txt") or die;
open(PAGELIST,">>pagelist.txt") or die;
open(IMGLIST,">>imglist.txt") or die;

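# Autoflush idiom: select each log handle, set $| on it, then restore the
# previously selected handle (which select() returns).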
$| = 1, select $_ for select URLHASH;
$| = 1, select $_ for select URLPROCESSED;
$| = 1, select $_ for select URLREDIRECT;
$| = 1, select $_ for select PAGELIST;
$| = 1, select $_ for select IMGLIST;

# Keep the globs in scalars so the handles can be used via variables.
our $urlhash_log = *URLHASH;
our $urlprocessed_log = *URLPROCESSED;
our $urlredirect_log = *URLREDIRECT;
our $pagelist_log = *PAGELIST;
our $imglist_log = *IMGLIST;


our $UA = LWP::UserAgent->new(keep_alive => 1,
                              timeout    => 60,
                             );
$UA->agent('Mozilla/5.0');
$UA->proxy(['ftp', 'http', 'wais', 'gopher'], 'http://jpproxy:80/');

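# HTML::LinkExtor invokes linkCallback() once per tag; the callback fills
# these two per-page scratch arrays.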
our $linkExtor = HTML::LinkExtor->new(\&linkCallback);
our @tmpLinks = ();
our @tmpImgs = ();

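# Main crawl loop: breadth-first over the URL queue.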
my $url;
while ( $url = &next_url() )
{
    print $urlprocessed_log $url,"\n";

    #sleep(1000);

    # undef means the URL was unreachable or not a text/html page.
    my $response = &get_url( $url );

    if(!defined $response){
        next;
    }

    my $base = $response->base;
    $base = $base->as_string;
    #$base =~ tr/A-Z/a-z/;

    # The base URI usually differs from $url when we were redirected, so
    # treat it as the page's real identity for dedup and logging.
    if ( $base ne $url )
    {
        if(!&ValidUrl($base)){
            next;
        }

        print $urlredirect_log $url,"\t",$base,"\n";

        $url_seen{$base} ++;
        print $urlhash_log $base,"\n";

        if(exists($url_processed{$base})){
            next;
        }
    }

    my $contents = $response->content;

    #my $digest = md5_hex($base);

    &SavePage(\$base,\$contents);
    print $pagelist_log $base,"\n";
    $url_processed{$base} ++;

    # Extract links, resolve them to absolute URLs against the page base,
    # then queue/log them.
    @tmpLinks = ();
    @tmpImgs = ();
    $linkExtor->parse($contents);

    foreach (@tmpLinks){
        $_ = URI::URL->new($_,$base)->abs->as_string;
        #$_ =~ tr/A-Z/a-z/;
    }

    foreach (@tmpImgs){
        $_ = URI::URL->new($_,$base)->abs->as_string;
        #$_ =~ tr/A-Z/a-z/;
    }

    #@tmpLinks = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpLinks;
    #@tmpImgs = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpImgs;

    &RecordLinks();
    &RecordImgs();

}


sub next_url
{
    # We return 'undef' to signify no URLs on the list
    if (@url_queue == 0 )
    {
        return undef;
    }

    return shift @url_queue;
}
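
# Fetch in two steps: a cheap HEAD first to skip anything that is not
# text/html, then a full GET only for real pages.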
sub get_url
{
    my $url = shift;

    my $request = HTTP::Request->new( 'HEAD', $url );
    return undef unless $request;

    my $response = $UA->request( $request );
    return undef unless defined $response;
    return undef unless $response->is_success;

    my $content_type = $response->content_type();
    return undef unless defined $content_type;

    return undef if 'text/html' ne $content_type;

    $request = HTTP::Request->new( 'GET', $url );
    return undef unless $request;

    $response = $UA->request( $request );
    return undef unless defined $response;
    return undef unless $response->is_success;

    return $response;
}

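# Note: "values %attr" pushes every attribute value, not just href/src, so
# things like target or alt values can land in the lists; the regex checks
# in RecordLinks/RecordImgs filter most of that back out.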
sub linkCallback
{
    my($tag, %attr) = @_;
    if($tag eq 'a' || $tag eq 'frame' || $tag eq 'area'){
        push(@tmpLinks,values %attr);
        return;
    }
    if($tag eq 'img'){
        push(@tmpImgs,values %attr);
        return;
    }
    return;
}
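
# A tighter callback (my own sketch, not from the original) would take only
# the URL-bearing attributes:
#
#   sub linkCallback {
#       my($tag, %attr) = @_;
#       push @tmpLinks, $attr{href} if defined $attr{href} && ($tag eq 'a' || $tag eq 'area');
#       push @tmpLinks, $attr{src}  if defined $attr{src}  && $tag eq 'frame';
#       push @tmpImgs,  $attr{src}  if defined $attr{src}  && $tag eq 'img';
#   }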

sub RecordLinks
{
    foreach (@tmpLinks){
        # Classify by extension: page-like links fall through to the
        # queueing code below, image links are logged here, anything
        # else is dropped.
        if(/\/.+\.(\w{1,4})$/){
            if($1 =~ /(html|htm|asp|php|jsp)/i){
                # a page: fall through and queue it below
            }elsif($1 =~ /(jpg|jpeg|bmp|png|gif)/i){
                if(/^http/i){

                    if(exists($img_seen{$_})){
                        next;
                    }

                    $img_seen{$_} = 1;
                    print $imglist_log $_,"\n";

                }
                next;

            }else{
                next;
            }
        }

        #if(/\.(gif|jpg|jpeg|png|xbm|au|wav|mpg|pdf|ps|mp3|mp2|rm|zip|rar|gz|zip)$/i){
        #    next;
        #}

        if(/^http/i){

            if(!&ValidUrl($_)){
                next;
            }

            # strip the fragment before dedup
            s/#.*//;

            if(exists($url_seen{$_})){
                next;
            }

            $url_seen{$_} = 1;
            push @url_queue,$_;
            print $urlhash_log $_,"\n";
        }
    }
}

sub RecordImgs
{
    foreach (@tmpImgs){
        if(/^http/i){
            if(!&ValidImage($_)){
                next;
            }

            if(exists($img_seen{$_})){
                next;
            }

            $img_seen{$_} = 1;
            print $imglist_log $_,"\n";

        }
    }
}

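# Resume support: rebuild the seen/processed sets from the append-only logs
# and re-queue any URL that was discovered but never finished.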
sub LoadBefore
{
    open(FILE, "urlprocessed.txt") or die;
    while(<FILE>){
        chomp;
        $url_processed{$_}++;
    }
    close(FILE);

    open(FILE, "pagelist.txt") or die;
    while(<FILE>){
        if(/(\S+)\s/){
            $url_processed{$1}++;
        }
    }
    close(FILE);

    open(FILE, "urlhash.txt") or die;
    while(<FILE>){
        chomp;
        $url_seen{$_}++;
        if(!exists($url_processed{$_})){
            push @url_queue,$_;
        }
    }
    close(FILE);

    open(FILE, "imglist.txt") or die;
    while(<FILE>){
        chomp;
        $img_seen{$_}++;
    }
    close(FILE);
}

sub ValidUrl
{
    my($url) = shift;
    if($bRestrict){
        # Accept only URLs matching one of the @restrictSite patterns.
        foreach (@restrictSite){
            if($url =~ /$_/){
                return 1;
            }
        }
        return 0;
    }else{
        return 1;
    }
}

sub ValidImage
{
    my($url) = shift;

    # Skip fragment-bearing URLs and the ubiquitous spacer images.
    if($url =~ /#/){
        return 0;
    }

    if($url =~ /spacer\.gif/){
        return 0;
    }

    return 1;
}

sub get_robotstxt
{
    my $url = shift;
    $url .= "/robots.txt";

    my $request = HTTP::Request->new( 'HEAD', $url );
    return undef unless $request;

    my $response = $UA->request( $request );
    return undef unless defined $response;
    return undef unless $response->is_success;

    my $content_type = $response->content_type();
    return undef unless defined $content_type;

    return undef if 'text/plain' ne $content_type;

    $request = HTTP::Request->new( 'GET', $url );
    return undef unless $request;

    $response = $UA->request( $request );
    return undef unless defined $response;
    return undef unless $response->is_success;

    return $response;
}

sub OpenOutFile
{
    # Reopening the PAGEBUCKET bareword handle implicitly closes the
    # previous bucket file.
    $BucketNum ++;
    my $fname = sprintf("PageBucket.%05d",$BucketNum);
    open(PAGEBUCKET,">>$fname") or die;
    binmode(PAGEBUCKET);
    $| = 1, select $_ for select PAGEBUCKET;
}

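# Record format, repeated once per page:
#   [4-byte length][URL][4-byte length][zlib-compressed HTML]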
sub SavePage
{
    my($urlR,$contR) = @_;
    my $data = compress($$contR);
    my $len = pack('I',length($$urlR));
    print PAGEBUCKET $len;
    print PAGEBUCKET $$urlR;
    $len = pack('I',length($data));
    print PAGEBUCKET $len;
    print PAGEBUCKET $data;

    $pageNum++;
    if($pageNum % 1000 == 0){
        print "$pageNum pages have been crawled!\n";
    }
    # Start a new bucket file every 100,000 pages.
    if($pageNum % 100000 == 0){
        &OpenOutFile;
    }
}
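
The bucket format is simple enough to read back. Here is a minimal reader sketch of my own (not part of the original) that walks one PageBucket file and prints each stored URL with its decompressed size:

#!/usr/bin/perl -w
use strict;
use Compress::Zlib;

# Walk one bucket file: [4-byte len][URL][4-byte len][zlib data] per record.
open(my $fh, '<', $ARGV[0]) or die "usage: $0 PageBucket.00001\n";
binmode($fh);
my $buf;
while (read($fh, $buf, 4) == 4) {
    my $len = unpack('I', $buf);
    read($fh, my $url, $len) == $len  or die "truncated record";
    read($fh, $buf, 4) == 4           or die "truncated record";
    $len = unpack('I', $buf);
    read($fh, my $data, $len) == $len or die "truncated record";
    my $html = uncompress($data);
    printf "%s\t%d bytes\n", $url, length($html);
}
close($fh);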