此程序提供简单的获取html 页面代码并筛选出以下标签和一些基本属性:
<script> : 属性 src, type
<a> : 属性 href
<img>: 属性 src
后续会添加一些更有用的功能,并逐步完善命令行接口。
使用方法:
perl filter_html.pl <URL>
#!/usr/bin/perl
# --------------------------
# author : ez
# date : 2015/8/23
# describe : this script send http request for a http url and filter some special
# tag you input
# --------------------------
use strict;
use warnings;
use LWP::UserAgent;
use Data::Dumper;
use HTML::TreeBuilder;
# use HTML::Parser;
our $VERSION = 1.0;
my %disp_func = (
a => sub {
my $em = shift;
return if ! defined ($em) and $em -> tag () ne 'a';
my $href = $em -> attr ('href');
print "a url = "
. ($href ? $href : 'none') . "\n";
},
script => sub {
my $em = shift;
return if ! defined ($em) and $em -> tag () ne 'script';
my $type = $em -> attr ('type');
my $src = $em -> attr ('src');
print "script type = "
. ($type ? $type : 'none') . ", src = "
. ($src ? $src : 'none') . "\n";
},
img => sub {
my $em = shift;
return if ! defined ($em) and $em -> tag () ne 'img';
my $src = $em -> attr ('src');
print "img src = "
. ($src ? $src : 'none') . "\n";
}
);
&_usage () if @ARGV < 1;
my $url = shift @ARGV;
my @tags = qw(a script form img);
@tags = @ARGV if @ARGV >= 1;
my $useragent = LWP::UserAgent -> new;
my $request = HTTP::Request -> new ('GET' => $url);
$request -> content_type ('application/x-www-form-urlencoded');
$request -> header ('Accept-Language' => 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3');
print "[-] sending request to $url ...\n";
my $html = $useragent -> request ($request);
print "[-] get response !\n";
my $tree = new HTML::TreeBuilder;
$tree -> parse ($html -> content ());
$tree -> eof ();
my $html_tag = $tree -> elementify ();
# my @decendants = $html_tag -> descendants ();
# maybe the parameter could be more exciting :-)
my @find_tags = $html_tag -> find_by_tag_name ('a', 'script', 'img');
foreach (@find_tags) {
next if !defined ($_) and $_ -> tag () eq '';
&{$disp_func {$_ -> tag ()}} ($_);
}
$tree -> delete ();
sub get_tags {
my ($tag, $node) = @_;
return if ! $tag;
}
sub _usage {
print "usage: filter_html.pl <webpage_url>";
exit;
}
# debug
# my $tag = $val -> tag (); # get 'html'
# TODO : parse start
# my $items = $tree -> findnodes ('/html/body//a');
# for my $item ($items -> get_nodelist ()) {
# my $str = $item -> content -> [0];
# print "$str\n";
# }
# print $html -> as_string ();
# print $html -> content ();
# my @line = $html -> content ();
# /<(\S*?)[^>]>.*?<\/\1>|<.*?\/>/
# foreach (@line) {
# print "$_\n";
# }
__END__
注:perl中可能没有需要的HTML::TreeBuilder模块和Data::Dumper模块,可以CPAN自己下载安装。
运行环境: Linux 3.18.0-kail3-amd64 #1 SMP Debian x86_64 GNU / Linux
Perl: v5.14.2 built for x86_64-linux-gnu-thread-multi