#!/usr/bin/perl
use warnings;
use strict;
no warnings "recursion";
use XML::Writer;
use LWP::Simple;
use Encode;
local *FILE;
select FILE;
my($xml,@urls,$html,$filename);
foreach('A'..'Z'){
my @visited;
$filename="medicine/".$_."-chinese-medicine";
open(FILE,">$filename") or die "Can't create file :$!/n";
$xml=new XML::Writer(OUTPUT =>/*FILE,DATA_MODE=>1,DATA_INDENT =>2);
$xml->xmlDecl('UTF-8','yes');
$xml->startTag('prescriptions');
my($next,$nextPage,@items,$item,$category);
$next=1;
$nextPage="http://www.zhong-yao.net/zyfj/"."$_/";
$category=$_;
while($next){
$html=get($nextPage);
$html=encode("UTF-8",$html);
while($html=~m{"/zyfj/[A-Z]//d+//d+/.html}g){
$item=$&;
$item=~s//"//g;
push @items,$item if not $item~~@items;
}
foreach(@items){
#已访问
if($_~~@visited){
;
}
else{
push @visited,$_;
my($html,$name,$fun,$source,$items,@items);
$html=get("http://www.zhong-yao.net/"."$_");
$html=encode("UTF-8",$html);
#方名
$html=~m{<h3><span>.*?</span></p>}s;
$name=$&;
$name=~m{<p><span>.*?</span></p>}s;
$name=$&;
$name=~s/<//?p>|<//?span>//g;
#出处
$html=~m{<h3><span>处方来源.*?</span></p>}s;
$source=$&;
$source=~m{<p><span>.*</span></p>}s;
$source=$&;
$source=~s/<//?p>|<//?span>|。//g;
#功能
$html=~m{(主治|功效)</span>.*?</span></p>}s;
$fun=$&;
$fun=~m{<p><span>.*</span></p>}s;
$fun=$&;
$fun=~s/<//?p>|<//?span>|。//g;
#组成成分
$html=~m{组成</span>.*?</span></p>}s;
$items=$&;
$items=~m{<p><span>.*?</span></p>}s;
$items=$&;
$items=~s/<.*?>|。|(.*?)//g;
@items=split(/,|、/,$items);
$xml->startTag('prescription','name' =>$name,
'source' =>$source);
$xml->emptyTag('function','value' =>$fun);
foreach(@items){
my($name,$quantity);
$name=$_;
$name=~s//d+.*|半.*//g;
$quantity=$_;
#$quantity=~s/(/D|~半升|~半两)+//;
if($_=~m{/d.*}){
$quantity=$&;
}
elsif($_=m{钱半|半钱|半升|半两|半勺}){
$quantity=$&;
}
else{
$quantity="";
}
$xml->emptyTag('item',
'name' =>$name,
'quantyty' =>$quantity);
}
$xml->endTag('prescription');
}
}
$next=0;
if($html=~/href.*?下一页</){
$nextPage=$&;
$nextPage=~s/.*?/'//;
$nextPage=~s//'.*//;
$nextPage="http://www.zhong-yao.net/zyfj/"."$category/"."$nextPage";
$next=1;
}
}
$xml->endTag('prescriptions');
$xml->end;
close(FILE);
}
exit(0);
用perl写的一个网络爬虫
最新推荐文章于 2024-07-11 16:18:48 发布