Spider for UCI Machine Learning Repository

请原谅我使用英文标题,因为我觉得用英文表达更贴切一些。这个爬虫用于搜集UCI机器学习知识库(UCI Machine Learning Repository)中各数据集的背景资料和下载链接,主要运用jsoup包解析网页,再用jxl包把资料写入Excel。
每个数据集都有下列信息:
Name,AssociatedTasks,AttributeCharacteristics,NumberOfInstances,
NumberOfAttributes,DataFull,Year,HitTimes,DataSetInformation,
AttributeInformation, DownloadLink
如果有信息缺失,则用-1或者N/A代替
保存结果的Excel被放置在桌面上,名字为 “aaa.xls”

当前版本的缺陷是数据集名称的显示有些问题:名称取自链接地址,仍保留URL编码,空格显示为+号,左括号显示为%28,右括号显示为%29,暂时还没有修正。

下面是代码


import org.jsoup.Jsoup; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 

import java.io.* ;

import jxl.Workbook; 
import jxl.read.biff.BiffException;
import jxl.write.Label ;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;


/**
 * Spider that walks the UCI Machine Learning Repository index page, visits the page of
 * every listed data set, and appends one row per data set to an Excel workbook.
 *
 * <p>An instance is a plain record holding the values scraped for one data set. Numeric
 * fields are {@code -1} and text fields are {@code null} until they are filled in.
 */
public class UCIData {
    /** Workbook path used when no path is supplied by the caller. */
    private static final String DEFAULT_OUTPUT_FILE = "C:\\Users\\multiangle\\Desktop\\aaa.xls";

    /** Index page that lists the data sets. */
    private static final String INDEX_URL = "http://archive.ics.uci.edu/ml/datasets.html";

    /** Number of extra attempts {@link #getPage(String)} makes after the first failure. */
    private static final int MAX_RETRIES = 8;

    /** Stored in a numeric field when the page offers no usable number. */
    private static final int MISSING_NUMBER = -1;

    // Record members (public fields are kept for compatibility with existing callers).
    /** 1-based position of the data set in the index listing. */
    public int Num;
    /** Data set name taken from the link on the index page. */
    public String Name;
    public String AssociatedTasks;
    public String AttributeCharacteristics;
    public int NumberofInstances;
    public int NumberofAttributes;
    /** True when the corresponding summary cell reads "No". */
    public boolean DataFull;
    /** First four characters of the date cell, parsed as a number. */
    public int Year;
    public int HitTimes;

    public String DataSetInformation;
    public String AttributeInformation;

    public String DownloadLink;

    /**
     * Writes the header row, downloads the index page and scrapes every data set.
     *
     * @param args optional; when a first argument is present it is used as the workbook
     *             path instead of the built-in default
     */
    public static void main(String[] args) throws IOException, RowsExceededException, WriteException {
        String filename = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_FILE;
        try {
            ExcelTitlePrint(filename);
        } catch (WriteException e) {
            // RowsExceededException is a WriteException, so this covers both cases.
            e.printStackTrace();
        }

        Document doc = getPage(INDEX_URL);
        if (doc == null) {
            // getPage has already reported the failure; there is nothing to scrape.
            return;
        }
        FindData(doc, filename);
    }

    /** Creates an empty record: numbers are -1, flags false, text fields null. */
    public UCIData() {
        Num = MISSING_NUMBER;
        Name = null;
        AssociatedTasks = null;
        AttributeCharacteristics = null;
        NumberofInstances = MISSING_NUMBER;
        NumberofAttributes = MISSING_NUMBER;
        DataFull = false;
        Year = MISSING_NUMBER;
        HitTimes = MISSING_NUMBER;

        DataSetInformation = null;
        AttributeInformation = null;

        DownloadLink = null;
    }

    /**
     * Collects the information of every data set listed on the index page and appends
     * each record to the default workbook.
     *
     * @param doc parsed index page
     * @return one record per listed data set, in page order
     */
    public static UCIData[] FindData(Document doc) throws IOException, RowsExceededException, WriteException {
        return FindData(doc, DEFAULT_OUTPUT_FILE);
    }

    /**
     * Collects the information of every data set listed on the index page and appends
     * each record to the given workbook.
     *
     * @param doc      parsed index page
     * @param filename path of the workbook that receives one row per data set
     * @return one record per listed data set, in page order
     */
    public static UCIData[] FindData(Document doc, String filename)
            throws IOException, RowsExceededException, WriteException {
        Element table = doc.getElementsByTag("table").get(1);
        Element td = table.getElementsByTag("td").first().nextElementSibling();
        Element table2 = td.getElementsByTag("table").first().nextElementSibling();
        // Every row except the first one is treated as a data set entry.
        java.util.List<Element> rows = table2.getElementsByTag("tr").first().siblingElements();
        int listnum = rows.size();

        UCIData[] dataset = new UCIData[listnum];
        for (int i = 0; i < listnum; i++) {
            Element anchor = rows.get(i).getElementsByTag("a").first();
            String name = decodeName(anchor.attr("href"));

            System.out.println("正在搜集第" + i + "个数据集:  " + name);
            String link = anchor.absUrl("href");
            Document subpage = getPage(link);

            // A page that could not be fetched yields a placeholder record so that the
            // run continues and the array has no null entries.
            UCIData record = (subpage != null) ? InfoCutPage(subpage) : new UCIData();
            record.Num = i + 1;
            record.Name = name;
            dataset[i] = record;

            print(record);
            ExcelDataPrint(record, filename);
        }

        return dataset;
    }

    /**
     * Turns the href of an index link into a readable data set name.
     *
     * <p>The first nine characters are dropped (presumably the "datasets/" prefix) and
     * the remainder is URL-decoded so that '+' and "%28"/"%29" become a space and
     * parentheses. If decoding is not possible the undecoded text is returned.
     */
    private static String decodeName(String href) {
        String raw = (href.length() >= 9) ? href.substring(9) : href;
        try {
            return java.net.URLDecoder.decode(raw, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            return raw;
        } catch (IllegalArgumentException e) {
            // Malformed percent sequence: keep the text as found.
            return raw;
        }
    }

    /**
     * Parses a summary cell as an integer.
     *
     * @return the parsed value, or -1 when the text is null, empty, "N/A" or not a number
     */
    private static int parseIntOrMissing(String text) {
        if (text == null) {
            return MISSING_NUMBER;
        }
        String trimmed = text.trim();
        if (trimmed.isEmpty() || trimmed.equals("N/A")) {
            return MISSING_NUMBER;
        }
        try {
            return Integer.parseInt(trimmed);
        } catch (NumberFormatException e) {
            return MISSING_NUMBER;
        }
    }

    /**
     * Collects the information of a single data set from its own page.
     *
     * @param doc parsed data set page
     * @return a record with every field except {@code Num} and {@code Name} filled in
     */
    public static UCIData InfoCutPage(Document doc) throws IOException {
        UCIData data = new UCIData();
        Element table = doc.getElementsByTag("table").first().siblingElements().get(1);
        Element td = table.getElementsByTag("td").first();

        // Summary table: three rows, values sit in every second cell.
        Element table2 = td.getElementsByTag("table").first().nextElementSibling();

        // Row 1: number of instances.
        Element tr = table2.getElementsByTag("tr").first();
        Element cell = tr.getElementsByTag("td").first().siblingElements().get(2);
        data.NumberofInstances = parseIntOrMissing(cell.text());

        // Row 2: attribute characteristics, number of attributes, date.
        tr = tr.nextElementSibling();
        cell = tr.getElementsByTag("td").first().nextElementSibling();
        data.AttributeCharacteristics = cell.text();
        cell = cell.nextElementSibling().nextElementSibling();
        data.NumberofAttributes = parseIntOrMissing(cell.text());
        cell = cell.nextElementSibling().nextElementSibling();
        String dateText = cell.text().trim();
        // Only the leading four characters (the year) are kept.
        data.Year = (dateText.length() >= 4)
                ? parseIntOrMissing(dateText.substring(0, 4))
                : MISSING_NUMBER;

        // Row 3: associated tasks, DataFull flag, hit counter.
        tr = tr.nextElementSibling();
        cell = tr.getElementsByTag("td").first().nextElementSibling();
        data.AssociatedTasks = cell.text();
        cell = cell.nextElementSibling().nextElementSibling();
        // NOTE(review): presumably this is the "missing values?" cell, so "No" means the
        // data is complete - confirm against the page layout.
        data.DataFull = cell.text().equals("No");
        cell = cell.nextElementSibling().nextElementSibling();
        data.HitTimes = parseIntOrMissing(cell.text());

        // Free-text sections that follow the summary table.
        Element p = table2.siblingElements().get(6);
        data.DataSetInformation = p.text();
        p = p.nextElementSibling().nextElementSibling().nextElementSibling();
        data.AttributeInformation = p.text();

        data.DownloadLink = cutPage1(doc);
        return data;
    }

    /**
     * Reads the download link from a data set page.
     *
     * @return the absolute href of the parent of the sixth {@code font} element
     */
    public static String cutPage1(Document doc) throws IOException {
        Element holder = doc.getElementsByTag("font").get(5).parent();
        return holder.absUrl("href");
    }

    /**
     * Reads the absolute href of the first link inside the fourth {@code tr} element.
     */
    public static String cutPage2(Document doc) throws IOException {
        Element tr = doc.getElementsByTag("tr").get(3);
        Element link = tr.getElementsByTag("a").first();
        return link.absUrl("href");
    }

    /**
     * Performs a single download attempt.
     *
     * @return the parsed page, or {@code null} when the request fails with an I/O error
     */
    public static Document getPage_inner(String url) throws IOException {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException e) {
            return null;
        }
    }

    /**
     * Downloads a page, retrying a limited number of times when an attempt fails.
     *
     * @return the parsed page, or {@code null} when every attempt failed
     */
    public static Document getPage(String url) throws IOException {
        try {
            Document doc = getPage_inner(url);
            int times = 0;
            // Compare with ==: calling equals on a null reference would throw.
            while (doc == null && times < MAX_RETRIES) {
                System.out.println("第" + times + "次请求失败,正在进行第" + (times + 1) + "次请求");
                times++;
                doc = getPage_inner(url);
            }

            if (doc == null) {
                System.out.println("ERROR:获取网页失败");
            }
            return doc;
        } catch (IOException e) {
            return null;
        }
    }

    /** Prints every field of the record to standard output, one per line. */
    public static void print(UCIData data) {
        System.out.println("Num:  " + data.Num);
        System.out.println("Name:  " + data.Name);
        System.out.println("AssociatedTasks:  " + data.AssociatedTasks);
        System.out.println("AttributeCharacteristics:  " + data.AttributeCharacteristics);
        System.out.println("NumberofInstances:  " + data.NumberofInstances);
        System.out.println("NumberofAttributes:  " + data.NumberofAttributes);
        System.out.println("DataFull:  " + data.DataFull);
        System.out.println("Year:  " + data.Year);
        System.out.println("HitTimes:  " + data.HitTimes);
        System.out.println("DataSetInformation:  " + data.DataSetInformation);
        System.out.println("AttributeInformation:  " + data.AttributeInformation);
        System.out.println("DownloadLink:  " + data.DownloadLink);
    }

    /**
     * Appends one record to the first sheet of an existing workbook.
     *
     * <p>The workbook is re-opened, copied, extended and rewritten on every call. Read
     * and I/O failures are reported on standard error and do not propagate.
     *
     * @param data     record to write; its {@code Num} decides the target row
     * @param filename path of a workbook previously created by {@link #ExcelTitlePrint}
     */
    public static void ExcelDataPrint(UCIData data, String filename) throws RowsExceededException, WriteException {
        Workbook source = null;
        WritableWorkbook book = null;
        try {
            File file = new File(filename);
            source = Workbook.getWorkbook(file);
            book = Workbook.createWorkbook(file, source);
            WritableSheet sheet = book.getSheet(0);

            // Row 0 holds the titles; with Num starting at 1 the first record lands on row 2.
            int row = data.Num + 1;
            int col = 0;
            sheet.addCell(new jxl.write.Number(col++, row, data.Num));
            sheet.addCell(new Label(col++, row, data.Name));
            sheet.addCell(new Label(col++, row, data.AssociatedTasks));
            sheet.addCell(new Label(col++, row, data.AttributeCharacteristics));
            sheet.addCell(new jxl.write.Number(col++, row, data.NumberofInstances));
            sheet.addCell(new jxl.write.Number(col++, row, data.NumberofAttributes));
            sheet.addCell(new Label(col++, row, String.valueOf(data.DataFull)));
            sheet.addCell(new jxl.write.Number(col++, row, data.Year));
            sheet.addCell(new jxl.write.Number(col++, row, data.HitTimes));
            sheet.addCell(new Label(col++, row, data.DataSetInformation));
            sheet.addCell(new Label(col++, row, data.AttributeInformation));
            sheet.addCell(new Label(col++, row, data.DownloadLink));

            book.write();
        } catch (BiffException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release both workbooks even when a cell could not be added.
            if (book != null) {
                try {
                    book.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (source != null) {
                source.close();
            }
        }
    }

    /**
     * Creates the workbook with a single sheet named "FirstPage" and writes the column
     * titles into row 0. An I/O failure is reported on standard output.
     *
     * @param filename path of the workbook to create
     */
    public static void ExcelTitlePrint(String filename) throws RowsExceededException, WriteException {
        WritableWorkbook book = null;
        try {
            book = Workbook.createWorkbook(new File(filename));
            WritableSheet sheet = book.createSheet("FirstPage", 0);
            String[] colname = {"Num", "Name", "AssociatedTasks",
                            "AttributeCharacteristics", "NumberOfInstances", "NumberOfAttributes",
                            "DataFull", "Year", "HitTimes",
                            "DataSetInformation", "AttributeInformation", "DownloadLink"};
            for (int i = 0; i < colname.length; i++) {
                sheet.addCell(new Label(i, 0, colname[i]));
            }
            book.write();
        } catch (IOException e) {
            System.out.println("ERROR:创建Excel文件失败");
        } finally {
            if (book != null) {
                try {
                    book.close();
                } catch (IOException e) {
                    System.out.println("ERROR:创建Excel文件失败");
                }
            }
        }
    }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值