原谅我用英文标题。。。因为我觉得用英文来表示更贴切一些。。这个爬虫用于搜集UCI的机器学习知识库的一些背景资料和下载链接。主要是对jsoup包的运用,然后用jxl包把资料写入到excel里面。
每个数据集都有下列信息:
Name,AssociatedTasks,AttributeCharacteristics,NumberOfInstances,
NumberOfAttributes,DataFull,Year,HitTimes,DataSetInformation,
AttributeInformation, DownloadLink
如果有信息缺失,则用-1或者N/A代替
保存结果的Excel被放置在桌面上,名字为 “aaa.xls”
现在版本的缺陷是数据集名称的显示有些问题:空格会被自动替换为+号,左括号被替换为%28,右括号被替换为%29,暂时还没有修正。
下面是代码
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.* ;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import jxl.write.Label ;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
public class UCIData {
//类的成员
public int Num ;
public String Name ;
public String AssociatedTasks ;
public String AttributeCharacteristics ;
public int NumberofInstances ;
public int NumberofAttributes ;
public boolean DataFull ;
public int Year ;
public int HitTimes ;
public String DataSetInformation ;
public String AttributeInformation ;
public String DownloadLink ;
public static void main(String[] args) throws IOException, RowsExceededException, WriteException {
String filename="C:\\Users\\multiangle\\Desktop\\aaa.xls" ;
try {
ExcelTitlePrint(filename) ;
} catch (RowsExceededException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (WriteException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
String url="http://archive.ics.uci.edu/ml/datasets.html" ;
Document doc=getPage(url) ;
UCIData[] dataset ;
dataset=FindData(doc) ;
}
public UCIData(){ //Construction Method
Num=-1 ;
Name=null ;
AssociatedTasks=null ;
AttributeCharacteristics=null ;
NumberofInstances=-1 ;
NumberofAttributes=-1 ;
DataFull=false ;
Year=-1 ;
HitTimes=-1 ;
DataSetInformation=null ;
AttributeInformation=null ;
DownloadLink=null ;
}
public static UCIData[] FindData(Document doc) throws IOException, RowsExceededException, WriteException{
/**
* 用来搜集各个数据集的信息
*/
Element table=doc.getElementsByTag("table").get(1);
Element td=table.getElementsByTag("td").first().nextElementSibling();
Element table2=td.getElementsByTag("table").first().nextElementSibling();
int listnum=table2.getElementsByTag("tr").first().siblingElements().size() ;
UCIData[] dataset=new UCIData[listnum] ;
//listnum=10 ; //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&7此处运行时删掉
for(int i=0;i<listnum;i++){
Element setlist=table2.getElementsByTag("tr").first().siblingElements().get(i) ; //具体到条
Element td2=setlist.getElementsByTag("a").first() ;
String name=td2.attr("href").substring(9) ; //还可以提升,把+号和%去掉
System.out.println("正在搜集第"+i+"个数据集: "+name);
String link=td2.absUrl("href") ;
Document subpage=getPage(link) ;
dataset[i]=InfoCutPage(subpage); //补充datase[i]的各项值
dataset[i].Num=i+1 ;
dataset[i].Name=name ;
print (dataset[i]) ;
ExcelDataPrint(dataset[i],"C:\\Users\\multiangle\\Desktop\\aaa.xls") ;
}
return dataset ;
}
public static UCIData InfoCutPage(Document doc) throws IOException{
/**
* 用来搜集单个数据集的信息
*/
UCIData data=new UCIData() ;
Element table=doc.getElementsByTag("table").first().siblingElements().get(1) ;
Element td=table.getElementsByTag("td").first();
Element table2=td.getElementsByTag("table").first().nextElementSibling() ; //Simple Information
Element tr=table2.getElementsByTag("tr").first() ; //the 1st line of simple information
Element td2=tr.getElementsByTag("td").first().siblingElements().get(2) ;
if(!td2.text().equals("N/A")) data.NumberofInstances= Integer.parseInt(td2.text()) ;
tr=tr.nextElementSibling() ; //the 2ed line of simple information
td2=tr.getElementsByTag("td").first().nextElementSibling(); //AttributeCharacteristics
data.AttributeCharacteristics=td2.text();
td2=td2.nextElementSibling().nextElementSibling() ; //NumberofAttributes
if(!td2.text().equals("N/A")) data.NumberofAttributes=Integer.parseInt(td2.text()) ;
td2=td2.nextElementSibling().nextElementSibling() ; //Date
if (!td2.text().equals("N/A")) data.Year=Integer.parseInt(td2.text().substring(0,4)) ;
tr=tr.nextElementSibling() ; //the 3rd line of simple information
td2=tr.getElementsByTag("td").first().nextElementSibling(); //Associated Tasks
data.AssociatedTasks=td2.text() ;
td2=td2.nextElementSibling().nextElementSibling() ; //DataFull
if (td2.text().equals("No")) data.DataFull=true ;
else data.DataFull=false ;
td2=td2.nextElementSibling().nextElementSibling() ; //HitTimes
data.HitTimes=Integer.parseInt(td2.text()) ;
Element p=table2.siblingElements().get(6) ;
data.DataSetInformation=p.text() ; //DataSetInformation
p=p.nextElementSibling().nextElementSibling().nextElementSibling() ;
data.AttributeInformation=p.text() ; //AttributeInformation
String downlink_pre=cutPage1(doc) ;
data.DownloadLink=downlink_pre ;
return data ;
}
public static String cutPage1(Document doc) throws IOException{
Element font=doc.getElementsByTag("font").get(5).parent() ;
String link=font.absUrl("href") ;
return link ;
}
public static String cutPage2(Document doc) throws IOException{
Element tr=doc.getElementsByTag("tr").get(3) ;
Element link=tr.getElementsByTag("a").first() ;
String href=link.absUrl("href") ;
//System.out.println(href);
return href ;
}
public static Document getPage_inner(String url) throws IOException{
Document doc ;
try{
doc=Jsoup.connect(url).get() ;
return doc ;
}catch(IOException e){
return null;
}
}
public static Document getPage(String url) throws IOException{
/**
* 用来获取页面代码,与上面的getPage_inner一道,有多次重连功能
*/
try{
Document doc ;
int times=0 ;
//System.out.println("正在请求获取网页"+url);
doc=getPage_inner(url) ;
while(doc.equals(null)&×<8){
doc=getPage_inner(url) ;
System.out.println("第"+times+"次请求失败,正在进行第"+(++times)+"次请求");
}
if (doc.equals(null)){
System.out.println("ERROR:获取网页失败");
return null ;
}else{
return doc ;
}
}catch(IOException e){
return null ;
}
}
public static void print(UCIData data){
System.out.println("Num: "+data.Num) ;
System.out.println("Name: "+data.Name) ;
System.out.println("AssociatedTasks: "+data.AssociatedTasks) ;
System.out.println("AttributeCharacteristics: "+data.AttributeCharacteristics) ;
System.out.println("NumberofInstances: "+data.NumberofInstances) ;
System.out.println("NumberofAttributes: "+data.NumberofAttributes) ;
System.out.println("DataFull: "+data.DataFull) ;
System.out.println("Year: "+data.Year) ;
System.out.println("HitTimes: "+data.HitTimes) ;
System.out.println("DataSetInformation: "+data.DataSetInformation) ;
System.out.println("AttributeInformation: "+data.AttributeInformation) ;
System.out.println("DownloadLink: "+data.DownloadLink) ;
}
public static void ExcelDataPrint(UCIData data,String filename) throws RowsExceededException, WriteException {
try {
File file=new File(filename) ;
Workbook wb = Workbook.getWorkbook(file);
WritableWorkbook book = Workbook.createWorkbook(file,wb);
WritableSheet sheet = book.getSheet(0) ;
int rownum=data.Num+1;
int colnum=0 ;
jxl.write.Number num=new jxl.write.Number(colnum++,rownum,data.Num) ;
Label name=new Label(colnum++,rownum,data.Name);
Label associatetasks=new Label(colnum++,rownum,data.AssociatedTasks);
Label attritebutecharacters=new Label(colnum++,rownum,data.AttributeCharacteristics);
jxl.write.Number instance=new jxl.write.Number(colnum++,rownum,data.NumberofInstances) ;
jxl.write.Number attributes=new jxl.write.Number(colnum++,rownum,data.NumberofAttributes) ;
String datafull ;
if (data.DataFull==true) datafull="true" ; else datafull="false" ;
Label dataful=new Label(colnum++,rownum,datafull) ;
jxl.write.Number year=new jxl.write.Number(colnum++,rownum,data.Year) ;
jxl.write.Number hittimes=new jxl.write.Number(colnum++,rownum,data.HitTimes) ;
Label datasetinfo=new Label(colnum++,rownum,data.DataSetInformation);
Label attributeinfo=new Label(colnum++,rownum,data.AttributeInformation);
Label downlink=new Label(colnum++,rownum,data.DownloadLink);
sheet.addCell(num);
sheet.addCell(name);
sheet.addCell(associatetasks);
sheet.addCell(attritebutecharacters);
sheet.addCell(instance);
sheet.addCell(attributes);
sheet.addCell(dataful);
sheet.addCell(year);
sheet.addCell(hittimes);
sheet.addCell(datasetinfo);
sheet.addCell(attributeinfo);
sheet.addCell(downlink);
book.write();
book.close();
} catch (BiffException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static void ExcelTitlePrint(String filename) throws RowsExceededException, WriteException{
try {
File file=new File(filename) ;
WritableWorkbook book=Workbook.createWorkbook(file);
WritableSheet sheet=book.createSheet("FirstPage",0) ;
String[] colname={"Num" ,"Name","AssociatedTasks",
"AttributeCharacteristics","NumberOfInstances","NumberOfAttributes",
"DataFull","Year","HitTimes",
"DataSetInformation","AttributeInformation","DownloadLink"};
for(int i=0;i<colname.length;i++){
Label label=new Label(i,0,colname[i]) ;
sheet.addCell(label);
}
book.write();
book.close();
} catch (IOException e) {
// TODO Auto-generated catch block
System.out.println("ERROR:创建Excel文件失败");
} //打开文件
}
}