jsoup 功能很强大,用它来解析网页非常轻松。我只用到了它的一小部分功能,但已经足够了。下面是我用它解析巨鲸音乐网(top100.cn)专辑列表的一个小示例:
package top100.bis;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import top100.bean.AlbumInfo;
/**
 * Scrapes album listing pages of http://www.top100.cn and turns every album
 * entry found there into an {@link AlbumInfo}.
 *
 * <p>Static utility class; it cannot be instantiated.
 *
 * @author hz
 */
public class Top100AlbumInfoDown {

    /** Site root, prepended to site-relative links found in the listing. */
    private static final String BASE_PATH = "http://www.top100.cn";

    /**
     * CSS class carried by every album entry. Entries that additionally carry
     * other classes (for example {@code top100_bgf2}) are matched as well,
     * because class lookup is done per class token.
     */
    private static final String ALBUM_BOX_CLASS = "Listen_allsingerbox";

    /** Elements stripped from a fetched page before it is scanned. */
    private static final String NOISE_SELECTOR = "script, form, meta, style, iframe, font";

    /** Prefix of the query parameter that carries the album area. */
    private static final String AREA_PARAM = "area=";

    /** Not instantiable: all functionality is exposed through static methods. */
    private Top100AlbumInfoDown() {}

    /**
     * Fetches one album listing page and extracts its album entries.
     *
     * @param urlstr listing address including the paging parameter, e.g.
     *     {@code http://www.top100.cn/album/index.php?area=mainland&pages=1}
     * @return the albums found on the page, in document order; empty when the
     *     page could not be fetched or contains no album entries (never
     *     {@code null})
     */
    private static List<AlbumInfo> parser(String urlstr) {
        List<AlbumInfo> infoList = new ArrayList<AlbumInfo>();
        System.out.println("当前请求的url是"+urlstr);

        Document doc;
        try {
            // An explicit timeout keeps one slow page from stalling the crawl.
            doc = Jsoup.connect(urlstr).timeout(10000).get();
        } catch (IOException e) {
            // Best effort: report the failing page and let the caller carry on.
            System.err.println("Failed to fetch " + urlstr + ": " + e);
            e.printStackTrace();
            return infoList;
        }

        wash(doc);

        // A single lookup is enough: getElementsByClass takes ONE class name,
        // so a second lookup with "Listen_allsingerbox top100_bgf2" either
        // matches nothing or re-adds entries already collected here.
        for (Element infoDiv : doc.getElementsByClass(ALBUM_BOX_CLASS)) {
            AlbumInfo albumInfo = getAlbumInfo(urlstr, infoDiv);
            if (albumInfo != null) {
                infoList.add(albumInfo);
            }
        }
        return infoList;
    }

    /**
     * Builds an {@link AlbumInfo} from a single album entry element.
     *
     * <p>The entry is expected to contain at least two links: the first wraps
     * the cover image, the second points to the album page and carries the
     * album name in its {@code title} attribute.
     *
     * @param url address of the listing page the entry was taken from; its
     *     {@code area} query parameter becomes the album area
     * @param infoDiv element of one album entry
     * @return the populated album info, or {@code null} when the entry does
     *     not contain the two expected links
     */
    private static AlbumInfo getAlbumInfo(String url, Element infoDiv) {
        Elements links = infoDiv.getElementsByTag("a");
        if (links.size() < 2) {
            // Not an album entry in the expected shape; skip it instead of
            // aborting the whole crawl with an exception.
            return null;
        }

        AlbumInfo albumInfo = new AlbumInfo();

        // The cover is optional: an entry without a .jpg image is still kept,
        // its image URL is simply left unset.
        Element image = links.get(0).select("img[src$=.jpg]").first();
        if (image != null) {
            albumInfo.setAlbumImageUrl(image.attr("src"));
        }

        Element songLink = links.get(1);
        albumInfo.setAlbumSongLinkUrl(toAbsoluteUrl(songLink.attr("href")));
        albumInfo.setAlbumName(songLink.attr("title").trim());

        // text() yields the visible text with entities decoded, which is what
        // stripping tags from the serialized HTML with a regex approximated.
        Element summary = infoDiv.getElementsByTag("p").first();
        albumInfo.setAlbumAbstruct(summary == null ? "" : summary.text());

        albumInfo.setAlbumArea(extractArea(url));
        return albumInfo;
    }

    /**
     * Prefixes the site root to a site-relative link.
     *
     * @param href value of a link's {@code href} attribute
     * @return {@code href} unchanged when it already starts with an http(s)
     *     scheme, otherwise the site root followed by {@code href}
     */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return BASE_PATH + href;
    }

    /**
     * Extracts the value of the {@code area} query parameter from a URL.
     *
     * @param url listing address, e.g. {@code ...index.php?area=mainland&pages=1}
     * @return the parameter value ({@code "mainland"} in the example), or an
     *     empty string when the URL has no query string or no such parameter
     */
    private static String extractArea(String url) {
        int queryStart = url.indexOf('?');
        if (queryStart < 0) {
            return "";
        }
        String[] params = url.substring(queryStart + 1).split("&");
        for (int i = 0; i < params.length; i++) {
            if (params[i].startsWith(AREA_PARAM)) {
                return params[i].substring(AREA_PARAM.length());
            }
        }
        return "";
    }

    /**
     * Collects the albums of all listing pages below the given address.
     *
     * <p>For every page number the paging parameter {@code &pages=<n>} is
     * appended to {@code url} and the resulting page is parsed. Pages that
     * cannot be fetched are skipped.
     *
     * @param url listing address without the paging parameter, e.g.
     *     {@code http://www.top100.cn/album/index.php?area=mainland}
     * @return the albums in the order they were encountered, without
     *     duplicates; empty when {@code url} is {@code null} or empty
     */
    public static List<AlbumInfo> getAlbumInfoByUrl(String url) {
        if (url == null || "".equals(url)) {
            return new ArrayList<AlbumInfo>();
        }

        // Insertion-ordered set: drops repeated albums while keeping the
        // listing order. NOTE(review): this only removes duplicates if
        // AlbumInfo overrides equals()/hashCode() - confirm in AlbumInfo.
        Set<AlbumInfo> unique = new LinkedHashSet<AlbumInfo>();

        // NOTE(review): the upper bound is exclusive, so pages
        // 1 .. SysConstant.PAGE - 1 are fetched - confirm that is intended.
        for (int i = 1; i < SysConstant.PAGE; i++) {
            String site = url + "&pages=" + i;
            unique.addAll(parser(site));
        }
        return new ArrayList<AlbumInfo>(unique);
    }

    /**
     * Removes script, form, meta, style, iframe and font elements from the
     * document so that only the remaining markup is scanned.
     *
     * @param doc document to clean in place
     */
    private static void wash(Document doc) {
        for (Element element : doc.select(NOISE_SELECTOR)) {
            element.remove();
        }
    }

    /**
     * Demo entry point: downloads the mainland album listing and prints the
     * name and link of every album.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://www.top100.cn/album/index.php?area=mainland";
        List<AlbumInfo> list = getAlbumInfoByUrl(url);
        for (AlbumInfo tempInfo : list) {
            System.out.println(tempInfo.getAlbumName()+"||"+tempInfo.getAlbumSongLinkUrl());
        }
    }
}