首先在 Maven 的 pom.xml 中引入 jsoup 依赖:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Fetches the Douban movie "Top 250" listing page by page with jsoup and prints,
 * for every movie entry, its rank, title, director/cast/year block, rating and
 * (when present) its one-line quote to standard output.
 */
public class Crawling {

    /** Listing URL; the value appended to it is the zero-based offset of the first entry. */
    private static final String BASE_URL = "https://movie.douban.com/top250?start=";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:60.0) Gecko/20100101 Firefox/60.0";

    /** Request timeout in milliseconds. */
    private static final int TIMEOUT_MILLIS = 6000;

    /** Number of listing pages requested. */
    private static final int PAGE_COUNT = 10;

    /** Offset step between two consecutive pages. */
    private static final int PAGE_SIZE = 25;

    /**
     * Requests each listing page in turn and prints every element of class {@code item}.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        for (int pageIndex = 0; pageIndex < PAGE_COUNT; pageIndex++) {
            int start = pageIndex * PAGE_SIZE;
            Document document = Jsoup.connect(BASE_URL + start)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MILLIS)
                    .get();
            Elements items = document.getElementsByClass("item");
            for (Element item : items) {
                printMovie(item);
            }
        }
    }

    /** Extracts the fields of a single movie entry and prints them. */
    private static void printMovie(Element item) {
        String num = item.getElementsByTag("em").get(0).text();
        String title = item.getElementsByClass("hd").get(0).getElementsByTag("span").get(0).text();

        // html() (not text()) is used so the <br> separating the two info lines is kept.
        // html() serializes non-breaking spaces as the "&nbsp;" entity, so that literal
        // entity (and any raw U+00A0) is what has to be turned into a plain space.
        String bd = item.getElementsByClass("bd").get(0).getElementsByTag("p").get(0).html();
        bd = bd.replace("&nbsp;", " ").replace('\u00A0', ' ');
        // Sample values of bd:
        // 导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...<br> 1994 / 美国 / 犯罪 剧情
        // 导演: 吕克·贝松 Luc Besson 主演: 让·雷诺 Jean Reno / 娜塔莉·波特曼 ...<br> 1994 / 法国 美国 / 剧情 动作 犯罪

        String ratingNum = item.getElementsByClass("rating_num").get(0).text();

        // The quote is optional: fall back to an empty string when the entry has none.
        Elements quotes = item.getElementsByClass("quote");
        String quote = quotes.isEmpty() ? "" : quotes.get(0).text();

        System.out.println("电影名次: " + num);
        System.out.println("电影标题: " + title);
        System.out.println(bd);
        System.out.println("电影评分: " + ratingNum);
        System.out.println("电影简评: " + quote);
        System.out.println("============================================");
    }
}
效果: