package com.newer.spider;
/**
 * A film entry: a mutable bean holding the fields collected for one film.
 *
 * @author houtt
 */
public class Film {
    /** Rank. */
    private int id;
    /** Title. */
    private String title;
    /** Rating. */
    private double rating;
    /** Introduction. */
    private String info;
    /** Short review. */
    private String quote;
    /** Image path. */
    private String path;

    /** @return the rank */
    public int getId() {
        return this.id;
    }

    /** @param id the rank to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the rating */
    public double getRating() {
        return this.rating;
    }

    /** @param rating the rating to store */
    public void setRating(double rating) {
        this.rating = rating;
    }

    /** @return the introduction text */
    public String getInfo() {
        return this.info;
    }

    /** @param info the introduction text to store */
    public void setInfo(String info) {
        this.info = info;
    }

    /** @return the short review */
    public String getQuote() {
        return this.quote;
    }

    /** @param quote the short review to store */
    public void setQuote(String quote) {
        this.quote = quote;
    }

    /** @return the image path */
    public String getPath() {
        return this.path;
    }

    /** @param path the image path to store */
    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Renders all fields on one line in the form
     * {@code Film [id=..., title=..., rating=..., info=..., quote=..., path=...]}.
     * Unset string fields are rendered as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Film [");
        text.append("id=").append(id);
        text.append(", title=").append(title);
        text.append(", rating=").append(rating);
        text.append(", info=").append(info);
        text.append(", quote=").append(quote);
        text.append(", path=").append(path);
        return text.append("]").toString();
    }
}
-----------------------------------------------------------------------------------------------------------
package com.newer.spider;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
// Crawler task: fetches one page of film entries and parses it.
public class Spider implements Runnable {
    private final String url; // page URL to fetch
    private final List<Film> films; // list that receives the parsed films

    /**
     * @param url   address of the page to download
     * @param films list to which every successfully parsed film is appended;
     *              it is used as-is, so callers running several spiders at the
     *              same time must pass a list that tolerates concurrent adds
     */
    public Spider(String url, List<Film> films) {
        this.url = url;
        this.films = films;
    }

    /**
     * Downloads the page, converts every {@code .grid_view .item} element into
     * a {@link Film}, prints it and appends it to the shared list.
     *
     * <p>An item whose rank or rating cannot be parsed is reported on stderr
     * and skipped, so a single malformed entry no longer aborts the rest of
     * the page. An I/O failure while fetching is printed and ends the run.
     */
    @Override
    public void run() {
        try {
            // Fetch the page.
            Document doc = Jsoup.connect(url).timeout(10000).get();
            // Each item element describes one film.
            Elements elms = doc.select(".grid_view .item");
            for (Element e : elms) {
                Film film = parseFilm(e);
                if (film == null) {
                    continue;
                }
                System.out.println(film.toString());
                // Store the result.
                films.add(film);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a film from one item element.
     *
     * @return the populated film, or {@code null} when the rank or rating text
     *         is not a valid number
     */
    private static Film parseFilm(Element item) {
        String id = item.select("em").text();
        String path = item.select(".pic img").attr("src"); // value of the src attribute
        String title = item.select(".title").text();
        String quote = item.select(".quote .inq").text();
        String rating = item.select(".star .rating_num").text();
        // first() yields null when nothing matches; fall back to an empty
        // string, which is what text() gives for the other unmatched selectors.
        Element infoElm = item.select(".bd p").first();
        String info = (infoElm == null) ? "" : infoElm.text();

        Film film = new Film();
        try {
            film.setId(Integer.parseInt(id));
            film.setRating(Double.parseDouble(rating));
        } catch (NumberFormatException ex) {
            System.err.println("Skipping item with unparseable rank/rating: id='" + id
                    + "', rating='" + rating + "'");
            return null;
        }
        film.setTitle(title);
        film.setPath(path);
        film.setQuote(quote);
        film.setInfo(info);
        return film;
    }

    /** Manual check: crawls the first page on a separate thread. */
    public static void main(String[] args) {
        String url="https://movie.douban.com/top250?start=0&filter=";
        List<Film> films = new ArrayList<>();
        Thread th = new Thread(new Spider(url,films));
        th.start();
    }
}
-------------------------------------------------------------------------------------------------------------
package com.newer.spider;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
// Entry point of the crawler project.
public class App {
    // Films collected from every page; filled by the Spider tasks.
    static List<Film> films = new ArrayList<>();

    /**
     * Crawls ten pages (start offsets 0, 25, ..., 225), then writes everything
     * that was collected to {@code 250.txt}.
     *
     * @throws InterruptedException if interrupted while waiting for a page thread
     */
    public static void main(String[] args) throws InterruptedException {
        for (int i = 0; i < 10; i++) {
            int start = i * 25;
            String url = "https://movie.douban.com/top250?start=" + start + "&filter=";
            System.out.println(url);
            Thread th = new Thread(new Spider(url, films));
            th.start();
            // Wait for this page before starting the next one, so only one
            // thread at a time appends to the plain ArrayList.
            th.join();
        }
        writeData("250.txt");
    }

    /**
     * Appends one line per collected film to the given file.
     *
     * <p>The writer is managed by try-with-resources, so it is closed (and
     * thereby flushed) on every path, and a failure to open the file is
     * reported as the I/O error it is instead of triggering a
     * NullPointerException while closing a writer that was never created.
     *
     * @param filename path of the file to append to
     */
    public static void writeData(String filename) {
        System.out.println("films size ==="+films.size());
        // Second FileWriter argument 'true' selects append mode.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(filename, true))) {
            for (Film f : films) {
                out.write(f.toString());
                out.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}