1. 将从网上爬取到的数据保存到数据库中
import com.mysql.jdbc.StringUtils;
import org.Movie;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.security.PublicKey;
import java.util.ArrayList;
import java.util.List;
/**
 * WebMagic {@link PageProcessor} for the cast-and-crew ("performer") page of a
 * single film on 1905.com.
 *
 * <p>For every {@code div.secPage-actors} section on the page, the section heading
 * decides whether the names found in it are directors, screenwriters or actors.
 * The names are joined into {@code ';'}-terminated strings and published under the
 * result key {@code "Movie"} as a {@code Movie} object.
 */
public class MoviePerformerProcess implements PageProcessor {
    /** Text that immediately precedes each person's name in the section HTML. */
    private static final String ALT_MARKER = "alt=\"";

    /** Text that is expected to follow each person's name in the section HTML. */
    private static final String ALT_TERMINATOR = "\"></a> ";

    /** Id of the film being crawled; set as a side effect of {@link #getURL(String)}. */
    public String id;

    /** Crawl settings: 3 retries and a sleep time of 1000 between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Builds the performer-page URL for the given film and remembers the id so
     * that {@link #process(Page)} can attach it to the extracted result.
     *
     * @param id film id used in the URL path
     * @return the performer-page URL for that film
     */
    public String getURL(String id) {
        this.id = id;
        return "https://www.1905.com/mdb/film/" + id + "/performer/?fr=mdbypsy_dh_yzry";
    }

    /**
     * Extracts director, screenwriter and actor names from the page and stores
     * them, together with the remembered film id, in the page's result fields
     * under the key {@code "Movie"}.
     *
     * @param page the downloaded performer page
     */
    @Override
    public void process(Page page) {
        StringBuilder directors = new StringBuilder();
        StringBuilder writers = new StringBuilder();
        StringBuilder actors = new StringBuilder();

        for (Selectable section : page.getHtml().css("div.secPage-actors").nodes()) {
            // The heading may be null when a section has no <h3>; constant-first
            // equals() keeps that case harmless.
            String heading = section.xpath("//div/h3/text()").toString();
            if ("导演".equals(heading)) {            // director
                appendAltNames(directors, section.toString());
            } else if ("编剧".equals(heading)) {     // screenwriter
                appendAltNames(writers, section.toString());
            } else if ("演员".equals(heading)) {     // actor
                appendAltNames(actors, section.toString());
            }
        }

        Movie movie = new Movie();
        movie.setMovie_dir(directors.toString());
        movie.setMovie_actor(actors.toString());
        movie.setMovie_wirter(writers.toString());
        movie.setMovie_id(id);
        page.putField("Movie", movie);
    }

    /**
     * Appends every name found after an {@code alt="} marker in {@code html} to
     * {@code target}, each followed by {@code ';'}.
     *
     * <p>A name runs from the marker up to the first {@code "></a> } sequence, or
     * to the end of the fragment when that sequence is absent.
     */
    private static void appendAltNames(StringBuilder target, String html) {
        if (html == null) {
            return;
        }
        String[] fragments = html.split(ALT_MARKER);
        // fragments[0] is the text before the first marker and holds no name.
        for (int j = 1; j < fragments.length; j++) {
            String fragment = fragments[j];
            // indexOf instead of split(...)[0]: split returns an empty array when the
            // fragment consists only of the terminator, which would throw.
            int end = fragment.indexOf(ALT_TERMINATOR);
            String name = end >= 0 ? fragment.substring(0, end) : fragment;
            target.append(name).append(';');
        }
    }

    /**
     * @return the crawl settings used for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
import C.DataSourceFactory;
import org.Movie;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.List;
public class MoviePerformerPipeline implements Pipeline {
public void process(ResultItems resultItems, Task task) {
Movie movie=resultItems.get("Movie");
DataSource ds= DataSourceFactory.getDataSource();
Connection conn=null;
try {
System.out.println("开始保存");
conn= ds.getConnection();
PreparedStatement pst=conn.prepareStatement("update scott.movie set director=?,writer=?,actor=? where id=?;");
pst.setString(1,movie.getMovie_dir());
pst.setString(2,movie.getMovie_wirter());
pst.setString(3,movie.getMovie_actor());
pst.setString(4,movie.getMovie_id());
pst.executeUpdate();
} catch (SQLException e) {
System.out.println("保存失败");
e.printStackTrace();
}
if(conn!=null){
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
import C.DataSourceFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
/**
 * Entry point that crawls the performer page of every movie listed in the
 * {@code movie} table and stores the extracted names via
 * {@link MoviePerformerPipeline}.
 */
public class MoviePerformerSpider {
    /**
     * Runs one spider for the film with the given id.
     *
     * @param id film id used to build the performer-page URL
     */
    public static void start(String id) {
        MoviePerformerProcess moviePerformerProcess = new MoviePerformerProcess();
        Spider.create(moviePerformerProcess)
                .addUrl(moviePerformerProcess.getURL(id))
                .addPipeline(new MoviePerformerPipeline())
                .run();
    }

    /**
     * Loads all movie ids from the database, then crawls them one after another.
     *
     * @param args unused
     * @throws SQLException if the ids cannot be read from the database
     */
    public static void main(String[] args) throws SQLException {
        // Read every id up front so that no connection or result set stays open
        // while the spiders run.
        for (String id : loadMovieIds()) {
            start(id);
        }
    }

    /**
     * Reads the {@code id} column of every row in the {@code movie} table.
     *
     * @return the ids in the order returned by the query
     * @throws SQLException if connecting, querying or reading a row fails
     */
    private static List<String> loadMovieIds() throws SQLException {
        List<String> ids = new ArrayList<>();
        DataSource ds = DataSourceFactory.getDataSource();
        try (Connection conn = ds.getConnection();
             PreparedStatement pst = conn.prepareStatement("select id from movie");
             ResultSet resultSet = pst.executeQuery()) {
            while (resultSet.next()) {
                ids.add(resultSet.getString("id"));
            }
        }
        return ids;
    }
}
2. 最终结果