六个部分:四个类、一个依赖,还有一个数据库
# Connection settings; the keys match what JdbcUtil reads from /driver.properties on the classpath
# (presumably this snippet is the content of that file).
# JDBC driver class name, handed to Class.forName by JdbcUtil.
driver=com.mysql.cj.jdbc.Driver
# JDBC URL: local MySQL on port 3306, schema "webmagic", UTF-8 encoding, SSL off, server time zone UTC.
url=jdbc:mysql://localhost:3306/webmagic?useUnicode=true&characterEncoding=utf-8&useSSL=false&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=UTC
# Database account. java.util.Properties ignores the whitespace around '=', so the values are
# "root" and "123456".
# NOTE(review): plain-text credentials - confirm this is a throwaway local account.
username= root
password = 123456
-- Table targeted by the INSERT statement that mysqlPipeline builds (columns title, source, type).
create table t_news(
-- Surrogate primary key, generated by MySQL.
id int not null primary key auto_increment,
-- Filled from the crawler's "title" field (commented as the company in mysqlPipeline).
-- NOTE(review): length 225 may be a typo for 255 - confirm.
title varchar(225),
-- Filled from the crawler's "source" field (commented as the salary in mysqlPipeline).
source varchar(50),
-- Filled from the crawler's "type" field (commented as the position in mysqlPipeline).
type varchar(225)
-- NOTE(review): MySQL's "utf8" is the 3-byte utf8mb3 charset; 4-byte characters such as emoji
-- cannot be stored - consider utf8mb4 if the crawled text may contain them.
)charset utf8 collate utf8_general_ci;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.sql.Connection;
import java.sql.Date;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Small JDBC helper that runs update statements with the connection settings found in
 * {@code /driver.properties} on the classpath (keys {@code driver}, {@code url},
 * {@code username} and {@code password}).
 *
 * <p>Database failures are logged and swallowed, so callers never see a {@link SQLException}.
 * A missing properties file aborts class initialisation with a descriptive
 * {@link IllegalStateException} instead of an anonymous {@link NullPointerException}.
 */
public class JdbcUtil {
    private static final Logger LOG = Logger.getLogger(JdbcUtil.class.getName());

    /** Classpath location of the connection settings. */
    private static final String CONFIG_RESOURCE = "/driver.properties";

    private static final String DRIVER;
    private static final String URL;
    private static final String USERNAME;
    private static final String PASSWORD;

    static { // read the settings once and register the driver
        Properties prop = loadConfig();
        DRIVER = prop.getProperty("driver");
        URL = prop.getProperty("url");
        USERNAME = prop.getProperty("username");
        PASSWORD = prop.getProperty("password");
        loadDriver(DRIVER);
    }

    /**
     * Executes a complete SQL update statement (INSERT, UPDATE, DELETE or DDL).
     *
     * <p>Connection or execution errors are logged; nothing is thrown to the caller.
     *
     * @param sql the statement text to execute
     */
    public static void executeSQL(String sql) {
        executeSQL(sql, new Object[0]);
    }

    /**
     * Executes an SQL update statement with bound parameters, which avoids building SQL by
     * string concatenation.
     *
     * <p>Connection or execution errors are logged; nothing is thrown to the caller.
     *
     * @param sql the statement text, using {@code ?} for each parameter
     * @param params values bound to the placeholders in order; may be empty or {@code null}
     */
    public static void executeSQL(String sql, Object... params) {
        // try-with-resources closes the statement and then the connection, even on failure.
        // DriverManager.getConnection throws rather than returning null, so a connection
        // failure is reported here once instead of causing a NullPointerException later.
        try (Connection conn = DriverManager.getConnection(URL, USERNAME, PASSWORD);
                PreparedStatement ps = conn.prepareStatement(sql)) {
            if (params != null) {
                for (int i = 0; i < params.length; i++) {
                    ps.setObject(i + 1, params[i]); // JDBC parameter indexes are 1-based
                }
            }
            ps.executeUpdate();
        } catch (SQLException e) {
            LOG.log(Level.SEVERE, "Failed to execute SQL: " + sql, e);
        }
    }

    /**
     * Reads {@link #CONFIG_RESOURCE} from the classpath.
     *
     * @return the loaded settings; possibly incomplete if reading failed part-way
     * @throws IllegalStateException if the resource does not exist
     */
    private static Properties loadConfig() {
        Properties prop = new Properties();
        try (InputStream is = JdbcUtil.class.getResourceAsStream(CONFIG_RESOURCE)) {
            if (is == null) {
                throw new IllegalStateException(CONFIG_RESOURCE + " not found on the classpath");
            }
            prop.load(is);
        } catch (IOException e) {
            // Keep the original best-effort behaviour: continue with whatever was read.
            LOG.log(Level.SEVERE, "Failed to read " + CONFIG_RESOURCE, e);
        }
        return prop;
    }

    /**
     * Loads the configured JDBC driver class, if one is configured.
     *
     * @param driverClass fully-qualified driver class name, or {@code null}/blank to skip
     */
    private static void loadDriver(String driverClass) {
        if (driverClass == null || driverClass.trim().isEmpty()) {
            // Nothing to load explicitly; JDBC 4 drivers on the classpath register themselves.
            return;
        }
        try {
            Class.forName(driverClass.trim());
        } catch (ClassNotFoundException e) {
            // Not fatal: DriverManager may still find a self-registered driver for the URL.
            LOG.log(Level.WARNING, "JDBC driver class not found: " + driverClass, e);
        }
    }
}
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * WebMagic pipeline that stores the {@code title}, {@code source} and {@code type} fields of a
 * result as one row of the {@code t_news} table.
 *
 * <p>Results carrying none of the three fields are ignored, and a single missing field is
 * stored as SQL {@code NULL}; previously either case ended in a {@link NullPointerException}.
 */
public class mysqlPipeline implements Pipeline {

    /**
     * Builds and executes the INSERT statement for one crawled result.
     *
     * @param resultItems extracted fields of the page
     * @param task the crawl task the page belongs to (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String title = resultItems.get("title"); // company
        String source = resultItems.get("source"); // salary
        String type = resultItems.get("type"); // position

        if (title == null && source == null && type == null) {
            // Nothing was extracted from this page, so there is no row to write.
            return;
        }

        String sql = "INSERT INTO t_news "
                + "(title, source, type ) VALUES ( "
                + toSqlLiteral(title) + ", "
                + toSqlLiteral(source) + ", "
                + toSqlLiteral(type) + " );";
        System.out.println(sql);
        JdbcUtil.executeSQL(sql);
    }

    /**
     * Renders a value as a MySQL string literal.
     *
     * <p>Backslashes are escaped before quotes: otherwise a value ending in {@code \} would
     * swallow the closing quote and corrupt the statement. This relies on MySQL's default
     * backslash-escape mode. Bound parameters would be the more robust fix, but the
     * {@code executeSQL(String)} call used here takes a finished SQL string.
     *
     * @param value the raw text, may be {@code null}
     * @return the quoted, escaped literal, or {@code NULL} for a {@code null} value
     */
    private static String toSqlLiteral(String value) {
        if (value == null) {
            return "NULL";
        }
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'";
    }
}
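/**
 * @author your name
 * @createTime 18-8-26
 * @description Job-listing site crawler that extracts three fields. Enough records were
 *     collected; the output folder ends up holding a pile of JSON files. Solves the problem of
 *     getting only one result per page.
 */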
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * WebMagic page processor for the job search on job.oschina.net.
 *
 * <p>Pages whose URL contains {@code position} are treated as job detail pages and yield the
 * fields {@code type}, {@code source} and {@code title}. Every page is also scanned for job
 * links, which are queued as further requests. Pagination links are not followed.
 */
public class first implements PageProcessor {
    private final Site site = Site.me().setSleepTime(1000).setRetryTimes(3);

    // The spider runs this processor on several threads (see main), so the counter must be
    // atomic. Fully qualified to avoid touching the import list.
    private final java.util.concurrent.atomic.AtomicInteger count =
            new java.util.concurrent.atomic.AtomicInteger();

    /**
     * Extracts the job fields from detail pages and queues the job links found on the page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(".*position.*").match()) {
            page.putField("type", page.getHtml().xpath("//h1[@class='font-green']/text()").toString());
            page.putField("source", page.getHtml().xpath("//div[@class='col-xs-9']/div/b/text()").toString());
            page.putField("title", page.getHtml().xpath("//div[@class='col-xs-9']/div/a[1]/text()").toString());
            // Running number of detail pages processed; incremented exactly once per page.
            System.out.println(count.incrementAndGet());
        } else {
            // No fields were extracted, so keep this page away from the pipelines.
            page.setSkip(true);
        }
        List<String> urlss = page.getHtml().xpath("//a[@class='h4 name over-hide']/@href").all();
        page.addTargetRequests(urlss);
    }

    /**
     * @return the crawl settings: 1000 ms sleep between requests and 3 retries
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl at the search page with 4 threads, sending results to the console,
     * the MySQL pipeline and files under {@code D:\webmagic\}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new first())
                .addUrl("https://job.oschina.net/search?type=%E8%81%8C%E4%BD%8D%E6%90%9C%E7%B4%A2&key=&exp=0&edu=0&nat=1&city=%E5%85%A8%E5%9B%BD&p=")
                .addPipeline(new ConsolePipeline())
                .addPipeline(new mysqlPipeline())
                .addPipeline(new FilePipeline("D:\\webmagic\\"))
                .thread(4)
                .run();
    }
}
借鉴https://my.oschina.net/anxiaole/blog/783989