// Welcome to my website.
// Uses WebMagic to crawl data from the xbjob.com site (显摆网) into a database.
package main.java.com.dbyl.tests.crawler;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.sql.Date;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.LinkedHashSet;
import java.util.List;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.mysql.jdbc.Connection;
import main.java.com.dbyl.libarary.utils.DatabaseUtils;
import main.java.com.dbyl.libarary.utils.StringTools;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Crawler for the xbjob.com job site (显摆网).
 *
 * <p>{@link #main(String[])} collects position ids from the listing pages with HtmlUnit and then
 * runs one WebMagic {@link Spider} per position detail page. {@link #process(Page)} extracts the
 * position name and address from a detail page and inserts them into the {@code xbjob} table.
 *
 * @author Mr Chen
 */
public class xbCrawler implements PageProcessor {

    /** Separator line printed around the progress messages. */
    private static final String BANNER = "==============================================================";

    /** Listing page the position ids are collected from. */
    private static final String LIST_URL = "http://www.xbjob.com/position";

    /** Prefix of a position detail URL; the position id is appended to it. */
    private static final String DETAIL_URL_PREFIX = "http://www.xbjob.com/position/detail?id=";

    /** XPath of the position links on a listing page. */
    private static final String POSITION_LINKS_XPATH = "//div[@class='s-contentleft1']/a";

    /**
     * Number of "next page" clicks performed after the first listing page, i.e. two listing pages
     * are visited in total.
     * NOTE(review): the original comment said three pages were crawled, but the original loop only
     * ever visited two; the behaviour is kept as-is - confirm which one is intended.
     */
    private static final int MAX_NEXT_PAGE_CLICKS = 1;

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    private static WebClient driver;

    /** Retained so existing references still compile; the crawler itself now uses locals. */
    static String code;

    /** Connection shared by all spiders; opened and closed by {@link #main(String[])}. */
    static Connection conn;

    /**
     * Extracts the position name and address from a detail page and stores them in the database.
     *
     * <p>Pages on which either value is missing are marked as skipped and not processed further.
     *
     * @param page the downloaded detail page
     */
    @Override
    public void process(Page page) {
        printBanner(" Start ");

        String name = page.getHtml().xpath("//div[@class='xiangqingrightbox1left-a']/text()").toString();
        String adress = page.getHtml().xpath("//div[@class='xiangqingrightbox2-b']/text()").toString();
        page.putField("name", name);
        page.putField("adress", adress);

        if (name == null || adress == null) {
            // Nothing usable on this page: skip it and stop here, otherwise the code below
            // would dereference the missing value.
            page.setSkip(true);
            return;
        }

        // The extracted values are already Java strings, so no charset round-trip is applied:
        // re-decoding platform-charset bytes as UTF-8 corrupts text on non-UTF-8 platforms.
        System.out.println("name: " + name + "===>" + adress + "\n");

        try {
            printBanner(" Insert To Database ");
            insertToDatabase(conn, name.trim(), adress.trim());
        } catch (ClassNotFoundException | SQLException e) {
            // Best effort: a failed insert must not abort the crawl of the remaining pages.
            System.err.println("Failed to insert position '" + name.trim() + "' into the database");
            e.printStackTrace();
        }

        printBanner(" The End ");
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Opens the database connection, collects the position ids and crawls every detail page.
     *
     * @param args unused
     * @throws IOException            if the listing pages cannot be loaded
     * @throws ClassNotFoundException declared for compatibility with the database utilities
     * @throws SQLException           if the connection cannot be opened or closed
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {
        // NOTE(review): host and credentials are hard-coded; consider moving them to configuration.
        conn = (new DatabaseUtils.Builder().setHost("localhost").setDbName("study").setUser("study")
                .setPassword("123").builder()).getConection();
        try {
            for (String positionId : getAllLinks()) {
                String url = DETAIL_URL_PREFIX + positionId;
                System.out.println("=====>" + url);
                // run() blocks until the spider is done, so the connection is still open
                // whenever process() is invoked.
                Spider.create(new xbCrawler()).addUrl(url).thread(1)
                        .addPipeline(new JsonFilePipeline("/Volumes/Transcend/Document/workspace/Demo/logs")).run();
            }
        } finally {
            if (conn != null) {
                conn.close();
            }
        }
    }

    /**
     * Collects the position ids from the first listing page and from the pages reached by
     * clicking the "next page" links, up to {@code MAX_NEXT_PAGE_CLICKS} clicks.
     *
     * @author young
     * @return the collected ids in encounter order, without duplicates
     * @throws IOException if a listing page cannot be loaded
     */
    public static LinkedHashSet<String> getAllLinks() throws IOException {
        // NOTE(review): the WebClient is never closed; release it once the HtmlUnit version in
        // use is confirmed (close() vs. closeAllWindows()).
        driver = new WebClient(BrowserVersion.CHROME);
        HtmlPage page = driver.getPage(LIST_URL);
        LinkedHashSet<String> codes = new LinkedHashSet<String>();
        collectCodes(page, codes);

        // The pagination links carry the target page number in their title attribute.
        int nextPageTitle = 2;
        for (int round = 1;; round++) {
            System.out.println("============>" + round);
            DomElement button = page.getFirstByXPath(nextPageXPath(nextPageTitle));
            if (button == null || round > MAX_NEXT_PAGE_CLICKS) {
                break;
            }
            page = button.click();
            nextPageTitle++;
            collectCodes(page, codes);
        }

        System.out.println(codes.size());
        return codes;
    }

    /** Builds the XPath of the pagination link whose title is the given page number. */
    private static String nextPageXPath(int pageTitle) {
        return "//div[@class='Pagination myself' and not(@disabled)]/a[@title=" + pageTitle + "]";
    }

    /** Adds the id of every position link found on the given listing page to {@code codes}. */
    private static void collectCodes(HtmlPage page, LinkedHashSet<String> codes) {
        List<?> links = page.getByXPath(POSITION_LINKS_XPATH);
        for (Object link : links) {
            String id = getCode(link.toString());
            // A link without an id would otherwise end up as "...?id=null" in the crawl list.
            if (id != null) {
                codes.add(id);
            }
        }
    }

    /**
     * Extracts the position id from the textual form of a link.
     *
     * @author young
     * @param source text to search; may be {@code null}
     * @return whatever {@code StringTools.getMatch} yields for the pattern {@code (\d+)},
     *         or {@code null} when {@code source} is {@code null}
     */
    public static String getCode(String source) {
        if (source == null) {
            return null;
        }
        String id = StringTools.getMatch(source, "(\\d+)");
        System.out.println("id:" + id);
        return id;
    }

    /**
     * Inserts one position into the {@code xbjob} table.
     *
     * @author young
     * @param conn   open database connection
     * @param name   position name
     * @param adress position address
     * @throws ClassNotFoundException never thrown here; kept so existing callers still compile
     * @throws SQLException           if {@code conn} is {@code null} or the insert fails
     */
    public static void insertToDatabase(Connection conn, String name, String adress)
            throws ClassNotFoundException, SQLException {
        if (conn == null) {
            throw new SQLException("No database connection available");
        }
        // try-with-resources releases the statement even when the insert fails.
        try (PreparedStatement ps = conn.prepareStatement("INSERT into xbjob(name,adress) values (?, ?)")) {
            ps.setString(1, name);
            ps.setString(2, adress);
            ps.executeUpdate();
        }
        System.out.println("INSERT into xbjob(name,adress) values " + name + adress);
    }

    /**
     * Appends a message to {@code report1.csv} in the working directory.
     *
     * @param message text to append
     * @throws IOException if the file cannot be opened or written
     */
    public void writeToTXT(String message) throws IOException {
        // Append mode (second argument true) keeps earlier entries; try-with-resources flushes
        // and closes the writer, and is safe when opening the file itself fails.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter("report1.csv", true))) {
            writer.write(message);
        }
    }

    /** Prints a title framed by two separator lines. */
    private static void printBanner(String title) {
        System.out.println(BANNER);
        System.out.println(title);
        System.out.println(BANNER);
    }
}