爬虫

欢迎访问我的网站
 

 

 

通过 WebMagic 将显摆网的数据爬取到数据库中

package main.java.com.dbyl.tests.crawler;


import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.sql.Date;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.LinkedHashSet;
import java.util.List;


import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.mysql.jdbc.Connection;


import main.java.com.dbyl.libarary.utils.DatabaseUtils;
import main.java.com.dbyl.libarary.utils.StringTools;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * 显摆网爬虫
 * @author Mr Chen
 *
 */
public class xbCrawler implements PageProcessor {


private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
private static WebClient driver;
static String code;
static Connection conn;


@Override
public void process(Page page) {
System.out.println("==============================================================");
System.out.println("                              Start                           ");
System.out.println("==============================================================");

page.putField("name", page.getHtml().xpath("//div[@class='xiangqingrightbox1left-a']/text()").toString());
page.putField("adress", page.getHtml().xpath("//div[@class='xiangqingrightbox2-b']/text()").toString());
//page.putField("date", page.getHtml().xpath("//div[@class='xiangqingrightbox2-b']/text()").toString());
if (null == page.getResultItems().get("name")) {
// skip this page
page.setSkip(true);
}


if (null == page.getResultItems().get("adress")) {
// skip this page
page.setSkip(true);
}


String name = page.getResultItems().get("name");
String adress = page.getResultItems().get("adress");
//SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MMdd");
try {
name = new String(name.getBytes(), "utf-8");
adress = new String(adress.getBytes(), "utf-8");
} catch (UnsupportedEncodingException e2) {
e2.printStackTrace();
}
//String tempDate = page.getResultItems().get("date");
System.out.println("name: " + name + "===>" + adress+"\n");
//Date date = null;
/* try {
if (!StringTools.isNullOrEmpty(tempDate.trim())) {
date = new Date(sdf.parse(tempDate.trim()).getTime());
} else {
date = new Date(System.currentTimeMillis());
}
} catch (ParseException e1) {
e1.printStackTrace();
}*/
try {
System.out.println("==============================================================");
System.out.println("                    Insert To Database                        ");
System.out.println("==============================================================");
//insertToDatabase(conn,StringTools.chineseToUnicode(name.trim()), price, cdate);
insertToDatabase(conn,(name.trim()), (adress.trim()));
// writeToTXT(name.trim()+","+adress.trim()+","+tempDate.trim()+"\n");
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}


System.out.println("==============================================================");
System.out.println("                             The End                          ");
System.out.println("==============================================================");
}


@Override
public Site getSite() {
return site;
}


public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {
conn = (new DatabaseUtils.Builder().setHost("localhost").setDbName("study").setUser("study")
.setPassword("123").builder()).getConection();
for (String code : getAllLinks()) {
String url = "http://www.xbjob.com/position/detail?id=" + code;
System.out.println("=====>" + url);

Spider.create(new xbCrawler()).addUrl(url).thread(1)
.addPipeline(new JsonFilePipeline("/Volumes/Transcend/Document/workspace/Demo/logs")).run();


}


}


/**获得id 并跳转到下一页
* @author young
* @return
* @throws IOException
*/
public static LinkedHashSet<String> getAllLinks() throws IOException {
driver = new WebClient(BrowserVersion.CHROME);
HtmlPage page = driver.getPage("http://www.xbjob.com/position");
LinkedHashSet<String> map = new LinkedHashSet<String>();
        int p=2;//title上面的数字
        //#selectPositionList > div.s_content > div.s_contentbox.active
DomElement button = page.getFirstByXPath("//div[@class='Pagination myself' and not(@disabled)]/a[@title="+p+"]");
List<?> target = page.getByXPath("//div[@class='s-contentleft1']/a");
        
for (Object link : target) {
code = getCode(link.toString());
map.add(code);
}
int i = 0;
while (null != page) {
i++;
System.out.println("============>" + i);
button = page.getFirstByXPath("//div[@class='Pagination myself' and not(@disabled)]/a[@title="+p+"]");
if (null == button) {
break;
} else {
if(i<2)//爬3页数据
{
page = button.click();
p++;
target = null;
target = page.getByXPath("//div[@class='s-contentleft1']/a");
for (Object link : target) {
code = getCode(link.toString());
map.add(code);
  }
}
else{
break;
}
}
}


System.out.println(map.size());
return map;


}


/**
* @author young
* @param source
* @return
*/
public static String getCode(String source) {
if (null != source) {
String temp = StringTools.getMatch(source, "(\\d+)");
System.out.println("id:"+temp);
return temp;
} else {
return null;
}


}


/**
* @author young
* @param conn
* @param name
* @param price
* @param cdate
* @throws ClassNotFoundException
* @throws SQLException
*/
public static void insertToDatabase(Connection conn, String name, String adress)
throws ClassNotFoundException, SQLException {


//Date date = new Date(System.currentTimeMillis());
PreparedStatement ps = conn.prepareStatement("INSERT into xbjob(name,adress) values (?, ?)");


ps.setString(1, name);
ps.setString(2, adress);
//ps.setDate(3, date);
ps.executeUpdate();
System.out.println("INSERT into xbjob(name,adress) values " + name + adress);


}
public void writeToTXT(String message) throws IOException {
BufferedWriter bf = null;
try {
// set true ,avoid
bf = new BufferedWriter(new FileWriter("report1.csv", true));
bf.write(message);
bf.flush();


} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
bf.close();
}




}






}

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值