爬虫抓取入库
// Start of the crawl: load the tmtpost.com front page and pick out its article list items.
String url ="http://www.tmtpost.com/";
// NOTE(review): getMysqlCon() presumably opens the MySQL connection used by the statements below, and
// getPageContent() presumably downloads and parses the page using the given charset. Confirm the site
// really serves gb2312; a wrong charset would garble the Chinese text that gets stored.
// NOTE(review): nothing in this part of the file closes conn — verify it is closed further down.
Connection conn = HttpPhtomjs.getMysqlCon();Document document = HttpUtil.getPageContent(url, "gb2312");
// Debug output: dump the whole text content of the page.
System.out.println(document.text());
// Every <li> under a <ul> whose class attribute is exactly "article-list", inside a <div> that has a class attribute.
Elements lists = document.select("div[class] ul[class=article-list] li");
// Debug output: number of list items matched.
System.out.println(lists.size());
// Walk the matched list items and pull each article's fields out of its markup.
for(int i=0; i<lists.size(); i++)
{
// Headline text and link target both come from the anchor inside the item's <h3>.
String title = lists.get(i).select("h3 a").text();
// The href is taken exactly as written in the page; the insert further down prefixes it with the
// site root, so it is presumably site-relative — TODO confirm.
String newsurl = lists.get(i).select("h3 a").attr("href");
// Text of the item's div.info; stored further down as happentime without any parsing.
String time = lists.get(i).select("div[class=info]").text();
// Text of the summary paragraph.
String abs= lists.get(i).select("p[class=summary]").text();
// Unlike the other fields this keeps the tag div's markup (toString()), not just its text.
String tag = lists.get(i).select("div[class=tag]").toString();
// Skip articles that are already stored: look the link up by URL and move on to the next list
// item on a hit. flag is kept (and still set) in case code further down the loop reads it.
boolean flag = false;
// The URL is bound as a parameter instead of being concatenated into the SQL text: newsurl is
// scraped from a remote page, so a quote in it must not be able to alter or break the statement.
// NOTE(review): this checks table kenews for the raw href, while the insert further down writes
// to technews with "http://www.tmtpost.com/" prepended, so this lookup looks unlikely to ever
// match rows stored by this crawler. Confirm the intended table and value before changing it.
String sql = "select * from kenews where url = ?";
// try-with-resources closes the statement and the result set on every path, including the continue.
try (java.sql.PreparedStatement stmt = conn.prepareStatement(sql)) {
    stmt.setString(1, newsurl);
    try (java.sql.ResultSet rs = stmt.executeQuery()) {
        // A single row is enough to know the URL exists; there is no need to count them all.
        if (rs.next()) {
            System.out.println(newsurl);
            flag = true;
            continue;
        }
    }
} catch (SQLException e1) {
    // Best effort, as before: a failed lookup is reported and the article is still inserted.
    e1.printStackTrace();
}
// Store the article in technews. crawltime is the local wall-clock time of this insert,
// formatted as yyyy-MM-dd HH:mm:ss.
Date date = new Date();
DateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String ctime = format.format(date);
// All scraped values are bound as parameters instead of being concatenated into the SQL text:
// titles, summaries and tag markup routinely contain quotes, which previously broke the insert
// and allowed page content to alter the statement. The two constant columns (empty content,
// source name) stay as literals in the statement.
String Sql = "insert into technews(title,abstract,url, content,source,happentime,crawltime,tag) values(?,?,?,'','钛媒体',?,?,?)";
// Debug output: this now shows the parameterized template, not the inlined values.
System.out.print(Sql);
// try-with-resources closes the statement whether or not the insert succeeds.
try (java.sql.PreparedStatement stmt1 = conn.prepareStatement(Sql)) {
    stmt1.setString(1, title);
    stmt1.setString(2, abs);
    // The href is stored with the site root prepended, exactly as before.
    stmt1.setString(3, "http://www.tmtpost.com/" + newsurl);
    stmt1.setString(4, time);
    stmt1.setString(5, ctime);
    stmt1.setString(6, tag);
    stmt1.executeUpdate();
} catch (Exception e) {
    // Best effort, as before: one failed insert is reported and must not abort the crawl.
    e.printStackTrace();
}