1 mysql5.1 创建表
-- 创建表 id,开奖时间,
CREATE TABLE `ssq` (
`id` bigint(20) NOT NULL AUTO_INCREMENT,
`opendate` varchar(20) DEFAULT NULL,
`openseq` varchar(20) DEFAULT NULL,
`winningnum` varchar(40) DEFAULT NULL,
`winningred1` varchar(2) DEFAULT NULL,
`winningred2` varchar(2) DEFAULT NULL,
`winningred3` varchar(2) DEFAULT NULL,
`winningred4` varchar(2) DEFAULT NULL,
`winningred5` varchar(2) DEFAULT NULL,
`winningred6` varchar(2) DEFAULT NULL,
`winningblue` varchar(2) DEFAULT NULL,
`winningnumber` tinyint(10) DEFAULT NULL,
`province` varchar(60) DEFAULT NULL,
`secondwinnum` int(10) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `ssq_id_pk` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
2 抓取,抓取数据,插入表中。需要的jar包
commons-logging-1.1.1.jar
htmlparser.jar
httpclient-4.2.3.jar
httpcore-4.2.2.jar
mysql-connector-java-5.1.14.jar
原先的Httpclient 现在改名为HttpComponents,可去http://hc.apache.org/ 下载,htmlparser 可去http://sourceforge.net/projects/htmlparser/files/htmlparser/1.6/htmlparser1_6_20060610.zip/download 下载
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
public class DoubleBall {
HttpClient httpclient = new DefaultHttpClient();
public static void main(String[] args) {
Connection conn = null;
Statement statement = null;
try {
Class.forName("com.mysql.jdbc.Driver");
conn = DriverManager.getConnection(
"jdbc:mysql://localhost:3306/caipiao?rewriteBatchedStatements=true", "root", "root");
conn.setAutoCommit(false);
statement = conn.createStatement();
DoubleBall ball = new DoubleBall();
for (int i = 1; i < 75; i++) {
// System.out.println("-- 第" + i + "页双色球数据");
String url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_" + i
+ ".html";
String content = ball.getInternetConent(url);
ball.getDoubleBallData(content,statement);
}
statement.executeBatch();
conn.commit();
System.out.println("结束");
} catch (Exception e) {
System.out.println(e.getMessage());
}finally{
try {
statement.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
private void getDoubleBallData(String content,Statement statement) {
Parser parser = Parser.createParser(content, "UTF-8");
try {
NodeList nodeList = parser
.extractAllNodesThatMatch(new NodeFilter() {
public boolean accept(Node node) {
if (node instanceof TableTag)//
return true;
return false;
}
});
if(nodeList.size()>0){
for(int i = 0; i < nodeList.size(); ++i){
if(nodeList.elementAt(i) instanceof TableTag){
TableTag tag = (TableTag) nodeList.elementAt(i);
TableRow[] rows = tag.getRows();
for (int j = 2; j < rows.length-1; ++j) { // 最后一个是分页信息,只要数据
String sql = "insert into ssq(opendate,openseq,winningnum,winningred1,winningred2,winningred3,winningred4,winningred5,winningred6,winningblue,winningnumber,province,secondwinnum) values(";
TableRow row = (TableRow) rows[j];
TableColumn[] columns = row.getColumns();
for (int k = 0; k < columns.length; ++k) {
String info = columns[k].toPlainTextString().trim();
if(k==0){
//System.out.print("日期:"+info+" ");
sql+="\'"+info+"\',";
}
if(k==1){
//System.out.print("期号:"+info+" ");
sql+="\'"+info+"\',";
}
if(k==2){
String[] sarr = info.split("\r\n ");
String serc = "";
for (int l = 0; l < sarr.length; l++) {
if(sarr.toString().trim().length()>0){
serc += sarr[l].trim()+" ";
}
}
sql+="\'"+serc.trim()+"\',";
String[] winningnums = serc.trim().split(" ");
for (int l = 0; l < winningnums.length; l++) {
sql+="\'"+winningnums[l]+"\',";
}
//System.out.print("中奖号码:"+serc+" ");
}
if(k==4){
//System.out.print("一等奖中奖注数:"+info.replace("\r\n", "")+" ");
if(info!=null && info.length()>3){
String[] firstwiner = info.split("\r\n ");
if(firstwiner.length>1){
sql+=""+firstwiner[0]+",";
sql+="\'"+firstwiner[1]+"',";
}
}else{
sql+="0,";
sql+="\'0\',";
}
}
if(k==5){
//System.out.print("二等奖中奖注数:"+info.replace("\n", "")+"\n");
if(info!=null && info.length()>0){
sql+=info.replace("\n", "");
}else{
sql+=0;
}
}
}
sql += ")";
statement.addBatch(sql);
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
private String getInternetConent(String url){
String content = "";
try {
// 生成一个请求对象
HttpGet httpget = new HttpGet(url);
httpget.setHeader("referer", "http://www.baidu.com/");
// 执行请求
HttpResponse response = httpclient.execute(httpget);
HttpEntity entity = response.getEntity();
if (entity != null) {
//响应内容的长度
//long length = entity.getContentLength();
//响应内容
content = EntityUtils.toString(entity);
}
httpget.abort();
} catch (Exception e) {
httpclient.getConnectionManager().shutdown();
}
return content;
}
}
OK 因为数据比较大,程序执行需要时间,静静等待。