2、ip抓取第二阶段需求:
根据第一阶段抓取到的数据,向另一个IP查询网站(数据查询中心 http://199604.com/ip/)发送请求,获取返回的数据,从中提取IP相关属性的数据并保存。
3、程序文档分析:
3.1根据已有的数据向IP查询网站 http://199604.com/ip/ 发送请求,发送请求的方法为requestPost();
并将返回的html文本存放在IpHTML.txt;利用分页查询SQL语句每次取出100条记录,然后对这100条记录循环发送请求
3.2对接收的文本进行解析、过滤
第一次过滤:过滤完毕
第二次过滤:过滤完毕存进IpOperator1.txt
3.3对已过滤过的IP数据进行处理
3.3.1执行过程中遇到抓取数据不完整的IP,将它写入BugIp.txt文本
3.3.2合法的Ip段,存入ipdata数据表
3.3.3有问题的Ip(即起始IP查询到的城市与结束IP查询到的城市不一致)存放到ipspecial数据表。
4、程序性能描述:
第一阶段抓到的数据:共3594条
合法ip:3143条
有问题Ip:450条
丢失数据:1条
全程跑完历时:80分钟。期间抛出一次异常。
5、第二阶段工作已完毕(耗时2天)
IpDemo2.java
package com.htjf.ip2;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.helper.StringUtil;
import com.mysql.jdbc.StringUtils;
import com.htjf.ip.IpModel;
/**
 * Second-stage IP crawler.
 *
 * <p>Reads IP ranges page by page from the database through {@code SqlData},
 * posts the start and end address of every range to the lookup site, strips
 * the returned HTML down to the result row and stores the parsed attributes.
 * Ranges whose start and end address resolve to the same city are inserted
 * via {@code SqlData.insertIp}; all other ranges go to
 * {@code SqlData.insertIpSpecial}.
 *
 * <p>Intermediate text is written to fixed files on drive {@code E:}
 * ({@code IpHTML.txt}, {@code IpOperator1.txt}); addresses whose result row is
 * incomplete are appended to {@code BugIp.txt}.
 *
 * @author Administrator
 */
public class IpDemo2 {

    /** Address of the lookup page that receives the POST request. */
    private static final String LOOKUP_URL = "http://199604.com/ip/";
    /** Receives the tag-stripped response of the most recent request. */
    private static final String HTML_DUMP_FILE = "E://IpHTML.txt";
    /** Receives the single result row extracted from the dump file. */
    private static final String OPERATOR_FILE = "E://IpOperator1.txt";
    /** Append-only list of addresses whose result row was incomplete. */
    private static final String BUG_IP_FILE = "E://BugIp.txt";
    /** Marker substituted for every {@code <tr>} so table rows can be found after tag removal. */
    private static final String ROW_MARKER = "ipOperator:";
    /** Regular expression matching any remaining HTML tag. */
    private static final String TAG_REGEX = "<(.[^>]*)>";
    /** Compiled once instead of once per input line. */
    private static final Pattern ROW_MARKER_PATTERN = Pattern.compile(ROW_MARKER);
    /** Whitespace pattern used to compact the result row (same expression as before). */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s*|\t|\r|\n");
    /** Number of rows fetched from the database per page. */
    private static final int PAGE_SIZE = 100;
    /** A complete result row has six comma-separated fields (indices 0 to 5). */
    private static final int EXPECTED_FIELD_COUNT = 6;

    /**
     * Program entry point: walks through all rows reported by
     * {@code SqlData.findIpCount()} in pages of {@value #PAGE_SIZE}.
     *
     * @param args optional; {@code args[0]} is the row offset to start from.
     *             Defaults to 0. (The previous version was hard-wired to 3339,
     *             a leftover from resuming an interrupted run; pass that value
     *             explicitly to resume from there.)
     */
    public static void main(String args[]) {
        System.out.println("程序入口");
        SqlData sqlData = new SqlData();
        int num = sqlData.findIpCount();
        int k = parseStartOffset(args);
        while (k < num) {
            List<IpModel> iplist = sqlData.findIp(k, PAGE_SIZE);
            if (iplist.isEmpty()) {
                // An empty page would never advance k; stop instead of spinning forever.
                System.err.println("No rows returned at offset " + k + ", stopping");
                break;
            }
            int j = 0;
            for (IpModel ipmodel : iplist) {
                System.out.println("===" + j);
                j++;
                processRange(sqlData, ipmodel);
            }
            k = k + iplist.size();
        }
    }

    /** Reads the optional start offset from the command line; falls back to 0. */
    private static int parseStartOffset(String[] args) {
        if (args == null || args.length == 0) {
            return 0;
        }
        try {
            return Math.max(0, Integer.parseInt(args[0].trim()));
        } catch (NumberFormatException e) {
            System.err.println("Invalid start offset '" + args[0] + "', starting from 0");
            return 0;
        }
    }

    /** Looks up both ends of one range and stores the result in the matching table. */
    private static void processRange(SqlData sqlData, IpModel ipmodel) {
        String startIp = ipmodel.getStartIp();
        String rawEndIp = ipmodel.getEndIp();
        if (startIp == null || rawEndIp == null) {
            System.err.println("Skipping range with missing start or end address");
            return;
        }

        System.out.println("startIp:" + startIp);
        Map<String, String> startInfo = lookup(startIp);
        if (startInfo == null) {
            return;
        }

        String endIp = rawEndIp.trim();
        System.out.println("endIp:" + endIp);
        Map<String, String> endInfo = lookup(endIp);
        if (endInfo == null) {
            return;
        }

        // NOTE(review): the lookup result also carries "county", which is not
        // copied onto either bean here (unchanged from the original) - confirm
        // whether that is intended.
        IpData ipdata = new IpData();
        ipdata.setIpId(startInfo.get("ipId"));
        ipdata.setStartIp(startInfo.get("startIp"));
        ipdata.setCountry(startInfo.get("country"));
        ipdata.setProvince(startInfo.get("province"));
        ipdata.setCity(startInfo.get("city"));
        ipdata.setOperator(startInfo.get("operator"));

        IpSpecial ipSpecial = new IpSpecial();
        ipSpecial.setIpId(endInfo.get("ipId"));
        // The lookup stores the queried address under "startIp"; here it is the end address.
        ipSpecial.setEndIp(endInfo.get("startIp"));
        ipSpecial.setCountry(endInfo.get("country"));
        ipSpecial.setProvince(endInfo.get("province"));
        ipSpecial.setCity(endInfo.get("city"));
        ipSpecial.setOperator(endInfo.get("operator"));

        if (ipdata.getCity().equals(ipSpecial.getCity())) {
            ipdata.setEndIp(ipSpecial.getEndIp());
            sqlData.insertIp(ipdata);
        } else {
            ipSpecial.setStartIp(ipdata.getStartIp());
            sqlData.insertIpSpecial(ipSpecial);
            System.out.println("---特殊IP---");
        }
    }

    /**
     * Runs {@link #requestPost(String)} and converts an I/O failure into a
     * {@code null} result so that one bad request does not abort the whole run.
     */
    private static Map<String, String> lookup(String ip) {
        try {
            return requestPost(ip);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("Lookup failed for " + ip + ", range skipped");
            return null;
        }
    }

    /**
     * Posts one address to the lookup page, writes the tag-stripped response
     * to {@code IpHTML.txt}, then runs both filter steps on it.
     *
     * @param ipString the address to look up
     * @return map with the keys {@code ipId}, {@code startIp}, {@code country},
     *         {@code province}, {@code city}, {@code county} and
     *         {@code operator}; missing attributes are empty strings
     * @throws IOException if the request or one of the file operations fails
     */
    public static Map<String, String> requestPost(String ipString)
            throws IOException {
        URL url = new URL(LOOKUP_URL);
        URLConnection connection = url.openConnection();
        connection.setConnectTimeout(500000);
        connection.setDoOutput(true); // switch the connection to output mode so a body can be posted
        connection.setReadTimeout(300000);

        OutputStreamWriter out = new OutputStreamWriter(
                connection.getOutputStream(), "8859_1");
        try {
            out.write("ip=" + ipString + "&action=2"); // form body of the POST
            out.flush();
        } finally {
            out.close();
        }

        BufferedReader reader = null;
        BufferedWriter dump = null;
        try {
            // NOTE(review): the response is decoded with the platform default
            // charset, as before - confirm this matches the site's encoding.
            reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream()));
            dump = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(HTML_DUMP_FILE)));
            String line;
            while ((line = reader.readLine()) != null) {
                // Mark row starts and cell ends before all remaining tags are dropped.
                line = line.replaceAll("<tr>", ROW_MARKER);
                line = line.replaceAll("</td>", ",");
                line = line.replaceAll(TAG_REGEX, "");
                dump.write(line);
                dump.newLine();
            }
            dump.flush();
        } finally {
            closeQuietly(dump);
            closeQuietly(reader);
        }

        System.out.println("第一次过滤完毕,开始下一轮过滤");
        String ipstr = saveIPOperator();
        System.out.println("第二次过滤完毕");
        return saveIPOperator2(ipstr, ipString);
    }

    /**
     * Filter step one: scans {@code IpHTML.txt} for row markers and copies the
     * line containing the second marker, with whitespace removed, to
     * {@code IpOperator1.txt}.
     *
     * @return the compacted line, or an empty string when the dump contains
     *         fewer than two markers
     * @throws IOException if reading or writing the files fails
     */
    public static String saveIPOperator() throws IOException {
        BufferedReader bufr = null;
        BufferedWriter bufw = null;
        String ipstr = "";
        try {
            bufr = new BufferedReader(new InputStreamReader(
                    new FileInputStream(HTML_DUMP_FILE)));
            bufw = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(OPERATOR_FILE)));
            int k = 1; // running number of the marker occurrence
            String line;
            while ((line = bufr.readLine()) != null) {
                Matcher m = ROW_MARKER_PATTERN.matcher(line);
                while (m.find()) {
                    if (k == 2) {
                        String line2 = WHITESPACE_PATTERN.matcher(line).replaceAll("");
                        ipstr = line2;
                        bufw.write(line2);
                        bufw.newLine();
                    }
                    k++;
                }
            }
            bufw.flush();
        } finally {
            closeQuietly(bufw);
            closeQuietly(bufr);
        }
        return ipstr;
    }

    /**
     * Filter step two: splits the result row into its attributes.
     *
     * <p>The row is expected to look like {@code marker:f0,f1,f2,f3,f4,f5};
     * fields 1 to 5 are stored as country, province, city, county and
     * operator. A missing field becomes an empty string. When the row has
     * fewer than six fields (including an empty or marker-less row) the
     * address is appended to {@code BugIp.txt}.
     *
     * @param ipstr    the compacted row produced by {@link #saveIPOperator()}
     * @param ipString the address that was looked up; stored under {@code startIp}
     * @return the attribute map described at {@link #requestPost(String)}
     * @throws IOException if {@code BugIp.txt} cannot be written
     */
    public static Map<String, String> saveIPOperator2(String ipstr,
            String ipString) throws IOException {
        String[] ipos = splitFields(ipstr);

        // HH = 24-hour clock; the former "hh" produced ambiguous 12-hour stamps.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
        String ipId = getRandomString(14) + sdf.format(new Date());

        Map<String, String> ipmap = new HashMap<String, String>();
        ipmap.put("ipId", ipId);
        ipmap.put("startIp", ipString);
        ipmap.put("country", fieldAt(ipos, 1));
        ipmap.put("province", fieldAt(ipos, 2));
        ipmap.put("city", fieldAt(ipos, 3));
        ipmap.put("county", fieldAt(ipos, 4));
        ipmap.put("operator", fieldAt(ipos, 5));

        // Same condition as before: a row counts as incomplete as soon as the
        // last field (operator) is missing, which covers every shorter row.
        boolean incomplete = ipos.length < EXPECTED_FIELD_COUNT;
        System.out.println("----------" + (incomplete ? ipString : ""));
        if (incomplete) {
            appendBugIp(ipString);
        }
        return ipmap;
    }

    /** Returns the comma-separated fields after the first colon, or an empty array if there are none. */
    private static String[] splitFields(String ipstr) {
        if (ipstr == null) {
            return new String[0];
        }
        String[] parts = ipstr.split("\\:");
        if (parts.length < 2) {
            return new String[0];
        }
        return parts[1].split(",");
    }

    /** Returns {@code fields[index]}, or an empty string when the row is too short. */
    private static String fieldAt(String[] fields, int index) {
        return index < fields.length ? fields[index] : "";
    }

    /** Appends one address to the list of incompletely resolved addresses. */
    private static void appendBugIp(String ipString) throws IOException {
        BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(BUG_IP_FILE, true)));
        try {
            bufw.write(String.valueOf(ipString));
            bufw.newLine();
            bufw.flush();
        } finally {
            closeQuietly(bufw);
        }
    }

    /** Closes a resource, ignoring failures; writers are flushed explicitly before this is called. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // best effort: nothing useful can be done if close itself fails
        }
    }

    /**
     * Builds a random string from lower-case letters and digits.
     *
     * @param length number of characters to generate; values below 1 yield an
     *               empty string
     * @return the generated string
     */
    public static String getRandomString(int length) {
        String base = "abcdefghijklmnopqrstuvwxyz0123456789";
        Random random = new Random();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < length; i++) {
            int number = random.nextInt(base.length());
            sb.append(base.charAt(number));
        }
        return sb.toString();
    }
}
/**
 * JDBC helper for the tables {@code iptable} (source ranges), {@code ipdata}
 * (ranges whose ends match) and {@code ipspecial} (ranges whose ends differ).
 *
 * <p>All instances share one static {@link #connection}; constructing a new
 * instance replaces it. Every query method therefore works on the connection
 * opened by the most recently created instance.
 */
class SqlData {
    /** Not assigned by this class; kept because the field is publicly visible. */
    public static String username;
    /** Not assigned by this class; kept because the field is publicly visible. */
    public static String password;
    /** Connection shared by all instances; {@code null} if connecting failed. */
    public static Connection connection;
    /** No longer assigned: statements are now local and closed after use. Kept for compatibility. */
    public static PreparedStatement ps;

    private static final String INSERT_IPDATA_SQL =
            "insert into ipdata (ip_id,country,province,city,county,operator,startIp,endIp) values (?,?,?,?,?,?,?,?)";
    private static final String INSERT_IPSPECIAL_SQL =
            "insert into ipspecial (ip_id,country,province,city,county,operator,startIp,endIp) values (?,?,?,?,?,?,?,?)";

    /**
     * Loads the MySQL driver and opens the shared connection. Failures are
     * reported on stderr and leave {@link #connection} unchanged.
     */
    public SqlData() {
        String url = "jdbc:mysql://127.0.0.1:3306/ipselect?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull";
        // Locals are named differently from the static fields to avoid shadowing them.
        String dbUser = "root";
        String dbPassword = "";
        try {
            Class.forName("com.mysql.jdbc.Driver");
            connection = DriverManager.getConnection(url, dbUser, dbPassword);
        } catch (ClassNotFoundException cnfex) {
            // driver class missing from the classpath
            System.err.println("装载 JDBC/ODBC 驱动程序失败");
            cnfex.printStackTrace();
        } catch (SQLException sqlex) {
            // database unreachable or credentials rejected
            System.err.println("无法连接数据库");
            sqlex.printStackTrace();
        }
    }

    /**
     * Counts the rows of {@code iptable}.
     *
     * @return the row count, or 0 when the query fails
     */
    public int findIpCount() {
        int num = 0;
        PreparedStatement statement = null;
        ResultSet rs = null;
        try {
            statement = prepare("select count(*) from iptable");
            rs = statement.executeQuery();
            if (rs.next()) {
                num = rs.getInt(1);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(rs, statement);
        }
        System.out.println("====count:" + num);
        return num;
    }

    /**
     * Reads one page of ranges from {@code iptable}.
     *
     * @param offset number of rows to skip
     * @param amount maximum number of rows to return
     * @return the rows read; empty when the query fails or no rows are left
     */
    public List<IpModel> findIp(int offset, int amount) {
        List<IpModel> listiptable = new ArrayList<IpModel>();
        PreparedStatement statement = null;
        ResultSet rs = null;
        try {
            // NOTE(review): the query has no ORDER BY, so the row order across
            // pages is whatever the server happens to return - confirm this is
            // stable enough for paging.
            statement = prepare("select * from iptable limit ?,?");
            statement.setInt(1, offset);
            statement.setInt(2, amount);
            rs = statement.executeQuery();
            while (rs.next()) {
                String startIp = rs.getString("startIp");
                String endIp = rs.getString("endIp");
                IpModel row = new IpModel();
                row.setIpId(rs.getString("ip_id"));
                row.setStartIp(startIp);
                System.out.println("iptable:" + startIp);
                row.setEndIp(endIp);
                System.out.println("iptable:" + endIp);
                listiptable.add(row);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(rs, statement);
        }
        return listiptable;
    }

    /**
     * Inserts a range into {@code ipdata} unless a row with the same start and
     * end address already exists.
     *
     * @param ipData the range to store
     */
    public void insertIp(IpData ipData) {
        // Query through this instance: creating a new SqlData here used to open
        // (and never close) a fresh connection for every single insert.
        if (!findIpData(ipData).isEmpty()) {
            System.out.println("已存在有数据");
            return;
        }
        boolean inserted = executeInsert(INSERT_IPDATA_SQL,
                ipData.getIpId(), ipData.getCountry(), ipData.getProvince(),
                ipData.getCity(), ipData.getCounty(), ipData.getOperator(),
                ipData.getStartIp(), ipData.getEndIp());
        if (inserted) {
            System.out.println("------记录插入成功------");
        }
    }

    /**
     * Finds the {@code ipdata} rows with the given start and end address.
     *
     * @param ipData11 carries the start and end address to search for
     * @return one bean per matching row (id, start and end address filled in);
     *         empty when nothing matches or the query fails
     */
    public List<IpData> findIpData(IpData ipData11) {
        List<IpData> list = new ArrayList<IpData>();
        PreparedStatement statement = null;
        ResultSet rs = null;
        try {
            statement = prepare("select * from ipdata where startIp=? and endIp=?");
            statement.setString(1, ipData11.getStartIp());
            statement.setString(2, ipData11.getEndIp());
            rs = statement.executeQuery();
            while (rs.next()) {
                // A new bean per row; reusing one instance made all entries identical.
                IpData row = new IpData();
                row.setIpId(rs.getString("ip_id"));
                row.setStartIp(rs.getString("startIp"));
                row.setEndIp(rs.getString("endIp"));
                list.add(row);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(rs, statement);
        }
        return list;
    }

    /**
     * Inserts a range into {@code ipspecial} unless a row with the same start
     * and end address already exists.
     *
     * @param ipData the range to store
     */
    public void insertIpSpecial(IpSpecial ipData) {
        if (!findIpSpecial(ipData).isEmpty()) {
            System.out.println("已存在有数据");
            return;
        }
        boolean inserted = executeInsert(INSERT_IPSPECIAL_SQL,
                ipData.getIpId(), ipData.getCountry(), ipData.getProvince(),
                ipData.getCity(), ipData.getCounty(), ipData.getOperator(),
                ipData.getStartIp(), ipData.getEndIp());
        if (inserted) {
            System.out.println("-------特殊Ip插入成功------");
        }
    }

    /**
     * Finds the {@code ipspecial} rows with the given start and end address.
     *
     * @param ipSpecial carries the start and end address to search for
     * @return one bean per matching row (id, start and end address filled in);
     *         empty when nothing matches or the query fails
     */
    public List<IpSpecial> findIpSpecial(IpSpecial ipSpecial) {
        List<IpSpecial> list = new ArrayList<IpSpecial>();
        PreparedStatement statement = null;
        ResultSet rs = null;
        try {
            statement = prepare("select * from ipspecial where startIp=? and endIp=?");
            statement.setString(1, ipSpecial.getStartIp());
            statement.setString(2, ipSpecial.getEndIp());
            rs = statement.executeQuery();
            while (rs.next()) {
                IpSpecial row = new IpSpecial();
                row.setIpId(rs.getString("ip_id"));
                row.setStartIp(rs.getString("startIp"));
                row.setEndIp(rs.getString("endIp"));
                list.add(row);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(rs, statement);
        }
        return list;
    }

    /**
     * Prepares a statement on the shared connection. A missing connection is
     * reported as {@link SQLException} so callers handle it like any other
     * database failure.
     */
    private PreparedStatement prepare(String sql) throws SQLException {
        if (connection == null) {
            throw new SQLException("No database connection available");
        }
        return connection.prepareStatement(sql);
    }

    /**
     * Runs an insert whose placeholders are all strings.
     *
     * @return {@code true} when the statement executed without error
     */
    private boolean executeInsert(String sql, String... values) {
        PreparedStatement statement = null;
        try {
            statement = prepare(sql);
            for (int i = 0; i < values.length; i++) {
                statement.setString(i + 1, values[i]);
            }
            statement.executeUpdate();
            return true;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        } finally {
            closeQuietly(null, statement);
        }
    }

    /** Closes result set and statement (either may be {@code null}), ignoring close failures. */
    private static void closeQuietly(ResultSet rs, PreparedStatement statement) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException ignored) {
                // best effort
            }
        }
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException ignored) {
                // best effort
            }
        }
    }
}