1、工作的背景及意义:
由于经常需要查找IP的运营商等相关属性,数据量不大时手动查找尚可应付;但当数据量达到几百、几千条时,手动查找就非常费力了。这类机械性的工作最适合交给计算机处理,没有必要人工逐条查找。
2、ip抓取第一阶段需求:
从这个链接http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest,抓取含有CN、ipv4的IP数据,再发送请求到数据查询中心http://wq.apnic.net/apnic-bin/whois.pl,获取返回的数据,从中提取ip相关属性的数据并保存。
3、程序文档分析:
3.1抓取(http:ftp……)链接下txt文本(含有CN\ipv4)的数据,将抓取到的数据存进HTJF.txt。【通过程序中的getMail()和savetxt()方法完成】
3.2发送请求到Ip查询网站,并接收返回的html文本(通过程序中的readtxt()和testPost()方法完成)。
3.3对接收的文本进行解析、过滤
第一次过滤:过滤完毕存进IpHTML.txt
第二次过滤:过滤完毕存进Ip1.txt
第三次过滤:过滤完毕存进Ip2.txt
最后入库:把IP的相关属性封装成对象,存入数据库前先查询是否已存在该IP段,
存在:不执行
不存在:执行JDBC操作
4、程序性能描述:
从7万多条数据中抓取到3千多条数据,
全程跑完历时:50分钟。期间抛出一次异常。
代码如下:
package com.htjf.ip;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
/**
* @author Qixuan
*
*/
public class IpDemo {
/**
 * Program entry point: runs the whois lookup stage of the pipeline.
 * <p>
 * Three progress banners are printed, but only {@link #readtxt()} is actually
 * invoked. The crawl stage ({@code getMail()}) and the save stage
 * ({@code savetxt(list)}) are disabled here, so the input file read by
 * {@code readtxt()} must already exist. Any {@link Exception} raised by the
 * lookup stage is caught and its stack trace printed.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the signature; every Exception raised by
 *         the lookup stage is caught inside the method
 */
public static void main(String args[]) throws IOException {
    // Stage 1 (crawl) is disabled; it would be: List<String> list = getMail();
    System.out.println("爬取");
    // Stage 2 (save) is disabled; it would be: savetxt(list);
    System.out.println("保存");
    // Stage 3: post each stored address to the whois service and filter the reply.
    System.out.println("发送请求");
    try {
        readtxt();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Downloads the APNIC delegation statistics file and collects every line
 * that contains the literal marker {@code CN|ipv4}.
 * <p>
 * Each matching line is added exactly once, and the network stream is closed
 * before the method returns, whether it completes normally or throws.
 *
 * @return the matching lines, in the order they appear in the file
 * @throws Exception if the URL cannot be opened or reading the stream fails
 */
public static List<String> getMail() throws Exception {
    URL url = new URL(
            "http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest");
    List<String> list = new ArrayList<String>();
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
            url.openStream()))) {
        String line;
        while ((line = br.readLine()) != null) {
            // The marker is a fixed string, so a substring test replaces the regex.
            if (line.contains("CN|ipv4")) {
                list.add(line);
            }
        }
    }
    return list;
}
/**
 * Writes the crawled records to {@code E://HTJF.txt}, one record per line.
 * <p>
 * Each input record is split on {@code |}; fields 3, 2 and 1 (zero-based) are
 * written in that order, separated by commas. Records that are {@code null}
 * or have fewer than four fields are reported on standard error and skipped
 * instead of aborting the whole export. I/O errors while writing are printed
 * and not rethrown; the writer is always closed.
 *
 * @param listarray the records to store; {@code null} is treated as an empty list
 * @throws FileNotFoundException if the output file cannot be created or opened
 */
public static void savetxt(List<String> listarray)
        throws FileNotFoundException {
    List<String> rows = (listarray != null) ? listarray : new ArrayList<String>();
    try (BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream("E://HTJF.txt")))) {
        System.out.println("有多少条记录:" + rows.size());
        System.out.println("爬到的资源");
        for (String mail : rows) {
            System.out.println("====>" + mail);
            String[] str = (mail == null) ? new String[0] : mail.split("\\|");
            if (str.length < 4) {
                System.err.println("skipping malformed record: " + mail);
                continue;
            }
            bufw.write(str[3]);
            bufw.write(",");
            bufw.write(str[2]);
            bufw.write(",");
            bufw.write(str[1]);
            bufw.newLine();
        }
    } catch (FileNotFoundException e) {
        // Callers are told about an unusable output path, as the signature declares.
        throw e;
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Reads {@code E://HTJF.txt} line by line and submits the first
 * comma-separated field of each line to {@link #testPost(String)}.
 * <p>
 * Lines whose first field is empty or whitespace-only are skipped. The input
 * file is closed when the method returns or throws.
 *
 * @throws IOException if the file cannot be read, or if {@code testPost}
 *         fails for any address (processing stops at the first failure)
 */
public static void readtxt() throws IOException {
    try (BufferedReader bufr = new BufferedReader(new InputStreamReader(
            new FileInputStream("E://HTJF.txt")))) {
        String line;
        while ((line = bufr.readLine()) != null) {
            // indexOf avoids the empty array that split(",") returns for a line that is only ",".
            int comma = line.indexOf(',');
            String searchtextIp = (comma < 0) ? line : line.substring(0, comma);
            if (searchtextIp.trim().isEmpty()) {
                continue;
            }
            System.out.println("ip地址:" + searchtextIp);
            testPost(searchtextIp);
        }
    }
}
/**
 * Sends one whois query as an HTTP POST and stores the tag-stripped reply.
 * <p>
 * The form body {@code searchtext=<ip>&form_type=advanced} is posted to
 * {@code http://wq.apnic.net/apnic-bin/whois.pl}. Every response line has
 * its markup tags removed and is trimmed, then written to
 * {@code E://IpHTML.txt}. After that file has been closed,
 * {@link #saveLastIP()} is called to run the next filter stage.
 *
 * @param iptest the address (or other search text) to look up; must not be null
 * @throws IOException if the request, the response, or the output file fails
 * @throws IllegalArgumentException if {@code iptest} is null
 */
public static void testPost(String iptest) throws IOException {
    if (iptest == null) {
        throw new IllegalArgumentException("iptest must not be null");
    }
    URL url = new URL("http://wq.apnic.net/apnic-bin/whois.pl");
    URLConnection connection = url.openConnection();
    // Enabling output on the connection is what allows a request body to be written.
    connection.setDoOutput(true);

    // Form-encode the value so characters such as '&' or '=' cannot corrupt the body.
    String formBody = "searchtext="
            + java.net.URLEncoder.encode(iptest, "UTF-8")
            + "&form_type=advanced";
    try (OutputStreamWriter out = new OutputStreamWriter(
            connection.getOutputStream(), "8859_1")) {
        out.write(formBody);
        out.flush();
    }

    // Compiled once per call instead of once per response line.
    Pattern tagPattern = Pattern.compile("<(.[^>]*)>");
    try (BufferedReader l_reader = new BufferedReader(new InputStreamReader(
                 connection.getInputStream()));
         BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
                 new FileOutputStream("E://IpHTML.txt")))) {
        String sCurrentLine;
        while ((sCurrentLine = l_reader.readLine()) != null) {
            bufw.write(tagPattern.matcher(sCurrentLine).replaceAll("").trim());
            bufw.newLine();
        }
    }
    System.out.println("第一次过滤完毕,开始下一轮过滤");
    // The output file is closed at this point, so the next stage reads complete data.
    saveLastIP();
}
/**
 * Second filter stage: keeps only the whois lines that carry a wanted field.
 * <p>
 * Reads {@code E://IpHTML.txt} and copies every line containing
 * {@code inetnum:}, {@code netname:}, {@code descr:} or {@code country:} to
 * {@code E://Ip1.txt} with all whitespace removed. Each matching input line
 * is written once, even if it contains several keys or repeats one. Both
 * files are closed before {@link #saveLastIP2()} is called.
 *
 * @throws IOException if either file cannot be accessed, or if the next
 *         stage fails
 */
public static void saveLastIP() throws IOException {
    final String[] mail_regex1 = { "inetnum:", "netname:", "descr:", "country:" };
    // Removes every whitespace character; compiled once rather than per matching line.
    Pattern whitespace = Pattern.compile("\\s*|\t|\r|\n");
    try (BufferedReader bufr = new BufferedReader(new InputStreamReader(
                 new FileInputStream("E://IpHTML.txt")));
         BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
                 new FileOutputStream("E://Ip1.txt")))) {
        String line;
        while ((line = bufr.readLine()) != null) {
            boolean wanted = false;
            for (int i = 0; i < mail_regex1.length && !wanted; i++) {
                // The keys are plain text, so a substring test is equivalent to the regex search.
                wanted = line.contains(mail_regex1[i]);
            }
            if (wanted) {
                bufw.write(whitespace.matcher(line).replaceAll(""));
                bufw.newLine();
            }
        }
    }
    System.out.println("第二次过滤完毕,开始下一轮过滤");
    saveLastIP2();
}
/**
 * Third filter stage: turns the filtered whois lines into an {@code IpModel},
 * writes a one-line summary to {@code E://Ip2.txt}, and stores the model
 * through {@code MySql.insertIp}.
 * <p>
 * Each line of {@code E://Ip1.txt} is split on {@code :}; the first piece is
 * the key and the second piece the value:
 * <ul>
 * <li>{@code inetnum} - value split on {@code -} into start and end address;</li>
 * <li>{@code netname} - value split on {@code -} into province and attribution
 *     when it contains a dash after its first character, otherwise the whole
 *     value is the attribution and the province is empty;</li>
 * <li>{@code descr} - for the first four such lines the value becomes the
 *     operator, each one replacing the previous; a line with nothing after
 *     the colon leaves the operator unchanged;</li>
 * <li>{@code country} - value stored as the country.</li>
 * </ul>
 * Malformed lines are reported on standard error and skipped rather than
 * raising {@code ArrayIndexOutOfBoundsException}. Fields that were never
 * found are written as empty strings. When no address range was found, the
 * database insert is skipped.
 *
 * @throws IOException if either file cannot be accessed
 */
public static void saveLastIP2() throws IOException {
    IpModel ipModel = new IpModel();
    int descrSeen = 0;
    try (BufferedReader bufr = new BufferedReader(new InputStreamReader(
            new FileInputStream("E://Ip1.txt")))) {
        String line;
        while ((line = bufr.readLine()) != null) {
            String[] array = line.split("\\:");
            if (array.length == 0) {
                continue; // the line consisted only of ':' characters
            }
            String key = array[0];
            boolean hasValue = array.length > 1;
            String value = hasValue ? array[1] : "";
            if ("inetnum".equals(key)) {
                String[] range = value.split("\\-");
                if (range.length >= 2) {
                    ipModel.setStartIp(range[0]);
                    ipModel.setEndIp(range[1]);
                } else {
                    System.err.println("skipping malformed inetnum line: " + line);
                }
            } else if ("netname".equals(key)) {
                String[] parts = value.split("\\-");
                if (value.indexOf("-") > 0 && parts.length >= 2) {
                    ipModel.setProvince(parts[0]);
                    ipModel.setAttribution(parts[1]);
                } else {
                    ipModel.setProvince("");
                    ipModel.setAttribution(value);
                }
            } else if ("descr".equals(key)) {
                descrSeen++;
                // Only the first four descr lines are considered; later ones are ignored.
                if (descrSeen <= 4 && hasValue) {
                    ipModel.setOperator(StringUtil.isBlank(value) ? "" : value);
                }
            } else if ("country".equals(key)) {
                ipModel.setCountry(value);
            }
        }
    }

    try (BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream("E://Ip2.txt")))) {
        // Writer.write(String) throws NullPointerException on null, so map missing fields to "".
        bufw.write(nullToEmpty(ipModel.getCountry()));
        bufw.write(" ");
        bufw.write(nullToEmpty(ipModel.getAttribution()));
        bufw.write(" ");
        bufw.write(nullToEmpty(ipModel.getProvince()));
        bufw.write(" ");
        bufw.write(nullToEmpty(ipModel.getOperator()));
        bufw.write(" ");
        bufw.write(nullToEmpty(ipModel.getStartIp()));
        bufw.write(" ");
        bufw.write(nullToEmpty(ipModel.getEndIp()));
        bufw.newLine();
    }

    if (ipModel.getStartIp() == null || ipModel.getEndIp() == null) {
        System.err.println("no inetnum range found; record not stored");
        return;
    }

    // HH is the 24-hour field; the previous hh pattern repeated every 12 hours.
    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
    String ipId = getRandomString(14) + sdf.format(new Date());
    ipModel.setIpId(ipId);
    System.out.println("匹配完毕保存到数据库");
    MySql ipsqlSql = new MySql();
    ipsqlSql.insertIp(ipModel);
}

/** Returns {@code text}, or the empty string when {@code text} is null. */
private static String nullToEmpty(String text) {
    return (text == null) ? "" : text;
}
/**
 * Builds a random string drawn from the lowercase letters {@code a-z} and the
 * digits {@code 0-9}.
 *
 * @param length number of characters to generate; zero or a negative value
 *        yields an empty string
 * @return the generated string
 */
public static String getRandomString(int length) {
    final String alphabet = "abcdefghijklmnopqrstuvwxyz0123456789";
    final Random rng = new Random();
    StringBuilder result = new StringBuilder();
    int remaining = length;
    while (remaining-- > 0) {
        result.append(alphabet.charAt(rng.nextInt(alphabet.length())));
    }
    return result.toString();
}
}// ///
/**
 * Minimal JDBC helper for the {@code iptable} table.
 * <p>
 * The connection lives in a static field and is shared by every instance:
 * constructing a {@code MySql} opens a connection only when none is open yet,
 * so creating one helper per record no longer leaks a connection each time.
 * Statements and result sets are method-local and closed after use.
 * SQL errors are printed and not rethrown.
 */
class MySql {
    // Not assigned by this class; retained so existing references still compile.
    public static String username;
    public static String password;
    /** Shared connection; null when no connection could be established. */
    public static Connection connection;
    // No longer assigned: statements are now local to each method. Retained for compatibility.
    public static PreparedStatement ps;

    /**
     * Ensures the shared connection is open, loading the driver and
     * connecting on first use. Failures are printed to standard error and
     * leave {@link #connection} unset.
     */
    public MySql() {
        try {
            if (connection != null && !connection.isClosed()) {
                return; // reuse the connection opened by an earlier instance
            }
            String url = "jdbc:mysql://127.0.0.1:3306/ipselect?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull";
            // NOTE(review): credentials are hard-coded (root, empty password) - confirm this is intended.
            String dbUser = "root";
            String dbPassword = "";
            Class.forName("com.mysql.jdbc.Driver");
            connection = DriverManager.getConnection(url, dbUser, dbPassword);
        } catch (ClassNotFoundException cnfex) {
            System.err.println("装载 JDBC/ODBC 驱动程序失败");
            cnfex.printStackTrace();
        } catch (SQLException sqlex) {
            System.err.println("无法连接数据库");
            sqlex.printStackTrace();
        }
    }

    /** Reports whether a connection is available, printing an error when it is not. */
    private static boolean hasConnection() {
        if (connection == null) {
            System.err.println("无法连接数据库");
            return false;
        }
        return true;
    }

    /**
     * Inserts the record unless a row with the same start and end address
     * already exists.
     *
     * @param ipModel the record to store; its id, country, province, operator,
     *        attribution, start address and end address are written
     */
    public void insertIp(IpModel ipModel) {
        if (!hasConnection()) {
            return;
        }
        // Query through this instance; creating another MySql here is unnecessary.
        if (!findIp(ipModel).isEmpty()) {
            System.out.println("已存在有数据");
            return;
        }
        try (PreparedStatement insert = connection
                .prepareStatement("insert into iptable (ip_id,country,province,operator,attribution,startIp,endIp) values (?,?,?,?,?,?,?)")) {
            insert.setString(1, ipModel.getIpId());
            insert.setString(2, ipModel.getCountry());
            insert.setString(3, ipModel.getProvince());
            insert.setString(4, ipModel.getOperator());
            insert.setString(5, ipModel.getAttribution());
            insert.setString(6, ipModel.getStartIp());
            insert.setString(7, ipModel.getEndIp());
            insert.executeUpdate();
            System.out.println("记录插入成功");
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Updates every column of the row whose {@code ip_id} equals the model's id.
     *
     * @param ipModel the new column values, plus the id selecting the row
     */
    public void updateIp(IpModel ipModel) {
        if (!hasConnection()) {
            return;
        }
        // The SET list takes no parentheses, every column needs its own placeholder,
        // and the id is bound last to match the WHERE clause.
        try (PreparedStatement update = connection
                .prepareStatement("update iptable set country=?,province=?,operator=?,attribution=?,startIp=?,endIp=? where ip_id=?")) {
            update.setString(1, ipModel.getCountry());
            update.setString(2, ipModel.getProvince());
            update.setString(3, ipModel.getOperator());
            update.setString(4, ipModel.getAttribution());
            update.setString(5, ipModel.getStartIp());
            update.setString(6, ipModel.getEndIp());
            update.setString(7, ipModel.getIpId());
            update.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Finds the rows whose start and end address equal those of the given model.
     *
     * @param ipModel supplies the start and end address to search for
     * @return one model per matching row with id, start address and end
     *         address filled in; empty when nothing matches, when no
     *         connection is available, or when the query fails
     */
    public List<IpModel> findIp(IpModel ipModel) {
        List<IpModel> list = new ArrayList<IpModel>();
        if (!hasConnection()) {
            return list;
        }
        try (PreparedStatement query = connection
                .prepareStatement("select * from iptable where startIp=? and endIp=?")) {
            query.setString(1, ipModel.getStartIp());
            query.setString(2, ipModel.getEndIp());
            try (ResultSet rs = query.executeQuery()) {
                while (rs.next()) {
                    // A fresh object per row; each column goes to its own property.
                    IpModel row = new IpModel();
                    row.setIpId(rs.getString("ip_id"));
                    row.setStartIp(rs.getString("startIp"));
                    row.setEndIp(rs.getString("endIp"));
                    list.add(row);
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return list;
    }
}
package com.htjf.ip;
/**
 * Plain data holder for one IP range record and its descriptive attributes.
 * All properties are strings and are {@code null} until set.
 *
 * @author Qixuan
 */
public class IpModel {

    /** Record identifier. */
    private String ipId;
    /** Country or region. */
    private String country;
    /** Province. */
    private String province;
    /** Network operator. */
    private String operator;
    /** Attribution (place the range is attributed to). */
    private String attribution;
    /** First address of the range. */
    private String startIp;
    /** Last address of the range. */
    private String endIp;

    /** @return the record identifier */
    public String getIpId() {
        return this.ipId;
    }

    /** @param ipId the record identifier */
    public void setIpId(String ipId) {
        this.ipId = ipId;
    }

    /** @return the country or region */
    public String getCountry() {
        return this.country;
    }

    /** @param country the country or region */
    public void setCountry(String country) {
        this.country = country;
    }

    /** @return the province */
    public String getProvince() {
        return this.province;
    }

    /** @param province the province */
    public void setProvince(String province) {
        this.province = province;
    }

    /** @return the network operator */
    public String getOperator() {
        return this.operator;
    }

    /** @param operator the network operator */
    public void setOperator(String operator) {
        this.operator = operator;
    }

    /** @return the attribution */
    public String getAttribution() {
        return this.attribution;
    }

    /** @param attribution the attribution */
    public void setAttribution(String attribution) {
        this.attribution = attribution;
    }

    /** @return the first address of the range */
    public String getStartIp() {
        return this.startIp;
    }

    /** @param startIp the first address of the range */
    public void setStartIp(String startIp) {
        this.startIp = startIp;
    }

    /** @return the last address of the range */
    public String getEndIp() {
        return this.endIp;
    }

    /** @param endIp the last address of the range */
    public void setEndIp(String endIp) {
        this.endIp = endIp;
    }
}