抓取IP数据(一)

1、工作的背景及意义:

由于经常要查找IP的运营商等相关属性,数据量不多时手动查找还可以接受,但当数据量达到几百、几千条时,手动查找就非常费力了。这种机械性的工作最适合交给计算机处理,没有必要人工逐条查找。

2、ip抓取第一阶段需求:

从链接 http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest 抓取含有CN、ipv4的IP数据,再逐条发送请求到数据查询中心 http://wq.apnic.net/apnic-bin/whois.pl,获取返回的数据,从中提取IP相关属性并保存。

3、程序文档分析:

3.1 抓取上述链接(http://ftp.apnic.net/……)下txt文本中含有"CN|ipv4"的数据行,将抓取到的数据存进HTJF.txt。【通过程序中的getMail()和savetxt()方法完成】

3.2 发送请求到IP查询网站,并接收返回的html文本(通过程序中的readtxt()和testPost()方法完成)。

3.3对接收的文本进行解析、过滤

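第一次过滤(testPost方法):去掉返回文本中的HTML标签,过滤完毕存进IpHTML.txt

第二次过滤(saveLastIP方法):只保留含有inetnum、netname、descr、country的行,并去掉其中的空白字符,过滤完毕存进Ip1.txt

第三次过滤(saveLastIP2方法):按字段解析出起止IP、归属地、运营商、国家等属性,过滤完毕存进Ip2.txt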

最后入库:把IP的相关属性封装成对象,存入数据库前先查询该IP段是否已经存在:

存在:不执行

不存在:执行JDBC操作

4、程序性能描述:

从7万多条数据中筛选出3千多条数据,

全程跑完历时:50分钟。期间抛出一次异常。


代码如下:

package com.htjf.ip;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;

/**
 * @author Qixuan
 * 
 */
public class IpDemo {

	/**
	 * Program entry point.
	 * <p>
	 * Prints a banner for each of the three pipeline stages, but only the last
	 * stage (sending the queries and filtering the answers through
	 * {@code readtxt()}) is executed here; the crawl and save stages are
	 * switched off. To run them as well, call {@code savetxt(getMail())} before
	 * {@code readtxt()}.
	 * 
	 * @param args
	 *            command-line arguments (not used)
	 * @throws IOException
	 *             declared on the signature; every exception raised by the
	 *             pipeline is caught and printed below
	 */
	public static void main(String[] args) throws IOException {
		// Banners for: crawl, save, send requests.
		final String[] stageBanners = { "爬取", "保存", "发送请求" };
		try {
			for (String banner : stageBanners) {
				System.out.println(banner);
			}
			// Sends one whois query per stored IP and runs all filter rules.
			readtxt();
		} catch (Exception failure) {
			failure.printStackTrace();
		}
	}

	/**
	 * Downloads the APNIC delegation list and collects every line that
	 * contains the marker {@code CN|ipv4}.
	 * <p>
	 * The whole matching line is kept (not just the matched marker), and each
	 * line is added at most once. The network stream is closed before the
	 * method returns, whether or not reading succeeded.
	 * 
	 * @return the matching lines in the order they appear in the download;
	 *         empty if nothing matched
	 * @throws Exception
	 *             if the URL cannot be opened or reading from it fails
	 */
	public static List<String> getMail() throws Exception {
		URL url = new URL(
				"http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest");
		// Compiled once, outside the read loop.
		Pattern p = Pattern.compile("CN\\|ipv4");
		List<String> list = new ArrayList<String>();

		BufferedReader br = new BufferedReader(new InputStreamReader(
				url.openStream()));
		try {
			String line;
			while ((line = br.readLine()) != null) {
				// "if" rather than "while": one entry per line, however often
				// the marker occurs in it.
				if (p.matcher(line).find()) {
					list.add(line);
				}
			}
		} finally {
			br.close();
		}
		return list;
	}

	/**
	 * Writes the crawled records to {@code E://HTJF.txt}, one record per line.
	 * <p>
	 * Each input record is split on {@code |}; the fields at index 3, 2 and 1
	 * are written in that order, separated by commas. Records with fewer than
	 * four fields are reported on stderr and skipped instead of aborting the
	 * whole export. A {@code null} list is treated as an empty list. The writer
	 * is always closed, even when a write fails.
	 * 
	 * @param listarray
	 *            the records to store; may be {@code null} or empty
	 * @throws FileNotFoundException
	 *             if the output file cannot be created or opened
	 */
	public static void savetxt(List<String> listarray)
			throws FileNotFoundException {
		List<String> records = (listarray == null) ? new ArrayList<String>()
				: listarray;
		BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream("E://HTJF.txt")));

		try {
			System.out.println("有多少条记录:" + records.size());
			System.out.println("爬到的资源");

			for (String mail : records) {
				System.out.println("====>" + mail);
				String[] fields = (mail == null) ? new String[0] : mail
						.split("\\|");
				if (fields.length < 4) {
					// One malformed record must not cost us all the others.
					System.err.println("跳过格式不正确的记录:" + mail);
					continue;
				}

				bufw.write(fields[3]);
				bufw.write(",");
				bufw.write(fields[2]);
				bufw.write(",");
				bufw.write(fields[1]);
				bufw.newLine();
			}
		} catch (IOException e) {
			// Best effort, as before: report the failure and return.
			e.printStackTrace();
		} finally {
			try {
				// close() also flushes, so no per-record flush is needed.
				bufw.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Reads {@code E://HTJF.txt} line by line and submits the first
	 * comma-separated field of every line to {@code testPost}.
	 * <p>
	 * Lines whose first field is empty (blank lines, or lines starting with a
	 * comma) are skipped so that no empty query is sent. The input file is
	 * closed when the method returns or fails.
	 * 
	 * @throws IOException
	 *             if the file cannot be read, or if {@code testPost} fails for
	 *             an address; the first failure stops the run
	 */
	public static void readtxt() throws IOException {
		BufferedReader bufr = new BufferedReader(new InputStreamReader(
				new FileInputStream("E://HTJF.txt")));
		try {
			String line;
			while ((line = bufr.readLine()) != null) {
				// Take everything before the first comma. Done with indexOf
				// because split() returns an empty array for a line like ",".
				int comma = line.indexOf(',');
				String searchtextIp = (comma < 0 ? line : line.substring(0,
						comma)).trim();
				if (searchtextIp.length() == 0) {
					continue;
				}
				System.out.println("ip地址:" + searchtextIp);
				// Sends the query and runs the filter chain for this address.
				testPost(searchtextIp);
			}
		} finally {
			bufr.close();
		}
	}

	/**
	 * Sends one whois query as an HTTP POST and stores the tag-stripped answer.
	 * <p>
	 * Steps:
	 * <ol>
	 * <li>POST {@code searchtext=<iptest>&form_type=advanced} to
	 * {@code http://wq.apnic.net/apnic-bin/whois.pl};</li>
	 * <li>read the response line by line, remove everything matching the tag
	 * pattern, trim the line and write it to {@code E://IpHTML.txt}
	 * (first filter pass);</li>
	 * <li>call {@code saveLastIP()} for the second filter pass.</li>
	 * </ol>
	 * All streams are closed even when a step fails. Connect and read timeouts
	 * are set so that an unresponsive server raises an {@link IOException}
	 * instead of blocking the run forever.
	 * 
	 * @param iptest
	 *            the IP address to look up; sent as-is in the request body
	 * @throws IOException
	 *             if the request, the response or the output file fails
	 */
	public static void testPost(String iptest) throws IOException {
		URL url = new URL("http://wq.apnic.net/apnic-bin/whois.pl");
		URLConnection connection = url.openConnection();
		// Milliseconds. Without these a stalled server blocks indefinitely.
		connection.setConnectTimeout(30000);
		connection.setReadTimeout(30000);
		// Output mode is what turns this into a POST with a request body.
		connection.setDoOutput(true);

		OutputStreamWriter out = new OutputStreamWriter(
				connection.getOutputStream(), "8859_1");
		try {
			out.write("searchtext=" + iptest + "&form_type=advanced");
			out.flush();
		} finally {
			out.close();
		}

		// Tag-stripping rule, compiled once instead of once per response line.
		Pattern tagPattern = Pattern.compile("<(.[^>]*)>");

		BufferedReader l_reader = new BufferedReader(new InputStreamReader(
				connection.getInputStream()));
		try {
			BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream("E://IpHTML.txt")));
			try {
				String sCurrentLine;
				while ((sCurrentLine = l_reader.readLine()) != null) {
					bufw.write(tagPattern.matcher(sCurrentLine).replaceAll("")
							.trim());
					bufw.newLine();
				}
			} finally {
				// close() flushes the buffered output.
				bufw.close();
			}
		} finally {
			l_reader.close();
		}

		System.out.println("第一次过滤完毕,开始下一轮过滤");
		saveLastIP();
	}

	/**
	 * Second filter pass: copies the relevant lines of {@code E://IpHTML.txt}
	 * to {@code E://Ip1.txt}.
	 * <p>
	 * A line is relevant when it contains one of the labels {@code inetnum:},
	 * {@code netname:}, {@code descr:} or {@code country:}. All whitespace is
	 * removed from a relevant line before it is written, and each relevant
	 * line is written exactly once. Both files are closed even if copying
	 * fails. Afterwards {@code saveLastIP2()} is called for the third pass.
	 * 
	 * @throws IOException
	 *             if either file cannot be accessed, or if the third pass
	 *             fails
	 */
	public static void saveLastIP() throws IOException {
		// Plain-text labels, so a substring test is enough.
		String[] labels = { "inetnum:", "netname:", "descr:", "country:" };
		// Compiled once; removes every run of whitespace.
		Pattern whitespace = Pattern.compile("\\s+");

		BufferedReader bufr = new BufferedReader(new InputStreamReader(
				new FileInputStream("E://IpHTML.txt")));
		try {
			BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream("E://Ip1.txt")));
			try {
				String line;
				while ((line = bufr.readLine()) != null) {
					boolean wanted = false;
					for (int i = 0; i < labels.length && !wanted; i++) {
						wanted = line.contains(labels[i]);
					}
					if (wanted) {
						bufw.write(whitespace.matcher(line).replaceAll(""));
						bufw.newLine();
					}
				}
			} finally {
				// close() flushes the buffered output.
				bufw.close();
			}
		} finally {
			bufr.close();
		}

		System.out.println("第二次过滤完毕,开始下一轮过滤");
		saveLastIP2();
	}

	/**
	 * Third filter pass: parses {@code E://Ip1.txt} into an {@code IpModel},
	 * writes a one-line summary to {@code E://Ip2.txt} and stores the model in
	 * the database.
	 * <p>
	 * Every input line is split on {@code :}; the first part is the field name
	 * and the second part its value:
	 * <ul>
	 * <li>{@code inetnum}: the value is split on {@code -} into start and end
	 * IP;</li>
	 * <li>{@code netname}: when it contains a {@code -} after the first
	 * character, the part before it becomes the province and the part after it
	 * the attribution; otherwise the province is empty and the whole value is
	 * the attribution;</li>
	 * <li>{@code descr}: only the first four {@code descr} lines are
	 * considered, and each one that has a value overwrites the operator;</li>
	 * <li>{@code country}: the value becomes the country.</li>
	 * </ul>
	 * Lines without a value are ignored rather than raising an
	 * {@code ArrayIndexOutOfBoundsException}. Fields that were never found are
	 * written as empty strings. If no IP range was found, nothing is inserted
	 * into the database.
	 * 
	 * @throws IOException
	 *             if either file cannot be read or written
	 */
	public static void saveLastIP2() throws IOException {
		IpModel ipModel = new IpModel();
		// 1-based position of the next "descr" line.
		int k = 1;

		BufferedReader bufr = new BufferedReader(new InputStreamReader(
				new FileInputStream("E://Ip1.txt")));
		try {
			String line;
			while ((line = bufr.readLine()) != null) {
				String[] array = line.split("\\:");
				String key = array.length > 0 ? array[0] : "";
				String value = array.length > 1 ? array[1] : null;

				if ("descr".equals(key)) {
					// Positions 1 to 4 were handled by four identical
					// branches; later descr lines are ignored.
					if (k <= 4 && value != null) {
						ipModel.setOperator(StringUtil.isBlank(value) ? ""
								: value);
					}
					k++;
				} else if (value == null) {
					// Label without a value: nothing to record.
					continue;
				} else if ("inetnum".equals(key)) {
					String[] range = value.split("\\-");
					if (range.length >= 2) {
						ipModel.setStartIp(range[0]);
						ipModel.setEndIp(range[1]);
					}
				} else if ("netname".equals(key)) {
					String[] parts = value.split("\\-");
					if (value.indexOf("-") > 0 && parts.length >= 2) {
						ipModel.setProvince(parts[0]);
						ipModel.setAttribution(parts[1]);
					} else {
						ipModel.setProvince("");
						ipModel.setAttribution(value);
					}
				} else if ("country".equals(key)) {
					ipModel.setCountry(value);
				}
			}
		} finally {
			bufr.close();
		}

		// Summary line: the six fields separated by two spaces.
		String[] columns = { ipModel.getCountry(), ipModel.getAttribution(),
				ipModel.getProvince(), ipModel.getOperator(),
				ipModel.getStartIp(), ipModel.getEndIp() };
		BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream("E://Ip2.txt")));
		try {
			for (int i = 0; i < columns.length; i++) {
				if (i > 0) {
					bufw.write("  ");
				}
				// BufferedWriter.write(null) throws, so map null to "".
				bufw.write(columns[i] == null ? "" : columns[i]);
			}
			bufw.newLine();
		} finally {
			bufw.close();
		}

		if (ipModel.getStartIp() == null || ipModel.getEndIp() == null) {
			// The response carried no usable inetnum line; a row without an
			// IP range would be meaningless.
			System.out.println("未解析到IP段,跳过入库");
			return;
		}

		// "HH" is the 24-hour clock; "hh" would repeat every twelve hours.
		SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
		ipModel.setIpId(getRandomString(14) + sdf.format(new Date()));

		System.out.println("匹配完毕保存到数据库");
		MySql ipsqlSql = new MySql();
		try {
			ipsqlSql.insertIp(ipModel);
		} finally {
			// The MySql constructor opens a connection and stores it in the
			// static field; release it so that thousands of calls do not
			// exhaust the server's connection limit.
			if (MySql.connection != null) {
				try {
					MySql.connection.close();
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Builds a random string from lower-case letters and digits.
	 * 
	 * @param length
	 *            number of characters to generate; zero or a negative value
	 *            yields an empty string
	 * @return the generated string
	 */
	public static String getRandomString(int length) {
		final String alphabet = "abcdefghijklmnopqrstuvwxyz0123456789";
		final Random picker = new Random();
		final StringBuilder result = new StringBuilder();
		int remaining = length;
		while (remaining > 0) {
			int index = picker.nextInt(alphabet.length());
			result.append(alphabet.charAt(index));
			remaining--;
		}
		return result.toString();
	}

}// ///

/**
 * Small JDBC helper for the {@code iptable} table of the local
 * {@code ipselect} MySQL database.
 * <p>
 * The connection is held in a static field and therefore shared by all
 * instances. Creating a new instance reuses that connection while it is still
 * open and reconnects otherwise.
 */
class MySql {

	/** User name used for the most recent connection attempt. */
	public static String username;
	/** Password used for the most recent connection attempt. */
	public static String password;
	/** Connection shared by all instances; {@code null} if connecting failed. */
	public static Connection connection;
	/**
	 * Retained so that existing references still compile. The methods of this
	 * class now use local statements, which they close themselves.
	 */
	public static PreparedStatement ps;

	/**
	 * Ensures that a database connection is available.
	 * <p>
	 * Failures are reported on stderr and leave {@link #connection} unusable;
	 * the data methods then throw {@link IllegalStateException}.
	 */
	public MySql() {

		String url = "jdbc:mysql://127.0.0.1:3306/ipselect?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull";
		// Assign the static fields; local variables of the same name used to
		// shadow them, leaving the fields null.
		username = "root";
		password = "";
		try {
			// Reuse an open connection instead of opening (and leaking) a
			// new one for every instance.
			if (connection != null && !connection.isClosed()) {
				return;
			}
			Class.forName("com.mysql.jdbc.Driver");
			connection = DriverManager.getConnection(url, username, password);
		}
		// Driver class not on the classpath.
		catch (ClassNotFoundException cnfex) {
			System.err.println("装载 JDBC/ODBC 驱动程序失败");
			cnfex.printStackTrace();
		}
		// Connecting to the database failed.
		catch (SQLException sqlex) {
			System.err.println("无法连接数据库");
			sqlex.printStackTrace();
		}

	}

	/**
	 * Inserts the given IP range unless a row with the same start and end IP
	 * already exists.
	 * 
	 * @param ipModel
	 *            the record to store; its id, country, province, operator,
	 *            attribution, start IP and end IP are written
	 * @throws IllegalStateException
	 *             if no database connection is available
	 */
	public void insertIp(IpModel ipModel) {
		// Look up through this instance; no second helper is needed.
		if (!findIp(ipModel).isEmpty()) {
			System.out.println("已存在有数据");
			return;
		}

		PreparedStatement stmt = null;
		try {
			stmt = requireConnection()
					.prepareStatement(
							"insert into iptable (ip_id,country,province,operator,attribution,startIp,endIp) values (?,?,?,?,?,?,?)");
			stmt.setString(1, ipModel.getIpId());
			stmt.setString(2, ipModel.getCountry());
			stmt.setString(3, ipModel.getProvince());
			stmt.setString(4, ipModel.getOperator());
			stmt.setString(5, ipModel.getAttribution());
			stmt.setString(6, ipModel.getStartIp());
			stmt.setString(7, ipModel.getEndIp());
			stmt.executeUpdate();
			System.out.println("记录插入成功");
		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(stmt);
		}
	}

	/**
	 * Updates all data columns of the row whose {@code ip_id} equals the id of
	 * the given record.
	 * 
	 * @param ipModel
	 *            the new values, together with the id of the row to change
	 * @throws IllegalStateException
	 *             if no database connection is available
	 */
	public void updateIp(IpModel ipModel) {
		PreparedStatement stmt = null;
		try {
			stmt = requireConnection()
					.prepareStatement(
							"update iptable set country=?,province=?,operator=?,attribution=?,startIp=?,endIp=? where ip_id=?");
			// Parameter order follows the placeholders: six columns, then
			// the id used in the WHERE clause.
			stmt.setString(1, ipModel.getCountry());
			stmt.setString(2, ipModel.getProvince());
			stmt.setString(3, ipModel.getOperator());
			stmt.setString(4, ipModel.getAttribution());
			stmt.setString(5, ipModel.getStartIp());
			stmt.setString(6, ipModel.getEndIp());
			stmt.setString(7, ipModel.getIpId());
			stmt.executeUpdate();
		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(stmt);
		}
	}

	/**
	 * Finds the rows whose start and end IP equal those of the given record.
	 * 
	 * @param ipModel
	 *            supplies the start and end IP to search for
	 * @return one model per matching row, carrying id, start IP and end IP;
	 *         empty if nothing matches or the query fails
	 * @throws IllegalStateException
	 *             if no database connection is available
	 */
	public List<IpModel> findIp(IpModel ipModel) {
		List<IpModel> list = new ArrayList<IpModel>();
		PreparedStatement stmt = null;
		ResultSet rs = null;
		try {
			stmt = requireConnection().prepareStatement(
					"select * from iptable where startIp=? and endIp=?");
			stmt.setString(1, ipModel.getStartIp());
			stmt.setString(2, ipModel.getEndIp());

			rs = stmt.executeQuery();
			while (rs.next()) {
				// A fresh object per row; otherwise every list entry would
				// point at the same instance.
				IpModel row = new IpModel();
				row.setIpId(rs.getString("ip_id"));
				row.setStartIp(rs.getString("startIp"));
				row.setEndIp(rs.getString("endIp"));
				list.add(row);
			}
		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(rs);
			closeQuietly(stmt);
		}

		return list;
	}

	/** Returns the shared connection or fails with a clear message. */
	private static Connection requireConnection() {
		if (connection == null) {
			throw new IllegalStateException(
					"No database connection available; see the earlier connection error");
		}
		return connection;
	}

	/** Closes a statement, reporting but not propagating close failures. */
	private static void closeQuietly(PreparedStatement statement) {
		if (statement != null) {
			try {
				statement.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}

	/** Closes a result set, reporting but not propagating close failures. */
	private static void closeQuietly(ResultSet resultSet) {
		if (resultSet != null) {
			try {
				resultSet.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}

}


ip属性:

package com.htjf.ip;

/**
 * Plain data holder for one IP range and the attributes looked up for it.
 * All properties are optional strings and start out as {@code null}.
 * 
 * @author Qixuan
 */
public class IpModel {

	/** Identifier of the record. */
	private String ipId;

	/** Country or region. */
	private String country;

	/** Province. */
	private String province;

	/** Network operator. */
	private String operator;

	/** Attribution (location the range belongs to). */
	private String attribution;

	/** First IP of the range. */
	private String startIp;

	/** Last IP of the range. */
	private String endIp;

	// ---- read access ----

	public String getIpId() {
		return this.ipId;
	}

	public String getCountry() {
		return this.country;
	}

	public String getProvince() {
		return this.province;
	}

	public String getOperator() {
		return this.operator;
	}

	public String getAttribution() {
		return this.attribution;
	}

	public String getStartIp() {
		return this.startIp;
	}

	public String getEndIp() {
		return this.endIp;
	}

	// ---- write access ----

	public void setIpId(String ipId) {
		this.ipId = ipId;
	}

	public void setCountry(String country) {
		this.country = country;
	}

	public void setProvince(String province) {
		this.province = province;
	}

	public void setOperator(String operator) {
		this.operator = operator;
	}

	public void setAttribution(String attribution) {
		this.attribution = attribution;
	}

	public void setStartIp(String startIp) {
		this.startIp = startIp;
	}

	public void setEndIp(String endIp) {
		this.endIp = endIp;
	}

}



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值