抓取解析ip并入库

[list]
[*]以下代码用于抓取网页并解析其中的IP地址段(解析结果可用于入库),代码仅供参考,未做异常处理等完善工作。
/**
 * Downloads the page at a fixed URL, concatenates its lines into a single
 * string and passes that string to {@code parseIp}.
 *
 * <p>Lines are appended without separators, so the parsed text contains no
 * line breaks. If the download fails, the failure is reported on
 * {@code System.err} and whatever was read so far (possibly nothing) is
 * still parsed.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String crawlUrl = "http://www.cnblogs.com/xioxu/archive/2009/05/03/1448322.html";
    StringBuilder sb = new StringBuilder();

    try {
        URL instance = new URL(crawlUrl);
        URLConnection con = instance.openConnection();
        // NOTE(review): decodes as UTF-8 on the assumption that the target page
        // is served in UTF-8 -- confirm. The platform default charset would
        // garble non-ASCII text on systems whose default is not UTF-8.
        BufferedReader bufferReader =
                new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        try {
            String readLine;
            while ((readLine = bufferReader.readLine()) != null) {
                // cache in memory
                sb.append(readLine);
            }
        } finally {
            // Always release the underlying connection stream.
            bufferReader.close();
        }
    } catch (Exception ex) {
        // Best effort: report the failure instead of hiding it, then continue
        // with whatever content was collected.
        System.err.println("Failed to fetch " + crawlUrl + ": " + ex);
    }

    parseIp(sb.toString());
}

/**
 * Scans {@code html} for records of the form
 * {@code <ip>__<ip>__<text>__<text>}, prints each raw record, converts the
 * records with {@code toIpList} and prints every resulting {@code Ip}.
 *
 * <p>Each IP is four dot-separated digit runs; the two trailing fields are
 * runs of at least two non-ASCII-letter characters. Any {@code <} or
 * {@code /} characters inside a matched record are removed before it is
 * stored.
 *
 * <p>NOTE(review): the pattern requires at least two digits in every octet,
 * so an address such as {@code 10.0.0.1} would not match -- confirm this is
 * intended for the page being parsed.
 *
 * @param html the text to scan
 */
private static void parseIp(String html){
    Matcher recordMatcher = Pattern
            .compile("((\\d{2,}\\.){3}\\d{2,})\\__((\\d{2,}\\.){3}\\d{2,})\\__([^a-zA-Z]{2,})\\__([^a-zA-Z]{2,})")
            .matcher(html);

    List<String> records = new ArrayList<String>();
    while (recordMatcher.find()) {
        // Drop stray tag characters picked up by the non-letter fields.
        String record = recordMatcher.group().replaceAll("[\\<\\/]", "");
        records.add(record);
        System.out.println(record);
    }

    List<Ip> parsed = toIpList(records);
    for (int i = 0; i < parsed.size(); i++) {
        System.out.println(parsed.get(i).toString());
    }
}

/**
 * Converts each record string into an {@code Ip}.
 *
 * <p>Every record is split with {@code toIpArray}; elements 0 to 3 of the
 * resulting array are assigned to the start IP, end IP, province and route
 * type of a new {@code Ip}, in that order.
 *
 * @param list record strings to convert
 * @return a new list with one {@code Ip} per input record, in input order
 */
private static List<Ip> toIpList(List<String> list){
    List<Ip> result = new ArrayList<Ip>();

    for (int i = 0; i < list.size(); i++) {
        String[] fields = toIpArray(list.get(i));

        Ip entry = new Ip();
        entry.setStartIp(fields[0]);
        entry.setEndIp(fields[1]);
        entry.setProvince(fields[2]);
        entry.setRouteType(fields[3]);

        result.add(entry);
    }

    return result;
}

/**
 * Splits {@code line} on the {@code "__"} separator into a fixed-size array.
 *
 * <p>The returned array always has five slots. Fields are stored from index
 * 0 upward and unused slots stay {@code null}. The text after the last
 * separator is stored as a field too (previously it was discarded, which
 * left the final field of every record empty). If the line holds more than
 * five fields, the last slot receives the whole unsplit remainder instead of
 * overflowing the array.
 *
 * @param line the record to split; must not be {@code null}
 * @return a five-element array holding the fields of {@code line}
 */
private static String[] toIpArray(String line){
    final String separator = "__";
    String[] fields = new String[5];
    int idx = 0;
    int start = 0;
    int pos;

    // Stop splitting one slot early so the remainder always has a place to go.
    while (idx < fields.length - 1 && (pos = line.indexOf(separator, start)) > -1) {
        fields[idx++] = line.substring(start, pos);
        start = pos + separator.length();
    }

    // Keep the trailing segment: it is the last field of the record.
    fields[idx] = line.substring(start);

    return fields;
}
[/list]
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值