[list]
[*]以下代码用于抓取 IP 地址并入库分析;代码仅供参考,并未做任何处理……
/**
 * Downloads the page at a fixed URL, concatenates its lines into a single
 * string and hands that string to {@code parseIp}.
 *
 * <p>A download failure is reported on standard error; whatever text was read
 * before the failure (possibly none) is still passed to {@code parseIp}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String crawl_url = "http://www.cnblogs.com/xioxu/archive/2009/05/03/1448322.html";
    StringBuilder sb = new StringBuilder();
    BufferedReader bufferReader = null;
    try{
        URL instance = new URL(crawl_url);
        URLConnection con = instance.openConnection();
        // NOTE(review): no charset is given, so the platform default decodes the
        // page - confirm this matches the encoding the server actually sends.
        bufferReader = new BufferedReader(new InputStreamReader(con.getInputStream()));
        String readLine;
        while((readLine = bufferReader.readLine()) != null){
            // cache in memory (readLine() strips the line terminator, so the
            // lines are joined back-to-back)
            sb.append(readLine);
        }
    }catch(Exception ex){
        // Previously swallowed silently; surface the failure so an empty result
        // can be told apart from a page that simply had no matches.
        System.err.println("Failed to download " + crawl_url + ": " + ex);
    }finally{
        if(bufferReader != null){
            try{
                bufferReader.close();
            }catch(Exception ignored){
                // Nothing useful can be done if closing the reader fails.
            }
        }
    }
    parseIp(sb.toString());
}
/**
 * Finds every IP-range record in the given text, prints each raw record,
 * converts the records with {@code toIpList} and prints each resulting
 * {@code Ip}.
 *
 * <p>A record is two dotted groups of four numeric parts followed by two runs
 * of at least two characters that are not ASCII letters, all joined by
 * {@code "__"}.
 *
 * @param html text to scan
 */
private static void parseIp(String html){
    // NOTE(review): \d{2,} demands at least two digits per octet, so an address
    // containing a single-digit octet (e.g. "58.14.0.0") is not matched -
    // confirm this is intended for the page's format.
    Matcher hits = Pattern.compile(
            "((\\d{2,}\\.){3}\\d{2,})\\__((\\d{2,}\\.){3}\\d{2,})\\__([^a-zA-Z]{2,})\\__([^a-zA-Z]{2,})")
            .matcher(html);
    List<String> records = new ArrayList<String>();
    while(hits.find()){
        // The last group accepts any non-letter, so '<' and '/' can end up in
        // the match (presumably the start of a following HTML tag); drop them.
        String record = hits.group().replace("<", "").replace("/", "");
        records.add(record);
        System.out.println(record);
    }
    List<Ip> parsed = toIpList(records);
    for(int i = 0; i < parsed.size(); i++){
        System.out.println(parsed.get(i).toString());
    }
}
/**
 * Builds one {@code Ip} per input record.
 *
 * <p>Each record is split with {@code toIpArray}; slots 0 to 3 of the result
 * are assigned to the start IP, end IP, province and route type respectively.
 *
 * @param list records to convert
 * @return a new list holding one {@code Ip} per record, in input order
 */
private static List<Ip> toIpList(List<String> list){
    List<Ip> converted = new ArrayList<Ip>();
    for(int i = 0; i < list.size(); i++){
        Ip entry = new Ip();
        String[] fields = toIpArray(list.get(i));
        entry.setStartIp(fields[0]);
        entry.setEndIp(fields[1]);
        entry.setProvince(fields[2]);
        entry.setRouteType(fields[3]);
        converted.add(entry);
    }
    return converted;
}
/**
 * Splits a record on the {@code "__"} delimiter.
 *
 * <p>The text after the last delimiter is stored as a field too; the previous
 * loop only stored text that preceded a delimiter, so the final field of
 * {@code start__end__province__routeType} was always lost. Writing stops once
 * the array is full, so a record with extra delimiters no longer overruns it.
 *
 * @param line record text; must not be {@code null}
 * @return an array of length 5 holding the fields in order; slots beyond the
 *         number of fields found are {@code null}, and fields beyond the fifth
 *         are discarded
 */
private static String[] toIpArray(String line){
    final String delimiter = "__";
    String[] toArray = new String[5];
    int idx = 0;
    int from = 0;
    while(idx < toArray.length){
        int pos = line.indexOf(delimiter, from);
        if(pos < 0){
            // No delimiter left: the remainder is the last field.
            toArray[idx++] = line.substring(from);
            break;
        }
        toArray[idx++] = line.substring(from, pos);
        from = pos + delimiter.length();
    }
    return toArray;
}
[/list]
[*]以下代码用于抓取 IP 地址并入库分析;代码仅供参考,并未做任何处理……
/**
 * Downloads the page at a fixed URL, concatenates its lines into a single
 * string and hands that string to {@code parseIp}.
 *
 * <p>A download failure is reported on standard error; whatever text was read
 * before the failure (possibly none) is still passed to {@code parseIp}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String crawl_url = "http://www.cnblogs.com/xioxu/archive/2009/05/03/1448322.html";
    StringBuilder sb = new StringBuilder();
    BufferedReader bufferReader = null;
    try{
        URL instance = new URL(crawl_url);
        URLConnection con = instance.openConnection();
        // NOTE(review): no charset is given, so the platform default decodes the
        // page - confirm this matches the encoding the server actually sends.
        bufferReader = new BufferedReader(new InputStreamReader(con.getInputStream()));
        String readLine;
        while((readLine = bufferReader.readLine()) != null){
            // cache in memory (readLine() strips the line terminator, so the
            // lines are joined back-to-back)
            sb.append(readLine);
        }
    }catch(Exception ex){
        // Previously swallowed silently; surface the failure so an empty result
        // can be told apart from a page that simply had no matches.
        System.err.println("Failed to download " + crawl_url + ": " + ex);
    }finally{
        if(bufferReader != null){
            try{
                bufferReader.close();
            }catch(Exception ignored){
                // Nothing useful can be done if closing the reader fails.
            }
        }
    }
    parseIp(sb.toString());
}
/**
 * Finds every IP-range record in the given text, prints each raw record,
 * converts the records with {@code toIpList} and prints each resulting
 * {@code Ip}.
 *
 * <p>A record is two dotted groups of four numeric parts followed by two runs
 * of at least two characters that are not ASCII letters, all joined by
 * {@code "__"}.
 *
 * @param html text to scan
 */
private static void parseIp(String html){
    // NOTE(review): \d{2,} demands at least two digits per octet, so an address
    // containing a single-digit octet (e.g. "58.14.0.0") is not matched -
    // confirm this is intended for the page's format.
    Matcher hits = Pattern.compile(
            "((\\d{2,}\\.){3}\\d{2,})\\__((\\d{2,}\\.){3}\\d{2,})\\__([^a-zA-Z]{2,})\\__([^a-zA-Z]{2,})")
            .matcher(html);
    List<String> records = new ArrayList<String>();
    while(hits.find()){
        // The last group accepts any non-letter, so '<' and '/' can end up in
        // the match (presumably the start of a following HTML tag); drop them.
        String record = hits.group().replace("<", "").replace("/", "");
        records.add(record);
        System.out.println(record);
    }
    List<Ip> parsed = toIpList(records);
    for(int i = 0; i < parsed.size(); i++){
        System.out.println(parsed.get(i).toString());
    }
}
/**
 * Builds one {@code Ip} per input record.
 *
 * <p>Each record is split with {@code toIpArray}; slots 0 to 3 of the result
 * are assigned to the start IP, end IP, province and route type respectively.
 *
 * @param list records to convert
 * @return a new list holding one {@code Ip} per record, in input order
 */
private static List<Ip> toIpList(List<String> list){
    List<Ip> converted = new ArrayList<Ip>();
    for(int i = 0; i < list.size(); i++){
        Ip entry = new Ip();
        String[] fields = toIpArray(list.get(i));
        entry.setStartIp(fields[0]);
        entry.setEndIp(fields[1]);
        entry.setProvince(fields[2]);
        entry.setRouteType(fields[3]);
        converted.add(entry);
    }
    return converted;
}
/**
 * Splits a record on the {@code "__"} delimiter.
 *
 * <p>The text after the last delimiter is stored as a field too; the previous
 * loop only stored text that preceded a delimiter, so the final field of
 * {@code start__end__province__routeType} was always lost. Writing stops once
 * the array is full, so a record with extra delimiters no longer overruns it.
 *
 * @param line record text; must not be {@code null}
 * @return an array of length 5 holding the fields in order; slots beyond the
 *         number of fields found are {@code null}, and fields beyond the fifth
 *         are discarded
 */
private static String[] toIpArray(String line){
    final String delimiter = "__";
    String[] toArray = new String[5];
    int idx = 0;
    int from = 0;
    while(idx < toArray.length){
        int pos = line.indexOf(delimiter, from);
        if(pos < 0){
            // No delimiter left: the remainder is the last field.
            toArray[idx++] = line.substring(from);
            break;
        }
        toArray[idx++] = line.substring(from, pos);
        from = pos + delimiter.length();
    }
    return toArray;
}
[/list]