关闭

ip库处理

282人阅读 评论(0) 收藏 举报
分类:

原始的纯真ip库有两个问题

1、地区字段没有拆分成国家、省、市、区县,需要用程序做二次拆分

2、有一些不规范的数据(例如学校、网吧之类的记录),需要排重后手工整理


程序里用到的 IPData.txt 和 xuexiao.csv 可以从这里下载:http://download.csdn.net/detail/u011750989/9283149

package com.java.ipku;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Splits the free-form region column of an IP database dump into
 * country / province / city / sub-city columns.
 *
 * <p>{@link #main(String[])} reads {@code D:\IPData.txt} (tab separated:
 * two leading columns that are copied through unchanged, then the region
 * text), and writes {@code D:\ipku.txt} with the four derived columns
 * inserted before the original region text. Regions listed in
 * {@code D:\xuexiao.csv} bypass the splitter and use the hand-curated values
 * from that file instead.
 *
 * <p>NOTE(review): the files are opened with {@code FileReader} /
 * {@code FileWriter}, i.e. without an explicit charset. Confirm that the
 * JVM default charset matches the encoding of the data files.
 */
public class IpUtil {

	/** Province-level prefixes; a region starting with one of these is treated as being in China. */
	static String[] provinces={"北京市","天津市","上海市","重庆市","河北省","山西省","辽宁省","吉林省","黑龙江省","江苏省","浙江省","安徽省","福建省","江西省","山东省","河南省","湖北省","湖南省","广东省","海南省","四川省","贵州省","云南省","陕西省","甘肃省","青海省","台湾省","内蒙古","广西","西藏","宁夏","新疆","香港","澳门"};
	/** Municipalities: the matched prefix is used as the city, the remainder as the sub-city. */
	static String[] directcitys={"北京市","天津市","上海市","重庆市"};

	/** City-level suffixes tried, in this order, for regions under 新疆. */
	static String[] xinjiang_key={"地区","州","市"};
	/** City-level suffixes tried, in this order, for regions under 内蒙古 (Inner Mongolia). */
	static String[] nm_key={"盟","市"};
	/** City-level suffixes tried, in this order, for regions under 海南省 (Hainan). */
	static String[] hn_key={"市","县"};
	/** City-level suffixes tried, in this order, for every other province. */
	static String[] other_key={"市","州"};

	/**
	 * Splits {@code region} and stores the result in {@code area}.
	 *
	 * <ul>
	 * <li>If the region starts with an entry of {@link #provinces}, the
	 *     country is set to 中国 and the province to that entry. For a
	 *     municipality the city is the municipality and the sub-city is the
	 *     rest of the string. Otherwise the rest is cut after the first
	 *     matching city suffix for that province.</li>
	 * <li>If no province matches, the country is 中国 when the text contains
	 *     大学, 网吧, 学院 or 市, and the raw region text otherwise.</li>
	 * <li>Finally, a city containing 大学/网吧 and a sub-city containing
	 *     大学/网吧/宿舍 are blanked out.</li>
	 * </ul>
	 *
	 * @param region raw region text; must not be {@code null}
	 * @param area   receives country, province, city and sub-city (all four
	 *               setters are always called; unknown parts are set to "")
	 */
	public static void startSplitRegion(String region,Area area)
	{
		String country="";
		String provinceName="";
		String city="";
		String subcity="";

		// The database never spells out "中国"; a province prefix is what
		// identifies a Chinese region.
		String province=findPrefix(provinces,region);
		if (province!=null)
		{
			country="中国";
			provinceName=province;
			String rest=region.substring(province.length());

			String directCity=findPrefix(directcitys,region);
			if (directCity!=null)
			{
				city=directCity;
				subcity=rest;
			}
			else if (rest.length()>0)
			{
				for (String key:cityKeysFor(province))
				{
					int at=rest.indexOf(key);
					// at>0 (not >=0): the city name in front of the suffix must be non-empty.
					if (at>0)
					{
						// Cut after the WHOLE keyword. "地区" is two characters
						// long, so a fixed +1 would split it in the middle.
						int cut=at+key.length();
						city=rest.substring(0,cut);
						subcity=rest.substring(cut);
						break;
					}
				}
			}
		}
		else
		{
			// Entries such as "吉林市长春市" match no province prefix (the
			// original author counted 12 such records); they, and bare
			// school / internet-cafe names, are all attributed to China.
			if (region.contains("大学") || region.contains("网吧") || region.contains("学院") || region.contains("市"))
				country="中国";
			else
				country=region;
		}

		// Institution names are not administrative divisions: drop them.
		if (city.contains("大学") || city.contains("网吧"))
			city="";
		if (subcity.contains("大学") || subcity.contains("网吧") || subcity.contains("宿舍"))
			subcity="";

		area.setCountry1(country);
		area.setProvince1(provinceName);
		area.setCity1(city);
		area.setSubcity1(subcity);
	}

	/** Returns the first entry of {@code prefixes} that {@code text} starts with, or {@code null} if none does. */
	private static String findPrefix(String[] prefixes,String text)
	{
		for (String prefix:prefixes)
		{
			if (text.startsWith(prefix))
				return prefix;
		}
		return null;
	}

	/** Returns the ordered list of city-level suffixes to try for the given province. */
	private static String[] cityKeysFor(String province)
	{
		if (province.equals("新疆"))
			return xinjiang_key;
		if (province.equals("内蒙古"))
			return nm_key;
		if (province.equals("海南省"))
			return hn_key;
		return other_key;
	}

	/**
	 * Loads the hand-curated overrides from {@code D:\xuexiao.csv} into
	 * {@code areamaps}.
	 *
	 * <p>Each row is {@code key,country,province,city,subcity}; the key is
	 * the raw region text. Rows with fewer than five columns are reported on
	 * stderr and skipped. I/O problems (including a missing file) are printed
	 * and otherwise ignored, leaving the map with whatever was read so far.
	 *
	 * @param areamaps map to fill, keyed by raw region text
	 */
	public static void initxuexiao(HashMap<String,Area> areamaps)
	{
		BufferedReader br=null;
		try {
			br=new BufferedReader(new FileReader("D:\\xuexiao.csv"));
			String line;
			while((line=br.readLine())!=null)
			{
				// limit -1 keeps trailing empty columns; the default split
				// drops them, which made datas[4] fail on rows whose
				// sub-city is empty.
				String[] datas=line.split(",",-1);
				if (datas.length<5)
				{
					System.err.println("xuexiao.csv: skipping malformed row: "+line);
					continue;
				}
				Area area=new Area();
				area.setCountry1(datas[1]);
				area.setProvince1(datas[2]);
				area.setCity1(datas[3]);
				area.setSubcity1(datas[4]);
				areamaps.put(datas[0], area);
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		finally
		{
			// br is still null when the file could not be opened.
			if (br!=null)
			{
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Converts {@code D:\IPData.txt} into {@code D:\ipku.txt}.
	 *
	 * <p>Output columns, tab separated: input column 0, input column 1,
	 * country, province, city, sub-city, original region text. Input lines
	 * with fewer than three tab-separated fields are reported on stderr and
	 * skipped.
	 *
	 * @param args unused
	 * @throws IOException if the input cannot be read or the output cannot be written
	 */
	public static void main(String[] args) throws IOException
	{
		HashMap<String,Area> areamaps=new HashMap<String,Area>(500);
		initxuexiao(areamaps);

		BufferedReader br=new BufferedReader(new FileReader("D:\\IPData.txt"));
		try {
			BufferedWriter bw=new BufferedWriter(new FileWriter("D:\\ipku.txt"));
			try {
				String line;
				while ((line=br.readLine())!=null)
				{
					String[] datas=line.split("\t");
					if (datas.length<3)
					{
						System.err.println("IPData.txt: skipping malformed line: "+line);
						continue;
					}
					String region=datas[2];

					// Curated entries win over the heuristic splitter.
					Area area=areamaps.get(region);
					if (area==null)
					{
						area=new Area();
						startSplitRegion(region,area);
					}

					bw.write(datas[0]+"\t"+datas[1]+"\t"+area.getCountry1()+"\t"+area.getProvince1()
							+"\t"+area.getCity1()+"\t"+area.getSubcity1()+"\t"+region);
					bw.newLine();
				}
			} finally {
				// Always close (and thereby flush) the writer, even on failure.
				bw.close();
			}
		} finally {
			br.close();
		}
	}

}
package com.java.ipku;

public class Area {
	public String getCountry1() {
		return country1;
	}
	public void setCountry1(String country1) {
		this.country1 = country1;
	}
	public String getProvince1() {
		return province1;
	}
	public void setProvince1(String province1) {
		this.province1 = province1;
	}
	public String getCity1() {
		return city1;
	}
	public void setCity1(String city1) {
		this.city1 = city1;
	}
	public String getSubcity1() {
		return subcity1;
	}
	public void setSubcity1(String subcity1) {
		this.subcity1 = subcity1;
	}
	private String country1="";
	private String province1="";
	private String city1="";
	private String subcity1="";

}



0
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:137481次
    • 积分:2374
    • 等级:
    • 排名:第15717名
    • 原创:97篇
    • 转载:25篇
    • 译文:0篇
    • 评论:11条
    文章分类
    最新评论