昨天新开的坑。
共131653条数据,来自2014年末泄露到网上的12306账号信息,包括邮箱(部分为QQ邮箱)、姓名、电话、用户名、密码、身份证号。
还在做数据的格式化存储……
10/14更新代码:
1、数据格式化存储
# -*- coding: gbk -*-
#2016/10/13 13:30
#12306泄露数据分析
import csv
# Load the lookup table once: every row of loc.csv is kept as a list of
# column strings. findLoc() compares column 1 with an id prefix and returns
# column 0 of the matching row.
# The with-block closes the handle as soon as the rows are read; the name
# `csvfile` is rebound further down, so the handle would otherwise stay open.
with open(r'D:\python27\py\loc.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    loc = list(reader)
def findLoc(theId, table=None):
    """Return the location stored for an id prefix.

    theId -- the code to look up; the script passes the first six
             characters of an id number.
    table -- optional rows to search instead of the module-level ``loc``
             list; each row holds the location in column 0 and the code
             in column 1.

    Returns column 0 of the first row whose column 1 equals ``theId``,
    or 0 when no row matches.
    """
    if table is None:
        table = loc
    for row in table:
        # Rows with fewer than two columns (e.g. an empty CSV row) can
        # never match and must not raise IndexError.
        if len(row) > 1 and row[1] == theId:
            return row[0]
    return 0
# Split every "----"-separated record of data.txt into CSV rows.
# Records whose id (field 3) is a well-formed 18-character id get three
# derived columns (birth, location, capital) and go to dat.csv; all other
# records are labelled with placeholder values and go to hkdat.csv.
hk=0        # records routed to hkdat.csv
count=0     # records routed to dat.csv
capital=0   # records of dat.csv flagged as capital
skipped=0   # lines that do not contain the 7 expected fields
header=['email_1','key','name','id','user_name','phone_number','email_2','birth','location','capital']
alldata=[]
hkdata=[]
loc_cache={}  # id prefix -> findLoc() result, so each prefix is scanned once
with open(r'C:\Users\Administrator\Desktop\data.txt','r') as f, \
        open('dat.csv', 'wb') as csvfile, \
        open('hkdat.csv', 'wb') as hkcsvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    hkwriter = csv.writer(hkcsvfile)
    hkwriter.writerow(header)
    for line in f:
        # Drop the line terminator first; otherwise it ends up inside the
        # last field (email_2) of every CSV row.
        data=line.rstrip('\r\n').split('----')
        if len(data)<7:
            # Blank or truncated line: fields 3 and 6 are read below.
            skipped=skipped+1
            continue
        idnum=data[3]
        # Same test as before: any character above '9' other than 'X', or
        # a length different from 18, marks the id as non-standard.
        irregular=len(idnum)!=18 or any(st>'9' and st!='X' for st in idnum)
        if irregular:
            print(idnum+' '+data[2])
            pot=data[6].find('@')
            print(data[6][pot+1:])
            hk=hk+1
            data.append('unknown_birth')
            # With the source encoding set to GBK this literal is no longer
            # garbled in the output.
            data.append('港澳台地区')
            data.append(2)
            hkdata.append(data)
            hkwriter.writerow(data)
            continue
        data.append(idnum[6:14])  # birth
        prefix=idnum[:6]
        if prefix not in loc_cache:
            loc_cache[prefix]=findLoc(prefix)
        data.append(loc_cache[prefix])  # location
        if idnum[2:4]=='01':  # capital
            data.append(1)
            capital=capital+1
        else:
            data.append(0)
        # When Excel displays the CSV the last three digits of the id column
        # show up as 0; the stored data is unchanged, so this is ignored for
        # now and will be dealt with when gender is derived from the id.
        alldata.append(data)
        writer.writerow(data)
        count=count+1
        if count%1000==0:
            print('count : '+str(count))
print('done')
print('data amount : '+str(count))
print('hongkong id amount : '+str(hk))
if skipped:
    print('skipped malformed lines : '+str(skipped))
2、年龄分布统计
import csv
# Collect the first four characters of column 7 of every row of datas.csv
# (presumably the 'birth' column produced by the formatting step, i.e. the
# birth year -- confirm against the file actually used).
data=[]
with open(r'D:\python27\py\datas.csv','rb') as csvfile, \
        open(r'D:\python27\py\newcsv.csv','wb') as newcsv:
    reader = csv.reader(csvfile)
    # Nothing is written through this writer yet; opening the file is kept
    # so that newcsv.csv is still created (empty) as before.
    writer=csv.writer(newcsv)
    for line in reader:
        # Rows with fewer than 8 columns have no column 7 to read.
        if len(line)>7:
            data.append(line[7][:4])
def count(datalist,item):
    """Return how many entries of ``datalist`` are equal to ``item``.

    The tally is also printed before it is returned.
    """
    num=sum(1 for entry in datalist if entry==item)
    print(num)
    return num
# Build [year, number of records] pairs for every year from 1940 through
# 2009; entries of `data` are strings, so each year is compared as str(n).
countlist=[[n,count(data,str(n))] for n in range(1940,2010)]
print(countlist)
数据统计出来之前,昨晚立的flag:
今天简单用Excel做了个图:
嗯。先这样。