raw_data.py
# coding: utf-8
import random
import csv
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Purpose: re-serialize every raw edge row into raw_data.csv so that later
# scripts compare values that went through the *same* CSV write path.
# Re-reading data written by csv.writer changes the textual float precision,
# so writing the raw data out once normalizes precision for row-by-row
# comparison against other files written the same way.

data = pd.DataFrame(pd.read_csv(
    '/home/henson/Desktop/huanping/huanping.csv_EDGE_NBD.csv',
    encoding='gb18030'))
data.head()

# Column order of X fixes the output row layout below.
X = np.array(data[['Nodeid1', 'Nodeid2', 'author_degree1',
                   'author_degree2', 'No', 'isBD']])

# One output tuple per input row, same column order as X.
rows = [tuple(row) for row in X]

# FIX: use a context manager so the handle is always closed, and open with
# newline='' as the csv module requires to avoid blank lines on Windows.
# 'a+' keeps the original append semantics (repeated runs keep appending).
with open("raw_data.csv", "a+", encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(rows)
经 csv.writer 再写入一遍后,浮点数的文本精度会发生改变;为了统一精度、便于后续逐行对比筛选,这里把原始数据原封不动地重写了一遍。
sele_test.py
# coding: utf-8
import random
import csv
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Purpose: append only the rows whose label (isBD) is 1 to sele.csv.
# The rows go through the same csv.writer path as raw_data.csv, so the two
# files share the same textual float precision and can be compared row-wise.

data = pd.DataFrame(pd.read_csv(
    '/home/henson/Desktop/huanping/huanping.csv_EDGE_NBD.csv',
    encoding='gb18030'))
data.head()

# Column order of X fixes the output row layout; column 5 is the label.
X = np.array(data[['Nodeid1', 'Nodeid2', 'author_degree1',
                   'author_degree2', 'No', 'isBD']])

# Keep only label-1 rows (isBD is the last column of X).
rows = [tuple(row) for row in X if row[5] == 1]

# FIX: context manager closes the handle; newline='' is required by the csv
# module. 'a+' keeps the original append semantics.
with open("sele.csv", "a+", encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(rows)
先选出 label 为 1 的数据(具体选哪一类按需求来定)。
after_sele.py
# coding: utf-8
import random
import csv
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Purpose: merge the label-0 rows from raw_data.csv into sele.csv (which
# already holds the label-1 rows), skipping raw rows whose feature triple
# already appears in sele.csv. The resulting sele.csv is the training set.

f = open("sele.csv", "a+", encoding='utf-8')
writer_csv = csv.writer(f)

# Previously selected rows (label == 1), features only.
data1 = pd.DataFrame(pd.read_csv('sele.csv', encoding='utf-8'))
X1 = np.array(data1[['author_degree1', 'author_degree2', 'No']])
isBD1 = np.array(data1[['isBD']])
degreex1 = X1[:, 0]
degreex2 = X1[:, 1]
Nox = X1[:, 2]
# BUG FIX: the original `isBD1 = X1[:,]` clobbered the label array with the
# feature matrix. Flatten the label column instead, mirroring the
# `isBD = isBD[:, 0]` pattern used for the raw data below.
isBD1 = isBD1[:, 0]

# Full raw data (labels 0 and 1), re-read with the normalized precision.
data = pd.DataFrame(pd.read_csv('raw_data.csv', encoding='utf-8'))
data.head()
nodeid1 = np.array(data[['Nodeid1']])[:, 0]
nodeid2 = np.array(data[['Nodeid2']])[:, 0]
X = np.array(data[['author_degree1', 'author_degree2', 'No']])
isBD = np.array(data[['isBD']])[:, 0]
data = []  # accumulator for rows to append to sele.csv
degree1 = X[:, 0]
degree2 = X[:, 1]
No = X[:, 2]

# --- abandoned approach, kept only as a note: matrices can be subtracted
# row-wise, then a zero row means the two rows were equal ---
# d = np.array([0, 0, 0])
# for i in range(0, len(isBD)):
#     if isBD[i] == 0:
#         a = X[i, :]
#         b = X1
#         c = b - a
#         print(c[i, :])
#         for j in range(0, len(c)):
#             if (c[i, :] == d).all():
#                 print(c[i, :])
#                 break
#             else:
#                 data.append(b[i, :])
#                 print("find it")

# Spot-check one hand-picked pair of rows for exact equality, to confirm
# the two files really serialize floats identically.
# NOTE(review): indices 969 and 337 are hard-coded samples — this assumes
# both CSVs have at least that many rows; confirm before reuse.
if isBD[969] == 0:
    a = X[969, :]
    print(a)
    for j in range(0, 1):
        b = X1[337, :]
        print("b:", b)
        if (a == b).all():
            print("yes,equal")  # the two serializations match exactly
            c = 0
            break
        else:
            c = 1
# data.append((nodeid1[i], nodeid2[i], X[i, 0], X[i, 1], X[i, 2], isBD[i]))
# print(data)
# writer_csv.writerows(data)

# --- full merge (commented out; uncomment to run) ---
# Compare each label-0 row i of the raw data against every row j of
# sele.csv; only rows that match none of them are appended, so appending
# directly to sele.csv merges the label-0 and label-1 data.
# for i in range(0, len(isBD)):
#     # c = 1
#     if isBD[i] == 0:
#         a = X[i, :]
#         for j in range(0, len(isBD1)):
#             b = X1[j, :]
#             if (a == b).all():
#                 print(a)
#                 c = 0
#                 break
#             else:
#                 c = 1
#         if c == 1:
#             data.append((nodeid1[i], nodeid2[i], X[i, 0], X[i, 1], X[i, 2], isBD[i]))
# print(data)
# writer_csv.writerows(data)

f.close()  # FIX: the original leaked this handle; close it explicitly
最后得到的 sele.csv 即可拿来当作训练集。
这样的数据一定要根据具体需求进行预处理,否则会影响模型的训练效果。