# ---- Random Forest (随机森林) ----
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Load the labelled training set (used for fitting/evaluation below) and the
# unlabelled test set for the random-forest experiment.
data_train=pd.read_csv("Train_data.csv")
data_test=pd.read_csv("Test_data.csv")
def encoding(df):
    """Label-encode every object-dtype column of *df* in place.

    Each distinct string value is mapped to its index in the sorted list of
    the column's unique values — the same 0..n-1 coding that sklearn's
    LabelEncoder.fit_transform produces — so downstream estimators receive
    purely numeric features.  Non-object columns are left untouched.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            # Sorted-unique mapping matches LabelEncoder semantics without
            # constructing a throwaway encoder object per column.
            mapping = {value: code for code, value in enumerate(sorted(df[col].unique()))}
            df[col] = df[col].map(mapping)
# Encode all string columns of the training set, then separate features
# from the "class" label column.
encoding(data_train)
X = data_train.drop(["class"], axis=1)
y = data_train["class"]
# Keep the 25 features with the highest mutual information w.r.t. the label.
select_best_cols = SelectKBest(mutual_info_classif, k=25)
select_best_cols.fit(X, y)
selected_features = X.columns[select_best_cols.get_support()]
X = X[selected_features]
# 70/30 train/test split, then standardize using statistics fitted on the
# training split only (transform-only on the test split avoids leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sc = StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)
def classalgo_test(x_train, x_test, y_train, y_test):
    """Fit a random forest on the training split and return its accuracy on
    the test split, formatted as a two-decimal string."""
    model = RandomForestClassifier()
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return "{:.2f}".format(accuracy_score(y_test, predictions))
# Train, evaluate and report the random-forest accuracy on the held-out split.
print(classalgo_test(X_train, X_test, y_train, y_test))
# ---- Naive Bayes (朴素贝叶斯) ----
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the SMS spam corpus; ISO-8859-1 avoids decode errors on stray bytes.
data = pd.read_csv("sms_spam.csv", encoding='ISO-8859-1')
# Vocabulary of every token seen in the training messages (filled in below).
words = set()
# Normalize the raw message text before tokenization.
column = 'text'
data[column] = data[column].str.lower()   # lowercase everything
# regex=True is required: since pandas 2.0, str.replace treats the pattern
# literally by default, which would leave all punctuation/digits in place.
data[column] = data[column].str.replace('[^a-zA-Z ]', '', regex=True)
data[column] = data[column].str.strip()   # trim surrounding whitespace
# Hold out 10% of the messages for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(data["text"], data["type"], test_size=0.1, random_state=42)
# Build the vocabulary from the training messages only, then turn every
# message into a bag-of-words count vector over that vocabulary.
def _bow_matrix(docs, vocabulary):
    """Return an (n_docs, n_words) matrix of whole-token frequencies."""
    rows = []
    for doc in docs:
        freq = {}
        # Counting split() tokens (instead of str.count substrings) stops a
        # word such as "in" from also matching inside "thing", which would
        # inflate the counts fed to the classifier.
        for token in doc.split():
            freq[token] = freq.get(token, 0) + 1
        rows.append([freq.get(word, 0) for word in vocabulary])
    return np.array(rows)

for doc in X_train:
    words.update(doc.split())
vocabulary = sorted(words)  # fixed column order shared by train and test
X_train_counts = _bow_matrix(X_train, vocabulary)
X_test_counts = _bow_matrix(X_test, vocabulary)
model = MultinomialNB()
model.fit(X_train_counts, Y_train)
# Score the classifier on the held-out messages.
predicted = model.predict(X_test_counts)
accuracy = accuracy_score(Y_test, predicted)
# Report the accuracy rounded to two decimal places.
print(f'{accuracy:.2f}')
# ---- Decision Tree (决策树算法) ----
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# Original string values of every object column, in first-seen order.
str_list = []
# Corresponding integer codes after label encoding, aligned with str_list.
digital_list = []
# string-value -> integer-code lookup built from the two lists above.
my_dict = None
crime_data = pd.read_csv('crime.csv')
# Clean the data FIRST.  The original called drop_duplicates/dropna without
# parentheses and without assignment (both no-ops), and only after sampling
# the unique values — which could have desynchronized the string/code
# pairing built below had the calls actually run.
crime_data = crime_data.drop_duplicates()
crime_data = crime_data.dropna()
# Record the original string values of every categorical column, in order
# of first appearance.
for col in crime_data.columns:
    if crime_data[col].dtypes == 'object':
        unique_data = crime_data[col].unique()
        for item in unique_data:
            str_list.append(item)
# Label-encode the categorical columns and record the codes in the same
# first-appearance order, so zip(str_list, digital_list) pairs each string
# with its own code.
for col in crime_data.columns:
    if crime_data[col].dtypes == 'object':
        le = LabelEncoder()
        crime_data[col] = le.fit_transform(crime_data[col])
        unique_data = crime_data[col].unique()
        for item in unique_data:
            digital_list.append(item)
pairs = zip(str_list, digital_list)
my_dict = {key: value for key, value in pairs}
# Use only the (now label-encoded) neighbourhood and month as predictors
# of the crime type.
x = crime_data.loc[:, ['NEIGHBOURHOOD', 'MONTH']]
y = crime_data.loc[:, 'TYPE']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
# Predict the crime type for the 'Sunset' neighbourhood in month 3.
# NOTE(review): MONTH is passed as the string '3' while the trained column
# is numeric after label encoding — confirm the intended dtype.
new_data = {"NEIGHBOURHOOD": my_dict['Sunset'], "MONTH": '3'}
prediction = dtc.predict(pd.DataFrame([new_data]))
# Map the predicted integer code back to its original string label.
# predict() returns an array, so compare against its single element; the
# None default avoids a NameError when no code matches.
outcome = None
predicted_code = prediction[0]
for k, v in my_dict.items():
    if v == predicted_code:
        outcome = k
        break
print('根据预测可能的犯罪类型是:', outcome)
# ---- Botnet detection (僵尸网络) ----
import math
import os
import sys
import pickle
import numpy as np
from numpy import *
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Load the precomputed feature matrix.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# unpickle feature_data.pkl if it comes from a trusted source.
with open("feature_data.pkl", 'rb') as fp:
    feature_data = pickle.load(fp, encoding='iso-8859-1')
# One integer label per line; strip LF/CRLF line endings before parsing.
# (The original left label_file open forever; `with` closes both files
# even if parsing raises.)
label_data = []
with open('y.txt', 'r') as label_file:
    for line in label_file:
        label_data.append(int(line.replace('\n', '').replace('\r\n', '')))
label_data = np.array(label_data)
import random

# Deterministically shuffle the samples and labels with one permutation.
n_samples, n_features = feature_data.shape
perm = list(range(n_samples))
random.seed(12345)
random.shuffle(perm)
XX, yy = feature_data[perm], label_data[perm]
# 30% train / 70% test split.  The original hard-coded the test start at
# sample 7000, which overlaps the training set or leaves a gap unless
# n_samples == 10000; slicing at cut_off keeps the two sets disjoint and
# exhaustive for any dataset size.
cut_off = int(n_samples / 10) * 3
train_feature = XX[:cut_off]
train_label = yy[:cut_off]
test_feature = XX[cut_off:]
test_label = yy[cut_off:]
# Train a linear SVM and persist it for later reuse.  `with` guarantees the
# file is closed even if pickling fails (the original leaked the handle on
# exception).
classifier = SVC(kernel='linear', C=1)
classifier.fit(train_feature, train_label)
with open("SVM_train_model.pkl", 'wb') as fp:
    pickle.dump(classifier, fp)
# Measure accuracy on the held-out samples.
probas_ = classifier.predict(test_feature)
correct = 0
for predicted, actual in zip(probas_, test_label):
    if predicted == actual:
        correct += 1
accuracy = 1.0 * correct / test_label.shape[0]
# Truncate (not round) to one decimal place, matching the original output.
print("{:.1f}".format(int(accuracy * 10) / 10))