Network Big Data Programs

Random Forest

import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the training and test sets
data_train = pd.read_csv("Train_data.csv")
data_test = pd.read_csv("Test_data.csv")

# Label-encode every string column in place
def encoding(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            label_encoder = preprocessing.LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])

encoding(data_train)

# Keep the 25 features with the highest mutual information with the label
X = data_train.drop(["class"], axis=1)
y = data_train["class"]
select_best_cols = SelectKBest(mutual_info_classif, k=25)
select_best_cols.fit(X, y)
selected_features = X.columns[select_best_cols.get_support()]
X = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize: fit the scaler on the training split only, then apply to both
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

def classalgo_test(x_train, x_test, y_train, y_test):
    # Train a random forest and report accuracy on the held-out split
    rfc = RandomForestClassifier()
    rfc.fit(x_train, y_train)
    y_test_pred = rfc.predict(x_test)
    return "{:.2f}".format(accuracy_score(y_test, y_test_pred))

a = classalgo_test(X_train, X_test, y_train, y_test)
print(a)

Naive Bayes

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv("sms_spam.csv", encoding='ISO-8859-1')

# Spam keyword vocabulary, filled from the training messages below
words = set()


# Clean the text column
column = 'text'
data[column] = data[column].str.lower()                                 # lowercase
data[column] = data[column].str.replace('[^a-zA-Z ]', '', regex=True)  # keep letters and spaces only
data[column] = data[column].str.strip()                                 # trim surrounding whitespace

# Split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(data["text"], data["type"], test_size=0.1, random_state=42)

# Build the model: bag-of-words counts over the training vocabulary
for doc in X_train:
    words.update(doc.split())
vocab = sorted(words)  # fix a deterministic column order for the count matrix
# Count whole tokens (doc.split()), not raw substrings
X_train_counts = np.array([[doc.split().count(word) for word in vocab] for doc in X_train])
X_test_counts = np.array([[doc.split().count(word) for word in vocab] for doc in X_test])
model = MultinomialNB()
model.fit(X_train_counts, Y_train)

# Evaluate the model
Y_pred = model.predict(X_test_counts)
accuracy = accuracy_score(Y_test, Y_pred)

# Print the score
print(f'{accuracy:.2f}')
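
The nested list comprehensions above scan every document once per vocabulary word. As a point of comparison, a sketch of the same pipeline with scikit-learn's CountVectorizer, which builds a sparse count matrix in a single pass (fitted on the training split only):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()                   # tokenize and count in one step
X_train_vec = vectorizer.fit_transform(X_train)  # learn the vocabulary on the training split
X_test_vec = vectorizer.transform(X_test)        # reuse the same vocabulary for the test split
nb = MultinomialNB()
nb.fit(X_train_vec, Y_train)
print(f'{accuracy_score(Y_test, nb.predict(X_test_vec)):.2f}')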
 

Decision Tree Algorithm

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

crime_data = pd.read_csv('crime.csv')

# Basic cleaning: drop_duplicates()/dropna() return new frames, so assign the results back
crime_data = crime_data.drop_duplicates()
crime_data = crime_data.dropna()

# Encode every string column, keeping a separate string -> code mapping per column
# (a single pooled dict would collide, since different columns reuse the same integer codes)
encoders = {}
for col in crime_data.columns:
    if crime_data[col].dtypes == 'object':
        le = LabelEncoder()
        crime_data[col] = le.fit_transform(crime_data[col])
        # LabelEncoder numbers classes in sorted order, exposed via le.classes_
        encoders[col] = {label: code for code, label in enumerate(le.classes_)}


x = crime_data.loc[:, ['NEIGHBOURHOOD', 'MONTH']]
y = crime_data.loc[:, 'TYPE']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)

# Predict the crime type for the Sunset neighbourhood in March
new_data = {"NEIGHBOURHOOD": encoders['NEIGHBOURHOOD']['Sunset'], "MONTH": 3}
prediction = dtc.predict(pd.DataFrame([new_data]))[0]

# Map the predicted TYPE code back to its string label
for label, code in encoders['TYPE'].items():
    if code == prediction:
        outcome = label
        break
print('Predicted most likely crime type:', outcome)
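
The 30% split reserved above is never scored. A minimal evaluation sketch using the accuracy_score already imported in this section:

# Score the tree on the held-out 30% split
y_pred = dtc.predict(x_test)
print('Test accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))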

Botnet Detection

import pickle
import random
import numpy as np
from sklearn.svm import SVC

# Load the precomputed feature matrix
with open("feature_data.pkl", 'rb') as fp:
    feature_data = pickle.load(fp, encoding='iso-8859-1')

# Load one integer label per line
label_data = []
with open('y.txt', 'r') as label_file:
    for line in label_file:
        label_data.append(int(line.strip()))

label_data = np.array(label_data)

# Shuffle the samples with a fixed seed, then split 30% train / 70% test
n_samples, n_features = feature_data.shape
p = list(range(n_samples))
random.seed(12345)
random.shuffle(p)
XX, yy = feature_data[p], label_data[p]
cut_off = int(n_samples / 10) * 3

train_feature = XX[:cut_off]
train_label = yy[:cut_off]
test_feature = XX[cut_off:]   # everything after the cut-off is held out
test_label = yy[cut_off:]

# Train a linear SVM and persist it for later reuse
classifier = SVC(kernel='linear', C=1)
classifier.fit(train_feature, train_label)
with open("SVM_train_model.pkl", 'wb') as fp:
    pickle.dump(classifier, fp)

# Evaluate: fraction of test samples predicted correctly
predictions = classifier.predict(test_feature)
accuracy = np.mean(predictions == test_label)

print("{:.1f}".format(accuracy))
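
To choose C rather than fixing it at 1, a minimal GridSearchCV sketch over the training split with 5-fold cross-validation (the parameter grid is an illustrative assumption, not from the source):

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10, 100]}  # assumed search range for illustration
grid = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5)
grid.fit(train_feature, train_label)
print(grid.best_params_, "{:.3f}".format(grid.best_score_))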


 
