# Isolation-forest outlier detection: runs the detection routine on a fixed
# schedule, reads from / writes back to an Oracle database, persists the trained
# anomaly-detection model, can visualise the anomaly-score histogram, and tags
# inserted rows with a UUID and the current date/time.
import pandas as pd
import cx_Oracle
import plotly.express as px
from sqlalchemy import create_engine
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import datetime
from sklearn.externals import joblib
import time,os
def print_ts(message):
    """Print *message* prefixed with the current local timestamp (YYYY-MM-DD HH:MM:SS)."""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(timestamp, message)
def _train_models(conn, df_id):
    """Fit one IsolationForest per inverter id and persist it under save/.

    Reads each inverter's history (standby rows filtered out in SQL),
    standardises the feature columns, fits the model and dumps it to
    ``save/clf.pkl<id>``.  The ``save`` directory must already exist.
    """
    for i in range(len(df_id)):
        inverter_id = df_id.iloc[i]['INVERTER_ID']  # renamed: `id` shadowed the builtin
        sqldata = "select * from PV_INVERTER_DATA t where inverter_id ={0} and EQUIPMENT_STATUS <>'待机:无光照' and EQUIPMENT_STATUS <>'待机:光照检测'".format(inverter_id)
        df_data = pd.read_sql_query(sqldata, conn)
        # Columns that must not take part in the detection.
        features = df_data.drop(['ID', 'EQUIPMENT_STATUS', 'CREATE_TIME'], axis=1)
        print("最终参与的数据", features)
        # Step 1: standardise the data.
        scaled = pd.DataFrame(StandardScaler().fit_transform(features))
        # Train the isolation forest.  The table has 35 feature columns;
        # many trees are used for stability (library default is 100).
        model = IsolationForest(contamination=0.0001, max_features=35,
                                n_estimators=1000)
        model.fit(scaled)
        # Persist the model (the 'save' folder must be created beforehand,
        # otherwise this raises).
        joblib.dump(model, 'save/clf.pkl{}'.format(inverter_id))


def _predict_latest(conn, engine, df_id):
    """Score the newest record of each inverter and store detected anomalies.

    Loads the per-inverter model saved by :func:`_train_models`, predicts on
    the latest non-standby row, writes anomalies to ``predict-anamoly.csv``
    and appends them (with an INSERT_TIME stamp) to pv_auto_alarm_real_time.
    """
    for i in range(len(df_id)):
        id_new = df_id.iloc[i]['INVERTER_ID']
        # Oracle has no TOP 1 / LIMIT 1; take the newest row via rownum on an
        # ordered subquery.
        sqldata = "SELECT * FROM(select * from PV_INVERTER_DATA t where inverter_id ={0} order by t.CREATE_TIME desc) where rownum<2".format(id_new)
        df_orgin = pd.read_sql_query(sqldata, conn)
        print('三条数据{0}'.format(id_new), df_orgin)
        # Step 1: keep only records whose equipment status is not standby.
        df_start = df_orgin[(df_orgin['EQUIPMENT_STATUS'] != '待机:无光照')
                            & (df_orgin['EQUIPMENT_STATUS'] != '待机:光照检测')]
        if df_start.empty:
            print("没有符合条件的记录")
            continue
        # Drop the three columns that are not part of the feature set.
        df_new = df_start.drop(['ID', 'EQUIPMENT_STATUS', 'CREATE_TIME'], axis=1)
        print("最终参与的数据", df_new)
        # Reload the persisted model for this inverter.
        model = joblib.load('save/clf.pkl{}'.format(id_new))
        df_start = df_start.copy()  # avoid SettingWithCopyWarning on column assignment
        # Anomaly score, and label: 1 = normal, -1 = anomaly.
        predictions = model.predict(df_new)
        df_start['scores'] = model.decision_function(df_new)
        # BUG FIX: the original wrapped predictions in pd.Series, which
        # re-aligns by a fresh 0-based index and yields NaN whenever
        # df_start's index is not 0; assign the raw array instead.
        df_start['anomaly'] = predictions
        # Possible visualisations (contamination is tiny, so the red anomaly
        # region in the histogram is correspondingly small):
        #   px.histogram(df_start, x='scores', color='anomaly').show()
        #   px.scatter_3d(df_start, x='PV1_V', y='PV2_V', z='PV3_V',
        #                 color='anomaly').show()
        # BUG FIX: the original called model.predict() a second time and
        # compared the whole array to 1, which raises for >1 row; reuse the
        # prediction already computed and test the (single) newest row.
        if predictions[0] == 1:
            print('预测这条id={0}记录正常'.format(id_new))
        else:
            print('预测这条id={0}记录异常'.format(id_new))
        anomalies = pd.DataFrame(df_start.loc[df_start['anomaly'] == -1])
        print('最终预测异常的完整数据记录', anomalies)
        # anomalies['TASK_ID'] = str(uuid.uuid4())  # how a UUID column would be added
        # Drop the helper columns before writing to the database.
        anomalies = anomalies.drop(['anomaly', 'scores'], axis=1)
        # Save to CSV.
        anomalies.to_csv("predict-anamoly.csv", na_rep="NULL", encoding='ANSI')
        # Save to the database with the current insert timestamp.
        anomalies['INSERT_TIME'] = datetime.datetime.now()
        anomalies.to_sql(name='pv_auto_alarm_real_time', con=engine,
                         index=False, if_exists='append', chunksize=1)


def detection(interval, command):
    """Every *interval* seconds run *command*, then the full detect pipeline.

    Each cycle: sleep until the next multiple of *interval*, run *command*
    via os.system, connect to Oracle, train one model per inverter id and
    score the latest record of each, appending anomalies to the alarm table.
    Loops forever; any per-cycle error is printed and the next cycle starts.
    """
    print_ts("-" * 100)
    print_ts("Command %s" % command)
    print_ts("Starting every %s seconds." % interval)
    print_ts("-" * 100)
    while True:
        conn = None
        try:
            # Sleep until the next interval boundary.
            time_remaining = interval - time.time() % interval
            print_ts("Sleeping until %s (%s seconds)..." % ((time.ctime(time.time() + time_remaining)), time_remaining))
            time.sleep(time_remaining)
            print_ts("Starting command.")
            status = os.system(command)
            print_ts("-" * 100)
            print_ts("Command status = %s." % status)
            # Avoid mojibake when exchanging Chinese text with Oracle.
            os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
            # Connect to the database (fill in your own credentials/DSN).
            conn = cx_Oracle.connect('PV/1234@192.168.1.212:1521/orcl')
            # DataFrame.to_sql cannot use the raw connection, only an engine.
            engine = create_engine('oracle+cx_oracle://PV:Root1234@192.168.1.202:1521/orcl?charset=utf8')
            print("连接成功")
            # One row per distinct inverter id.
            # (The original also fetched the whole PV_INVERTER_DATA table
            # here and never used it; that query was removed.)
            df_id = pd.read_sql_query(
                'select t.inverter_id from PV_INVERTER_DATA t group by inverter_id',
                conn)
            _train_models(conn, df_id)
            _predict_latest(conn, engine, df_id)
            print('Read from and write to oracle table successfully!')
        except Exception as e:
            print(e)
        finally:
            # BUG FIX: the original closed the connection only in the except
            # branch (leaking it on success, and NameError-ing if connect()
            # itself failed); close it on every iteration.
            if conn is not None:
                conn.close()
if __name__ == '__main__':
    # Run the detection pipeline once every 24 hours.
    # BUG FIX: `interval` was commented out in the original, so the call
    # below raised NameError at startup; define it explicitly.
    interval = 86400  # 24 hours, in seconds
    command = r"ls"
    detection(interval, command)