In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Load the two raw inputs: the merged sensor CSV (its first row is treated as a
# header and replaced by the column names below) and the weather JSON, parsed
# as a pandas Series.
sensor_columns = ["MTU", "Time", "Power", "Cost", "Voltage"]
sensor_data = pd.read_csv(
    'merged-sensor-files.csv',
    header=0,
    names=sensor_columns,
)
weather_data = pd.read_json('weather.json', typ='series')
In [3]:
import json

# Read the weather file as a flat JSON object (key -> temperature value).
# The context manager closes the handle; the previous version left it open.
with open('weather.json') as f:
    json_data = json.load(f)

# Convert keys to int and values to float. The loop variables get their own
# names so they no longer share the name `temperature` with the DataFrame
# that is built right after the loop.
Time = []
Temperature = []
for raw_time, raw_temperature in json_data.items():
    Time.append(int(raw_time))
    Temperature.append(float(raw_temperature))

temperature = pd.DataFrame({'Time': Time, 'Temperature': Temperature})
temperature
Out[3]:
In [4]:
import json

# Keep the raw file text in `f` (as before), but close the handle promptly
# instead of relying on garbage collection.
with open('weather.json') as weather_file:
    f = weather_file.read()

# Parse with the json module rather than splitting on ',' and ':' by hand,
# which breaks as soon as a key or value contains one of those characters.
# object_pairs_hook=list yields the (key, value) pairs in file order, so Time
# and Temperature keep the ordering the manual parser produced.
Time = []
Temperature = []
for time, temperature in json.loads(f, object_pairs_hook=list):
    Time.append(int(time))
    Temperature.append(float(temperature))
In [5]:
# Peek at the first rows of the sensor table (head() defaults to 5 rows).
sensor_data.head()
Out[5]:
In [6]:
# Summary statistics of the sensor table as loaded, before any cleaning.
sensor_data.describe()
Out[6]:
In [7]:
# Print the column dtypes, non-null counts and memory usage.
sensor_data.info()
In [8]:
# Per-column dtypes before any type conversion is applied.
sensor_data.dtypes
Out[8]:
In [9]:
# Rows whose "Power" cell holds the literal text " Power" are inconsistent
# (presumably header lines repeated inside the merged CSV - TODO confirm).
# Collect their index labels so they can be dropped.
is_faulty_row = sensor_data["Power"] == " Power"
faulty_row_idx = sensor_data.index[is_faulty_row].tolist()
faulty_row_idx
Out[9]:
In [10]:
# Remove the inconsistent rows. Rebinding the name (instead of inplace=True)
# together with errors="ignore" keeps this cell re-runnable: a second run no
# longer raises KeyError for labels that were already removed.
sensor_data = sensor_data.drop(faulty_row_idx, errors="ignore")

# Sanity check: this should now display an empty list.
sensor_data[sensor_data["Power"] == " Power"].index.tolist()
Out[10]:
In [11]:
# Cast the measurement columns to float.
measurement_cols = ["Power", "Cost", "Voltage"]
sensor_data[measurement_cols] = sensor_data[measurement_cols].astype(float)

# Assign through a single column label. The previous form,
# `sensor_data[["Time"]] = <Series>`, pairs a one-element column list with a
# 1-D value, which newer pandas releases reject with
# "Columns must be same length as key".
sensor_data["Time"] = pd.to_datetime(sensor_data["Time"])

# Hour of day taken from the timestamp; used later for grouping and joining.
sensor_data["Hour"] = sensor_data["Time"].dt.hour
sensor_data.dtypes
Out[11]:
In [12]:
# Turn the weather Series into a two-column frame: the former index becomes
# "Time" and the values become "Temperature".
temperature_data = weather_data.to_frame().reset_index(level=0)
temperature_data.columns = ["Time", "Temperature"]

# Overwrite the values with the list parsed from weather.json in an earlier
# cell. The assignment is positional, so it assumes both were read in the same
# row order - TODO confirm.
temperature_data["Temperature"] = Temperature
temperature_data["Hour"] = pd.DatetimeIndex(temperature_data["Time"]).hour
temperature_data["Temperature"] = temperature_data["Temperature"].astype(float)
temperature_data
Out[12]:
In [14]:
# Summary statistics of the sensor table after the cleaning/conversion cells.
sensor_data.describe()
Out[14]:
In [15]:
# Summary statistics of the temperature table.
temperature_data.describe()
Out[15]:
In [16]:
# Mean Power / Cost / Voltage per hour of day.
# The measurement columns are selected explicitly: since pandas 2.0,
# groupby().mean() no longer silently drops non-numeric columns and raises
# TypeError instead, so averaging the whole frame is not portable.
grouped_sensor_data = (
    sensor_data
    .groupby("Hour", as_index=False)[["Power", "Cost", "Voltage"]]
    .mean()
)
grouped_sensor_data
Out[16]:
In [17]:
# Mean temperature per hour of day.
# Only "Temperature" is averaged: newer pandas can also average the datetime
# "Time" column, which would then show up in this frame and be picked up as an
# extra implicit join key by later merge() calls that do not pass `on=`.
grouped_temperature_data = (
    temperature_data
    .groupby("Hour", as_index=False)[["Temperature"]]
    .mean()
)
grouped_temperature_data
Out[17]:
In [18]:
%pylab inline
plt.style.use('ggplot')
In [19]:
# Histogram of the individual power readings (50 bins).
fig, ax = plt.subplots(figsize=(13, 7))
ax.hist(sensor_data.Power, bins=50)
fig.suptitle('Power Histogram', fontsize=20)
ax.set_xlabel('Power', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
Out[19]:
In [20]:
# Bar chart of the mean power for each hour of the day, one tick per hour.
fig, ax = plt.subplots(figsize=(13, 7))
ax.bar(grouped_sensor_data.Hour, grouped_sensor_data.Power)
fig.suptitle('Power Distribution with Hours', fontsize=20)
ax.set_xlabel('Hour', fontsize=16)
ax.set_ylabel('Power', fontsize=16)
ax.set_xticks(range(0, 24))
plt.show()
In [21]:
# Pair each hour's mean temperature with the same hour's mean power through a
# join on "Hour". Passing columns from the two separately grouped frames
# straight to plt.bar matches rows by position only, which mislabels the bars
# (or fails on a length mismatch) if the frames do not cover the same hours.
hourly_power_temperature = grouped_sensor_data.merge(
    grouped_temperature_data, on="Hour"
)

fig = plt.figure(figsize=(13, 7))
plt.bar(hourly_power_temperature.Temperature, hourly_power_temperature.Power)
fig.suptitle('Power Distribution with Temperature', fontsize=20)
plt.xlabel('Temperature in Fahrenheit', fontsize=16)
plt.ylabel('Power', fontsize=16)
plt.show()
In [22]:
# Join the hourly sensor and temperature averages on an explicit key, so that
# any other column the two frames happen to share is not silently used as an
# additional join key.
merged_data = grouped_sensor_data.merge(grouped_temperature_data, on="Hour")
merged_data
Out[22]:
In [23]:
# Attach the hourly mean temperature to every sensor reading.
# Only the "Hour" and "Temperature" columns of the grouped frame take part in
# the join, and the key is named explicitly, so the merge cannot pick up an
# unintended shared column (e.g. "Time") as a second key.
# The columns that are not used as clustering features are then dropped.
data = (
    sensor_data
    .merge(grouped_temperature_data[["Hour", "Temperature"]], on="Hour")
    .drop(["Time", "MTU", "Cost", "Voltage"], axis=1)
)
data.head()
Out[23]:
In [24]:
from sklearn.cluster import KMeans

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection since 0.18. Fall back to the old module only
# for installations that predate that move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [25]:
# Seed NumPy's global random number generator.
np.random.seed(1234)
# Hold out 25% of the rows as the test set; random_state fixes the split.
train_data, test_data = train_test_split(data, test_size = 0.25, random_state = 42)
In [26]:
# (rows, columns) of the training split.
train_data.shape
Out[26]:
In [27]:
# (rows, columns) of the test split.
test_data.shape
Out[27]:
In [28]:
# Cluster the training rows into four groups.
# The n_jobs argument was dropped: it was deprecated in scikit-learn 0.23 and
# removed in 1.0, where passing it raises TypeError. It only controlled
# parallelism, so the clustering result is unaffected.
# NOTE(review): no random_state is passed, so the cluster numbering depends on
# the global NumPy seed set in an earlier cell, and the hard-coded
# cluster -> appliance labels further down only make sense for that numbering.
# Confirm before pinning a different seed here.
kmeans = KMeans(n_clusters=4)
kmeans_fit = kmeans.fit(train_data)
In [29]:
# Predict on exactly the feature columns the model was fitted on. A later cell
# adds a "Cluster" column to test_data; without this selection, re-running
# this cell afterwards would hand the model an extra feature and fail.
predict = kmeans_fit.predict(test_data[train_data.columns])
In [30]:
# Attach the predicted cluster id to each test row. assign() returns a new
# frame, which avoids writing a column into the slice produced by
# train_test_split (that can trigger pandas' SettingWithCopyWarning) and keeps
# the cell safe to re-run.
test_data = test_data.assign(Cluster=predict)
test_data.head(20)
Out[30]:
In [31]:
# Lookup table that gives each cluster id a human-readable appliance label.
cluster_ids = [0, 1, 2, 3]
appliance_labels = [
    "Cooling System",
    "Oven, Geyser",
    "Night Lights",
    "Home Security Systems",
]
label_df = pd.DataFrame({"Cluster": cluster_ids, "Appliances": appliance_labels})
label_df
Out[31]:
In [32]:
# Add the appliance label to every test row by joining on the column(s) the
# two frames have in common.
result = pd.merge(test_data, label_df)
result.head()
Out[32]:
In [33]:
# Last rows of the labelled test set.
result.tail()
Out[33]: