1 使用LabelEncoder()对字符串特征进行编码
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import collections
# 1 导入数据
data = pd.read_csv("weather/weather_description.csv",
index_col="datetime",
parse_dates=["datetime"])
# 1 引入
le = LabelEncoder()
# 2 构建模型,拟合,能搭建出来字符串和数字之间的一一映射关系
le.fit(["paris", "paris", "tokyo", "amsterdam"])
# 3 转换 将字符串转换为变量
le_t = le.transform(["paris", "paris", "tokyo", "amsterdam"])
print(le_t)
# 将天气描述都转换为数字形式
features = set()
for i in range(data.shape[1]):
a = set(data.iloc[:, i].value_counts().index)
features = features.union(a)
features_np = np.array(list(features))
features_pd = pd.DataFrame(features_np)
features_pd.to_csv("weather_description_features.csv")
# 对温哥华城市的数据进行转换
x = data.Vancouver
le = LabelEncoder()
le.fit(features_np)
# 处理缺失值的情况
x = x.map(lambda s: '<unknown>' if s not in le.classes_ else s)
le.classes_ = np.append(le.classes_, '<unknown>')
# 3 转换 将字符串转换为变量
le_t = le.transform(x)
print(le_t)
2 分析54种天气情况哪个天气出现的次数
重点分析下雨、下雪、晴天
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# 分析54种天气情况哪个天气出现的次数
data = pd.read_csv("weather_description_features.csv")
data.columns = ["index", "features"]
rain = "rain"
snow = "snow"
sky_clear = "clear"
rain_list = []
snow_list = []
sky_clear_list = []
for x in data.features:
if rain in x:
rain_list.append(x)
if snow in x:
snow_list.append(x)
if sky_clear in x:
sky_clear_list.append(x)
print(len(rain_list))
print(len(snow_list))
print(len(sky_clear_list))
3 计算降雪-纬度之间的关系
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# 1 导入数据
data = pd.read_csv("weather/weather_description.csv",
index_col="datetime",
parse_dates=["datetime"])
# 2.1 计算每个城市下雨的次数
city_names = []
rain_nums = []
snow_nums = []
for i in range(data.shape[1]):
city_names.append(data.iloc[:, i].name)
x = data.iloc[:, i].value_counts() # 每个天气情况出现的次数
is_rain = x.index.str.contains('rain') # 每个城市包含下雨的次数
rain_nums.append(x[is_rain].sum()) # 下雨次数求和
snow_nums.append(x[x.index.str.contains('snow')].sum())
# 2.2 降水量归一化
rain_nums_guiyi = [round((x - min(rain_nums)) / (max(rain_nums) - min(rain_nums)), 5) for x in rain_nums]
snow_nums_guiyi = [round((x - min(snow_nums)) / (max(snow_nums) - min(snow_nums)), 5) for x in snow_nums]
# 2.3 构建每个城市的降水量DataFrame格式
city_rain_snow_pd = pd.DataFrame(zip(*[city_names, rain_nums_guiyi, snow_nums_guiyi]),
columns=["City", "rain_nums_guiyi", "snow_nums_guiyi"])
# 3.1 读入城市数据
city_attr = pd.read_csv("weather/city_attributes.csv")
# 3.2 根据City字段进行拼接
city_attr_rain_snow_pd = pd.merge(city_rain_snow_pd, city_attr, on=["City"])
x = city_attr_rain_snow_pd.Longitude # 经度
y = city_attr_rain_snow_pd.Latitude # 纬度
city_name = city_attr_rain_snow_pd.City
rain_nums_guiyi = city_attr_rain_snow_pd.rain_nums_guiyi
snow_nums_guiyi = city_attr_rain_snow_pd.snow_nums_guiyi
# 4 计算纬度和降水量之间的关系,绘制图像
plt.plot(y, rain_nums_guiyi, 'ro')
plt.show()
4 计算降雪-降雨之间的关系
使用scatter函数绘制图像,并且点的大小代表降雪(降雨)量的大小。横纵坐标表示经纬度。
# 5 绘制散点图,点的大小代表降雨(雪)的次数
s1 = [x * 1000 for x in rain_nums_guiyi]
s2 = [x * 1000 for x in snow_nums_guiyi]
plt.scatter(x, y, s=s1, alpha=0.2, c='r')
plt.scatter(x, y, s=s2, alpha=0.5, c='k')
plt.legend(["rain","snow"])
for x1, y1, s1 in zip(x, y, city_name):
plt.text(x1, y1, s1)
plt.grid()
plt.show()