今天做了一个小任务:使用回归分析预测北京 PM2.5 的浓度。
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression # 用于线性回归
from sklearn.model_selection import train_test_split
# Load the training set. The raw-string prefix keeps every Windows backslash
# literal, so the path no longer relies on invalid escape sequences such as
# "\g" and "\p" happening to pass through unchanged.
# NOTE(review): engine='python' is kept from the original call; presumably it
# is there to cope with the non-ASCII path -- confirm before removing.
df = pd.read_csv(r'D:\game\pm2.5\北京PM2.5浓度回归分析训练赛\pm25_train.csv', engine='python')
查看数据发现日期是字符串类型,例如“2019-01-01”。第一步要把字符串类型的日期拆分成年、月、日、周几,然后再把这 4 个特征添加到数据表中。
def data_format(dt: str) -> list:
    """Split a ``YYYY-MM-DD`` date string into its calendar components.

    Args:
        dt: Date string in ``%Y-%m-%d`` format, e.g. ``"2019-01-01"``.

    Returns:
        A list ``[year, month, day_of_month, weekday]``. The weekday is
        ``time.struct_time.tm_wday``, i.e. Monday is 0 and Sunday is 6.

    Raises:
        ValueError: If ``dt`` does not match the ``%Y-%m-%d`` format.
    """
    parsed = time.strptime(dt, '%Y-%m-%d')
    return [parsed.tm_year, parsed.tm_mon, parsed.tm_mday, parsed.tm_wday]
# Expand every entry of the 'date' column into [year, month, day, weekday]
# using data_format, then collect the rows into a separate feature table.
date = df['date'].tolist()
jieguo = [data_format(dt=dt) for dt in date]
# Passing the column names to the constructor (instead of assigning
# df_time.columns afterwards) also yields a well-formed four-column frame
# when there are no rows, where the later assignment would raise ValueError.
df_time = pd.DataFrame(jieguo, columns=['year', 'mon', 'day', 'week'])
df_data =