import matplotlib.pylab as plt
import numpy as np
import seaborn as sns
import pandas as pd
plt.rcParams['font.sans-serif'] = ['SimHei'] #解决中文显示
plt.rcParams['axes.unicode_minus'] = False #解决符号无法显示
sns.set(font='SimHei')
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('长沙租房数据.csv') # 导入数据
df.head() # 查看数据前五行
df.shape # 查看数据大小
df.info() # 查看数据基本信息
df.describe().T # 查看数据描述性统计
df.isnull().sum() # 统计缺失值情况
df.drop(['距地铁距离','楼层'],axis=1,inplace=True)
df.dropna(inplace=True)
df['房屋租金'] = df['房屋租金'].apply(lambda x:int(x.split('元')[0]))
df['房屋租金']
df['房屋面积'] = df['房屋面积'].apply(lambda x:int(x[:-2]))
df['房屋面积']
sns.boxplot(data=df,x='房屋租金')
plt.show()
sns.histplot(data=df,x='房屋租金',kde=True)
plt.show()
sns.boxplot(data=df,y='房屋面积')
plt.show()
sns.histplot(data=df,x='房屋面积',kde=True)
plt.show()
plt.scatter(x=df['房屋面积'],y=df['房屋租金'])
plt.show()
sns.countplot(data=df,x='交付方式')
plt.show()
df['出租方式'].value_counts().plot(kind='pie',autopct='%.2f%%')
plt.show()
sns.boxplot(data=df,y='房屋租金',x='交付方式')
plt.show()
sns.boxplot(data=df,y='房屋租金',x='出租方式')
plt.show()
df['房屋朝向'].value_counts().plot(kind='pie',autopct='%.2f%%')
plt.show()
sns.barplot(data=df,x='房屋朝向',y='房屋租金')
plt.show()
df['房屋装修'].value_counts().plot(kind='pie',autopct='%.2f%%')
plt.show()
sns.barplot(data=df,x='房屋装修',y='房屋租金')
plt.show()
# 相关性分析
sns.heatmap(df.corr(),vmax=1,annot=True,linewidths=0.5,cbar=False,cmap='YlGnBu',annot_kws={'fontsize':18})
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('各个因素之间的相关系数',fontsize=20)
plt.show()
import jieba
import collections
import re
import stylecloud
from PIL import Image
def draw_WorldCloud(df,pic_name,color='black'):
data = ''.join([item for item in df])
# 文本预处理 :去除一些无用的字符只提取出中文出来
new_data = re.findall('[\u4e00-\u9fa5]+', data, re.S)
new_data = "".join(new_data)
# 文本分词
seg_list_exact = jieba.cut(new_data, cut_all=True)
result_list = []
with open('停用词库.txt', encoding='utf-8') as f: #可根据需要打开停用词库,然后加上不想显示的词语
con = f.readlines()
stop_words = set()
for i in con:
i = i.replace("\n", "") # 去掉读取每一行数据的\n
stop_words.add(i)
for word in seg_list_exact:
if word not in stop_words and len(word) > 1:
result_list.append(word)
word_counts = collections.Counter(result_list)
# 词频统计:获取前100最高频的词
word_counts_top = word_counts.most_common(100)
print(word_counts_top)
# 绘制词云图
stylecloud.gen_stylecloud(text=' '.join(result_list[:500]), # 提取500个词进行绘图
collocations=False, # 是否包括两个单词的搭配(二字组)
font_path=r'C:\Windows\Fonts\msyh.ttc', #设置字体,参考位置为 C:\Windows\Fonts\ ,根据里面的字体编号来设置
size=800, # stylecloud 的大小
palette='cartocolors.qualitative.Bold_7', # 调色板,调色网址: https://jiffyclub.github.io/palettable/
background_color=color, # 背景颜色
icon_name='fas fa-circle', # 形状的图标名称 蒙版网址:https://fontawesome.com/icons?d=gallery&p=2&c=chat,shopping,travel&m=free
gradient='horizontal', # 梯度方向
max_words=2000, # stylecloud 可包含的最大单词数
max_font_size=150, # stylecloud 中的最大字号
stopwords=True, # 布尔值,用于筛除常见禁用词
output_name=f'{pic_name}.png') # 输出图片
# 打开图片展示
img=Image.open(f'{pic_name}.png')
img.show()
draw_WorldCloud(df['房源亮点'],'房源亮点词云图') # 词云图可视化
draw_WorldCloud(df['配套设施'],'配套设施词云图') # 词云图可视化
# 编码处理
df['交付方式'].replace({'押一付三':0,'押一付一':1,'面议':2,'押一付二':3,'年付':4,'半年付':5},inplace=True)
df['房屋朝向'].replace({'东':0,'南':1,'西':2,'北':3,'南北':4,'东南':5,'东西':6,'西北':7,'不限':8},inplace=True)
df['房屋装修'].replace({'毛坯':0,'简装修':1,'精装修':2,'豪华装修':3},inplace=True)
# 特征筛选
new_df = df[['房屋租金', '交付方式', '房屋面积', '房屋朝向','房屋装修',]]
new_df
from sklearn.model_selection import train_test_split
# 准备数据
X = new_df.drop('房屋租金',axis=1)
y = new_df['房屋租金']
# 划分数据集
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
print('训练集大小:',X_train.shape[0])
print('测试集大小:',X_test.shape[0])
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
# 定义一个训练模型并输出模型的评估指标
def train_model(ml_model):
print("Model is: ", ml_model)
model = ml_model.fit(X_train, y_train)
print("Training score: ", model.score(X_train,y_train))
predictions = model.predict(X_test)
r2score = r2_score(y_test, predictions)
print("r2 score is: ", r2score)
print('MAE:', mean_absolute_error(y_test,predictions))
print('MSE:', mean_squared_error(y_test,predictions))
print('RMSE:', np.sqrt(mean_squared_error(y_test,predictions)))
# 真实值和预测值的差值
sns.distplot(y_test - predictions)
# 构建多元线性回归
from sklearn.linear_model import LinearRegression
lg = LinearRegression()
train_model(lg)
# 构建knn回归
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
train_model(knn)
# 构建决策树回归
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
train_model(tree)
# 构建随机森林回归
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
train_model(forest)
# GBDT回归
from sklearn.ensemble import GradientBoostingRegressor
gbdt = GradientBoostingRegressor()
train_model(gbdt)
# 构建xgboost回归模型
from xgboost import XGBRegressor
xgb = XGBRegressor()
train_model(xgb)
# 特征重要性评分
feat_labels = X_train.columns[0:]
importances = xgb.feature_importances_
indices = np.argsort(importances)[::-1]
index_list = []
value_list = []
for f,j in zip(range(X_train.shape[1]),indices):
index_list.append(feat_labels[j])
value_list.append(importances[j])
print(f + 1, feat_labels[j], importances[j])
plt.figure(figsize=(10,6))
plt.barh(index_list[::-1],value_list[::-1])
plt.yticks(fontsize=12)
plt.title('各特征重要程度排序',fontsize=14)
plt.show()
# 模型预测
y_pred = xgb.predict(X_test)
result_df = pd.DataFrame()
result_df['真实值'] = y_test
result_df['预测值'] = y_pred
result_df.head(10)
# 模型预测可视化
plt.figure(figsize=(10,6))
plt.plot(range(len(y_test))[:200],y_pred[:200],'b',label='predict')
plt.plot(range(len(y_test))[:200],y_test[:200],'r',label='test')
plt.legend(loc='upper right',fontsize=15)
plt.xlabel('the number of house',fontdict={'weight': 'normal', 'size': 15})
plt.ylabel('value of Price',fontdict={'weight': 'normal', 'size': 15})
plt.show()
import joblib
joblib.dump(xgb,'model.pkl')
new_df
x_data = pd.DataFrame(data=[['面议',141,'南','简装修']],columns=['交付方式','房屋面积','房屋朝向','房屋装修'])
x_data
# 编码处理
x_data['交付方式'].replace({'押一付三':0,'押一付一':1,'面议':2,'押一付二':3,'年付':4,'半年付':5},inplace=True)
x_data['房屋朝向'].replace({'东':0,'南':1,'西':2,'北':3,'南北':4,'东南':5,'东西':6,'西北':7,'不限':8},inplace=True)
x_data['房屋装修'].replace({'毛坯':0,'简装修':1,'精装修':2,'豪华装修':3},inplace=True)
x_data
model = joblib.load('model.pkl')
result = model.predict(x_data)[0]
result
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
- 35.
- 36.
- 37.
- 38.
- 39.
- 40.
- 41.
- 42.
- 43.
- 44.
- 45.
- 46.
- 47.
- 48.
- 49.
- 50.
- 51.
- 52.
- 53.
- 54.
- 55.
- 56.
- 57.
- 58.
- 59.
- 60.
- 61.
- 62.
- 63.
- 64.
- 65.
- 66.
- 67.
- 68.
- 69.
- 70.
- 71.
- 72.
- 73.
- 74.
- 75.
- 76.
- 77.
- 78.
- 79.
- 80.
- 81.
- 82.
- 83.
- 84.
- 85.
- 86.
- 87.
- 88.
- 89.
- 90.
- 91.
- 92.
- 93.
- 94.
- 95.
- 96.
- 97.
- 98.
- 99.
- 100.
- 101.
- 102.
- 103.
- 104.
- 105.
- 106.
- 107.
- 108.
- 109.
- 110.
- 111.
- 112.
- 113.
- 114.
- 115.
- 116.
- 117.
- 118.
- 119.
- 120.
- 121.
- 122.
- 123.
- 124.
- 125.
- 126.
- 127.
- 128.
- 129.
- 130.
- 131.
- 132.
- 133.
- 134.
- 135.
- 136.
- 137.
- 138.
- 139.
- 140.
- 141.
- 142.
- 143.
- 144.
- 145.
- 146.
- 147.
- 148.
- 149.
- 150.
- 151.
- 152.
- 153.
- 154.
- 155.
- 156.
- 157.
- 158.
- 159.
- 160.
- 161.
- 162.
- 163.
- 164.
- 165.
- 166.
- 167.
- 168.
- 169.
- 170.
- 171.
- 172.
- 173.
- 174.
- 175.
- 176.
- 177.
- 178.
- 179.
- 180.
- 181.
- 182.
- 183.
- 184.
- 185.
- 186.
- 187.
- 188.
- 189.
- 190.
- 191.
- 192.
- 193.
- 194.
- 195.
- 196.
- 197.