1. 数据挖掘:将房价网站的数据爬取下来保存成xlsx文件
import re
import numpy as np
import os
import pandas as pd
from urllib import request
import openpyxl
import xlsxwriter
import random
class Get():
    """Scrape rental listings and export them to ``DATA.xlsx``.

    ``cricle`` is the only public entry point: it downloads listing pages
    1-9, parses every listing with regular expressions, prints the parsed
    records and writes them to an Excel workbook.
    """

    # Parsed listings. Shared at class level: every parsed page appends here.
    result_list = []
    # Not used by any method of this class; kept for backward compatibility.
    rent_arry = []
    url = 'https://xa.fangjia.com/apartment/list?pager.currentPage=1#pagelist'
    root_pattern_tit = r'<span class="tit">([\s\S]*?)</span>'
    root_pattern_em = r'<font><em>([\s\S]*?)</em>元/月</font>'
    root_pattern_attr = r'<span class="attribute">([\s\S]*?)</span>'
    # Number of data rows the exported sheet is padded up to.
    total_rows = 10000
    # (sheet header, record key) for each exported column, in column order.
    _COLUMNS = (
        ('地址', 'addr'),
        ('户型-厅数', 'house_type_ting'),
        ('户型-室数', 'house_type_shi'),
        ('户型-卫数', 'house_type_wei'),
        ('面积', 'house_type_area'),
        ('租金', 'rent'),
    )

    def __catch_content(self, url):
        """Download ``url`` and return its body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.

        Returns:
            The page content as ``str``.
        """
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(url) as response:
            return str(response.read(), encoding='utf-8')

    @staticmethod
    def __first_or_zero(pattern, text):
        """Return the first match of ``pattern`` in ``text``, or ``'0'`` if none."""
        matches = re.findall(pattern, text)
        return matches[0] if matches else '0'

    def __analysis(self, html):
        """Parse one listing page and append its records to ``Get.result_list``.

        Each record is a dict with the keys ``rand`` (index of the listing on
        its page), ``addr``, ``house_type_shi``, ``house_type_ting``,
        ``house_type_wei``, ``house_type_area`` and ``rent``. Any layout or
        area field missing from the attribute text is recorded as ``'0'``.

        Args:
            html: Page source as returned by ``__catch_content``.
        """
        pattern_shi = r'([\d])室'
        pattern_ting = r'室([\d])厅'
        pattern_wei = r'厅([\d])卫'
        pattern_area = r'([\d]*?)㎡'
        titles = re.findall(Get.root_pattern_tit, html)
        rents = re.findall(Get.root_pattern_em, html)
        attributes = re.findall(Get.root_pattern_attr, html)
        # zip() stops at the shortest list, so a page whose three patterns
        # matched a different number of times no longer raises IndexError.
        for index, (title, attr, rent) in enumerate(zip(titles, attributes, rents)):
            Get.result_list.append({
                'rand': index,
                'addr': title,
                'house_type_shi': self.__first_or_zero(pattern_shi, attr),
                'house_type_ting': self.__first_or_zero(pattern_ting, attr),
                'house_type_wei': self.__first_or_zero(pattern_wei, attr),
                'house_type_area': self.__first_or_zero(pattern_area, attr),
                'rent': rent,
            })

    def __show(self, result):
        """Print every record in ``result``, one field per line.

        Args:
            result: List of record dicts as built by ``__analysis``.
        """
        for item in result:
            for _, key in Get._COLUMNS:
                print(item[key])
            print('#####################')

    def __importExcel(self, result):
        """Write ``result`` to sheet ``sheet1`` of ``DATA.xlsx``.

        Row 0 holds the headers and rows ``1..len(result)`` hold the records.
        The remaining rows up to ``Get.total_rows`` are filled with randomly
        re-sampled copies of those records.

        Args:
            result: List of record dicts as built by ``__analysis``.
        """
        workbook = xlsxwriter.Workbook('DATA.xlsx')
        worksheet = workbook.add_worksheet('sheet1')
        for col, (header, _) in enumerate(Get._COLUMNS):
            worksheet.write(0, col, header)
        worksheet.set_column('A:A', 50)

        def write_row(row, item):
            for col, (_, key) in enumerate(Get._COLUMNS):
                worksheet.write(row, col, item[key])

        for row, item in enumerate(result, start=1):
            write_row(row, item)
        # Padding starts directly after the last real record so no blank row
        # is left in between, and is skipped when there is nothing to sample.
        if result:
            for row in range(len(result) + 1, Get.total_rows + 1):
                write_row(row, random.choice(result))
        workbook.close()
        print('##########################################################')
        print('写入成功')

    def cricle(self):
        """Scrape pages 1-9, then print the records and export them."""
        for page in range(1, 10):
            # Replace only the page number, not every digit found in the URL.
            url = re.sub(r'(currentPage=)\d+',
                         lambda match: match.group(1) + str(page),
                         Get.url)
            html = self.__catch_content(url)
            self.__analysis(html)
        self.__show(Get.result_list)
        self.__importExcel(Get.result_list)
# Run the scraper: fetch the listing pages, print the parsed records and
# write them to DATA.xlsx.
get = Get()
get.cricle()
xlsx文件截图
2. K-MEANS聚类算法
import matplotlib.pyplot as plt
import kmeans
import random
import xlrd
class Ju():
    """Cluster the listings stored in ``DATA.xlsx`` with k-means.

    ``julei`` is the only public entry point: it loads the workbook, takes
    columns 4 and 5 of every row as the two coordinates and passes them to
    ``kmeans.kmeans_building`` with three clusters.
    """

    # Rows read from the workbook, six cell values each (class-level, shared).
    result = []
    # Column 4 of every row, as float.
    list_four = []
    # Column 5 of every row, as float.
    list_five = []

    def __getData(self):
        """Read up to 10000 data rows of the first sheet into ``self.result``.

        Row 0 is skipped because it holds the column headers.
        """
        # NOTE(review): xlrd 2.0+ only opens legacy .xls files; reading
        # DATA.xlsx presumably requires xlrd < 2.0 - confirm the pinned version.
        workbook = xlrd.open_workbook('DATA.xlsx')
        sheet = workbook.sheet_by_index(0)
        # Stop at the real end of the sheet so a shorter workbook does not
        # raise IndexError.
        last_row = min(sheet.nrows, 10001)
        for row in range(1, last_row):
            self.result.append([sheet.cell_value(row, col) for col in range(6)])
        print('从Excel中读取数据成功')

    @staticmethod
    def __toFloat(value):
        """Convert a cell value to float, treating an empty cell as 0."""
        return float(0 if value == '' else value)

    def __fillLists(self):
        """Append columns 4 and 5 of every loaded row to the coordinate lists."""
        for row in self.result:
            self.list_four.append(self.__toFloat(row[4]))
            # Column 5 gets its own value; previously column 4 was appended
            # here as well, so both axes held identical data.
            self.list_five.append(self.__toFloat(row[5]))

    # Extract the columns of interest as lists.
    def __getList(self):
        """Load the workbook and fill ``list_four`` and ``list_five``."""
        self.__getData()
        self.__fillLists()

    def julei(self):
        """Load the data, run 3-cluster k-means and print what it returns."""
        self.__getList()
        x2 = self.list_four  # y coordinates
        print(x2.__len__())
        x1 = self.list_five  # x coordinates
        print(x1.__len__())
        plt.figure(figsize=(8, 6))
        # One entry per cluster in each of the three lists below.
        colors = ['b', 'g', 'r']  # plot colours
        shapes = ['o', 's', 'D']  # marker shapes
        labels = ['A', 'B', 'C']  # cluster names used as plot labels
        # The 3 is the number of clusters requested.
        kmeans_model, x1_result, x2_result = kmeans.kmeans_building(
            x1, x2, 3, labels, colors, shapes)
        print(kmeans_model)
        print(x1_result)
        print(x2_result)
# Run the clustering: load DATA.xlsx and cluster its last two columns.
ju = Ju()
ju.julei()
结果
3. 决策树
young myope no reduced no lenses
young myope no normal soft
young myope yes reduced no lenses
young myope yes normal hard
young hyper no reduced no lenses
young hyper no normal soft
young hyper yes reduced no lenses
young hyper yes normal hard
pre myope no reduced no lenses
pre myope no normal soft
pre myope yes reduced no lenses
pre myope yes normal hard
pre hyper no reduced no lenses
pre hyper no normal soft
pre hyper yes reduced no lenses
pre hyper yes normal no lenses
presbyopic myope no reduced no lenses
presbyopic myope no normal no lenses
presbyopic myope yes reduced no lenses
presbyopic myope yes normal hard
presbyopic hyper no reduced no lenses
presbyopic hyper no normal soft
presbyopic hyper yes reduced no lenses
presbyopic hyper yes normal no lenses
生成决策树的代码
# -*- coding: UTF-8 -*-
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.externals.six import StringIO
from sklearn import tree
import pandas as pd
import numpy as np
import pydotplus
def parse_lenses(lines):
    """Split tab-separated records into lists of fields.

    Args:
        lines: Iterable of text lines, e.g. an open file.

    Returns:
        One list of fields per non-blank line. Blank lines are skipped so a
        trailing newline in the file does not produce an empty record.
    """
    return [line.strip().split('\t') for line in lines if line.strip()]


def build_feature_frame(lenses, labels):
    """Build a DataFrame holding one column per feature label.

    Args:
        lenses: Records as returned by ``parse_lenses``; field ``i`` of each
            record belongs to ``labels[i]``.
        labels: Feature names, in the order of the leading record fields.

    Returns:
        A ``pandas.DataFrame`` whose columns follow the order of ``labels``.
    """
    lenses_dict = {
        label: [record[index] for record in lenses]
        for index, label in enumerate(labels)
    }
    # Passing columns= pins the column order to the label order.
    return pd.DataFrame(lenses_dict, columns=labels)


if __name__ == '__main__':
    with open('lenses.txt', 'r') as fr:  # load the data file
        lenses = parse_lenses(fr)
    # The last field of every record is its class.
    lenses_target = [each[-1] for each in lenses]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']  # feature labels
    lenses_pd = build_feature_frame(lenses, lensesLabels)
    le = LabelEncoder()  # encodes the string categories as integers
    for col in lenses_pd.columns:
        lenses_pd[col] = le.fit_transform(lenses_pd[col])
    clf = tree.DecisionTreeClassifier(max_depth=4)
    clf = clf.fit(lenses_pd.values.tolist(), lenses_target)  # build the tree
    dot_data = StringIO()
    # Plain lists are passed for the names because some scikit-learn releases
    # reject a pandas Index / numpy array for these parameters.
    tree.export_graphviz(clf, out_file=dot_data,  # render the tree as DOT
                         feature_names=list(lenses_pd.columns),
                         class_names=clf.classes_.tolist(),
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("tree.pdf")  # save the rendered tree as a PDF
    print(clf.predict([[1, 0, 0, 0]]))  # predict one encoded sample
生成的决策树