第三次实验

1. 数据挖掘:将房价网站的数据爬取下来保存成xlsx文件

import re
import numpy as np
import os
import pandas as pd
from urllib import request
import openpyxl
import xlsxwriter
import random

class Get():
    """Scrape rental listings from xa.fangjia.com and export them to ``DATA.xlsx``.

    ``cricle`` is the entry point: it downloads listing pages 1-9, parses
    every listing into a dict, prints the parsed rows and writes the
    spreadsheet.
    """

    # One dict per parsed listing, accumulated across all pages. This is a
    # class-level list, so it is shared by every instance.
    result_list = []
    # Not used by this class; kept so existing references keep working.
    rent_arry = []
    # Template URL; the page number after ``currentPage=`` is substituted per page.
    url = 'https://xa.fangjia.com/apartment/list?pager.currentPage=1#pagelist'
    # Listing title, monthly rent and attribute text (rooms / area) patterns.
    root_pattern_tit = r'<span class="tit">([\s\S]*?)</span>'
    root_pattern_em = r'<font><em>([\s\S]*?)</em>元/月</font>'
    root_pattern_attr = r'<span class="attribute">([\s\S]*?)</span>'

    ########################################################################################################################

    def __catch_content(self, url):
        """Download ``url`` and return the response body decoded as UTF-8."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(url) as response:
            return response.read().decode('utf-8')

    @staticmethod
    def __first_group(pattern, text, default='0'):
        """Return group 1 of the first match of ``pattern`` in ``text``, else ``default``."""
        found = re.search(pattern, text)
        return found.group(1) if found else default

    def __analysis(self, html):
        """Parse one listing page and append a dict per listing to ``Get.result_list``.

        Each dict has the keys ``rand`` (index of the listing on the page),
        ``addr``, ``house_type_shi``, ``house_type_ting``, ``house_type_wei``,
        ``house_type_area`` and ``rent``. All values except ``rand`` are
        strings; a field that cannot be found is stored as ``'0'``.
        """
        pattern_shi = r'(\d)室'
        pattern_ting = r'室(\d)厅'
        pattern_wei = r'厅(\d)卫'
        # Accept decimals: a lazy ``\d*?`` would keep only the digits after
        # the decimal point of a value such as ``80.5㎡``.
        pattern_area = r'(\d+(?:\.\d+)?)㎡'

        result_tit = re.findall(Get.root_pattern_tit, html)
        result_em = re.findall(Get.root_pattern_em, html)
        result_attr = re.findall(Get.root_pattern_attr, html)

        # zip() stops at the shortest list, so a page where the three
        # patterns match a different number of times cannot raise IndexError.
        for rand, (addr, rent, attr) in enumerate(zip(result_tit, result_em, result_attr)):
            Get.result_list.append({
                'rand': rand,
                'addr': addr,
                'house_type_shi': self.__first_group(pattern_shi, attr),
                'house_type_ting': self.__first_group(pattern_ting, attr),
                'house_type_wei': self.__first_group(pattern_wei, attr),
                'house_type_area': self.__first_group(pattern_area, attr),
                'rent': rent,
            })

    def __show(self, result):
        """Print every parsed listing, one field per line."""
        for item in result:
            print(item['addr'])
            print(item['house_type_ting'])
            print(item['house_type_shi'])
            print(item['house_type_wei'])
            print(item['house_type_area'])
            print(item['rent'])
            print('#####################')

    def __importExcel(self, result):
        """Write ``result`` to ``DATA.xlsx`` and pad the sheet to 10000 data rows.

        Row 0 holds the headers and rows ``1..len(result)`` hold the scraped
        listings. The remaining rows up to row 10000 are filled with randomly
        re-sampled listings. Nothing is padded when ``result`` is empty.
        """
        total_rows = 10000
        # Dict keys in spreadsheet column order (matches the headers below).
        fields = ('addr', 'house_type_ting', 'house_type_shi',
                  'house_type_wei', 'house_type_area', 'rent')
        headers = ('地址', '户型-厅数', '户型-室数', '户型-卫数', '面积', '租金')

        workbook = xlsxwriter.Workbook('DATA.xlsx')
        worksheet = workbook.add_worksheet('sheet1')
        for col, title in enumerate(headers):
            worksheet.write(0, col, title)
        worksheet.set_column('A:A', 50)

        for row, item in enumerate(result, start=1):
            for col, key in enumerate(fields):
                worksheet.write(row, col, item[key])

        # Padding starts on the row right after the real data so that no
        # blank row is left between the scraped and the re-sampled listings.
        if result:
            for row in range(len(result) + 1, total_rows + 1):
                item = random.choice(result)
                for col, key in enumerate(fields):
                    worksheet.write(row, col, item[key])

        workbook.close()
        print('##########################################################')
        print('写入成功')

    #############################################################################################################
    def cricle(self):
        """Fetch listing pages 1-9, parse them, print the rows and write the spreadsheet."""
        for page in range(1, 10):
            # Replace only the page number rather than every digit in the URL.
            url = re.sub(r'currentPage=\d+', 'currentPage=%d' % page, Get.url)
            html = self.__catch_content(url)
            self.__analysis(html)
        self.__show(Get.result_list)
        self.__importExcel(Get.result_list)


#############################################################################################################

# Script entry: fetch the listing pages, print the parsed rows and write DATA.xlsx.
get = Get()
get.cricle()

xlsx文件截图

在这里插入图片描述

2. K-MEANS聚类算法

import matplotlib.pyplot as plt
import kmeans
import random
import xlrd


class Ju():
    """Read the area and rent columns of ``DATA.xlsx`` and cluster them into 3 groups.

    ``julei`` is the entry point. The three lists below are class-level
    attributes, so they are shared by every instance.
    """

    result = []     # One list of six cell values per data row of the sheet.
    list_four = []  # Column 4 of every row (area) as floats.
    list_five = []  # Column 5 of every row (rent) as floats.

    def __getData(self):
        """Append every data row (header excluded) of ``DATA.xlsx`` to ``self.result``."""
        # NOTE(review): xlrd 2.0+ only reads .xls files; opening an .xlsx
        # workbook presumably needs xlrd < 2.0 — confirm the installed version.
        workbook = xlrd.open_workbook('DATA.xlsx')
        sheet = workbook.sheet_by_index(0)
        # Use the real row count so a sheet with fewer than 10000 data rows
        # does not raise IndexError.
        for i in range(1, sheet.nrows):
            self.result.append([sheet.cell_value(i, col) for col in range(6)])
        print('从Excel中读取数据成功')

    @staticmethod
    def __toFloat(value):
        """Convert a cell value to float, treating an empty cell as 0."""
        return float(0 if value == '' else value)

    def __fillColumns(self):
        """Copy columns 4 and 5 of ``self.result`` into ``list_four`` / ``list_five``."""
        for row in self.result:
            self.list_four.append(self.__toFloat(row[4]))
            # Column 5 is the rent; it must not be filled from column 4.
            self.list_five.append(self.__toFloat(row[5]))

    def __getList(self):
        """Load the workbook and extract the two numeric columns used for clustering."""
        self.__getData()
        self.__fillColumns()

    def julei(self):
        """Load the data, run 3-cluster k-means on (rent, area) and print the results."""
        self.__getList()
        x2 = self.list_four  # y coordinates (area)
        print(len(x2))

        x1 = self.list_five  # x coordinates (rent)
        print(len(x1))

        plt.figure(figsize=(8, 6))

        # Three clusters are requested, so each list below has three entries.
        colors = ['b', 'g', 'r']  # colour per cluster
        shapes = ['o', 's', 'D']  # marker shape per cluster
        labels = ['A', 'B', 'C']  # legend label per cluster
        # NOTE(review): kmeans is a local module not shown here; presumably it
        # fits the model and draws the scatter plot — verify against kmeans.py.
        kmeans_model, x1_result, x2_result = kmeans.kmeans_building(x1, x2, 3, labels, colors, shapes)
        print(kmeans_model)
        print(x1_result)
        print(x2_result)

# Script entry: read DATA.xlsx and run the clustering.
ju = Ju()
ju.julei();

结果

在这里插入图片描述

3. 决策树

数据集 lenses.txt(每行一条记录,字段以制表符分隔,依次为 age、prescript、astigmatic、tearRate 四个特征,最后一列为隐形眼镜类别):

young	myope	no	reduced	no lenses
young	myope	no	normal	soft
young	myope	yes	reduced	no lenses
young	myope	yes	normal	hard
young	hyper	no	reduced	no lenses
young	hyper	no	normal	soft
young	hyper	yes	reduced	no lenses
young	hyper	yes	normal	hard
pre	myope	no	reduced	no lenses
pre	myope	no	normal	soft
pre	myope	yes	reduced	no lenses
pre	myope	yes	normal	hard
pre	hyper	no	reduced	no lenses
pre	hyper	no	normal	soft
pre	hyper	yes	reduced	no lenses
pre	hyper	yes	normal	no lenses
presbyopic	myope	no	reduced	no lenses
presbyopic	myope	no	normal	no lenses
presbyopic	myope	yes	reduced	no lenses
presbyopic	myope	yes	normal	hard
presbyopic	hyper	no	reduced	no lenses
presbyopic	hyper	no	normal	soft
presbyopic	hyper	yes	reduced	no lenses
presbyopic	hyper	yes	normal	no lenses

生成决策树的代码

# -*- coding: UTF-8 -*-
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.externals.six import StringIO
from sklearn import tree
import pandas as pd
import numpy as np
import pydotplus

if __name__ == '__main__':
    # Load the tab-separated data set. Blank lines are skipped so that an
    # empty trailing line cannot yield a record with missing columns.
    with open('lenses.txt', 'r') as fr:
        lenses = [line.strip().split('\t') for line in fr if line.strip()]
    # The last field of every record is its class label.
    lenses_target = [each[-1] for each in lenses]

    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']  # feature names
    # Column-oriented view of the features, used to build the DataFrame.
    lenses_dict = {
        label: [each[index] for each in lenses]
        for index, label in enumerate(lensesLabels)
    }
    lenses_pd = pd.DataFrame(lenses_dict)

    # Encode each string-valued feature column as integers; the encoder is
    # refitted for every column.
    le = LabelEncoder()
    for col in lenses_pd.columns:
        lenses_pd[col] = le.fit_transform(lenses_pd[col])

    clf = tree.DecisionTreeClassifier(max_depth=4)
    clf = clf.fit(lenses_pd.values.tolist(), lenses_target)  # build the tree

    # out_file=None makes export_graphviz return the DOT source as a string,
    # so no intermediate StringIO buffer is needed.
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=list(lenses_pd.columns),
                                    class_names=list(clf.classes_),
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("tree.pdf")  # save the rendered tree as a PDF

    print(clf.predict([[1, 0, 0, 0]]))  # predict the class of one encoded sample

生成的决策树

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值