第三次实验

最新推荐文章于 2021-03-30 19:47:46 发布

明天请叫我早点起床

最新推荐文章于 2021-03-30 19:47:46 发布

阅读量248

点赞数

分类专栏：算法

本文链接：https://blog.csdn.net/weixin_44346035/article/details/102469036

版权

算法专栏收录该内容

2 篇文章 0 订阅

订阅专栏

1. 数据挖掘：将房价网站的数据爬取下来保存成xlsx文件

import re
import numpy as np
import os
import pandas as pd
from urllib import request
import openpyxl
import xlsxwriter
import random

class Get():
    """Scrape rental listings from the fangjia apartment list and export them.

    ``cricle`` is the entry point: it downloads listing pages 1-9, parses each
    page with regular expressions, prints every parsed record and finally
    writes the records to ``DATA.xlsx``.
    """

    # Parsed records accumulate here. The list lives on the class, so it is
    # shared by every instance of Get.
    result_list = []
    rent_arry = []
    url = 'https://xa.fangjia.com/apartment/list?pager.currentPage=1#pagelist'
    # Raw strings keep the regex escapes (\s, \S) away from Python's own
    # string-escape processing.
    root_pattern_tit = r'<span class="tit">([\s\S]*?)</span>'
    root_pattern_em = r'<font><em>([\s\S]*?)</em>元/月</font>'
    root_pattern_attr = r'<span class="attribute">([\s\S]*?)</span>'

    # Record keys in the order of the spreadsheet columns A..F.
    __columns = ('addr', 'house_type_ting', 'house_type_shi',
                 'house_type_wei', 'house_type_area', 'rent')

    def __catch_content(self, url):
        """Download ``url`` and return the response body decoded as UTF-8."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(url) as response:
            return response.read().decode('utf-8')

    @staticmethod
    def __first_match(pattern, text, default='0'):
        """Return group 1 of the first match of ``pattern``, else ``default``."""
        found = re.search(pattern, text)
        return found.group(1) if found else default

    def __analysis(self, html):
        """Parse one listing page and append its records to ``Get.result_list``.

        Each record is a dict with the keys ``rand`` (index of the listing on
        the page), ``addr``, ``house_type_shi``, ``house_type_ting``,
        ``house_type_wei``, ``house_type_area`` and ``rent``. All extracted
        values are strings; a room count or area that cannot be found in the
        attribute text is stored as ``'0'``.
        """
        pattern_shi = r'([\d])室'
        pattern_ting = r'室([\d])厅'
        pattern_wei = r'厅([\d])卫'
        pattern_area = r'([\d]*?)㎡'

        titles = re.findall(Get.root_pattern_tit, html)
        rents = re.findall(Get.root_pattern_em, html)
        attributes = re.findall(Get.root_pattern_attr, html)

        # zip() stops at the shortest list, so a page on which the three
        # patterns matched a different number of times no longer raises
        # IndexError.
        for rand, (addr, attr, rent) in enumerate(zip(titles, attributes, rents)):
            Get.result_list.append({
                'rand': rand,
                'addr': addr,
                'house_type_shi': Get.__first_match(pattern_shi, attr),
                'house_type_ting': Get.__first_match(pattern_ting, attr),
                'house_type_wei': Get.__first_match(pattern_wei, attr),
                'house_type_area': Get.__first_match(pattern_area, attr),
                'rent': rent,
            })

    def __show(self, result):
        """Print every record in ``result`` field by field."""
        for record in result:
            print(record['addr'])
            print(record['house_type_ting'])
            print(record['house_type_shi'])
            print(record['house_type_wei'])
            print(record['house_type_area'])
            print(record['rent'])
            print('#####################')

    def __write_row(self, worksheet, row, record):
        """Write one record into sheet row ``row`` (0-based), columns A..F."""
        for col, key in enumerate(Get.__columns):
            worksheet.write(row, col, record[key])

    def __importExcel(self, result, total_rows=10000):
        """Write ``result`` to ``DATA.xlsx`` and pad the sheet with resamples.

        Row 0 holds the header. The records follow from row 1, and the sheet
        is then padded up to ``total_rows`` data rows with records drawn at
        random from ``result``. Padding starts on the row directly after the
        last real record, so no blank row is left in between, and it is
        skipped when ``result`` is empty because there is nothing to sample.
        """
        workbook = xlsxwriter.Workbook('DATA.xlsx')
        worksheet = workbook.add_worksheet('sheet1')
        headers = ('地址', '户型-厅数', '户型-室数', '户型-卫数', '面积', '租金')
        for col, title in enumerate(headers):
            worksheet.write(0, col, title)
        worksheet.set_column('A:A', 50)

        for row, record in enumerate(result, start=1):
            self.__write_row(worksheet, row, record)

        if result:
            for row in range(len(result) + 1, total_rows + 1):
                self.__write_row(worksheet, row, random.choice(result))

        workbook.close()
        print('##########################################################')
        print('写入成功')

    def cricle(self):
        """Fetch pages 1-9, parse them, print the records and export them."""
        for page in range(1, 10):
            # Replace only the page-number parameter instead of every digit
            # that happens to appear in the URL.
            url = re.sub(r'(currentPage=)\d+', r'\g<1>' + str(page), Get.url)
            html = self.__catch_content(url)
            self.__analysis(html)
        self.__show(Get.result_list)
        self.__importExcel(Get.result_list)


#############################################################################################################

# Run the scraper: fetch the listing pages, print the parsed records and
# write them to DATA.xlsx.
get = Get()
get.cricle()

xlsx文件截图

在这里插入图片描述

2. K-MEANS聚类算法

import matplotlib.pyplot as plt
import kmeans
import random
import xlrd


class Ju():
    """Cluster the listings stored in ``DATA.xlsx`` by area and rent.

    ``julei`` is the entry point: it loads the spreadsheet, extracts the area
    and rent columns as floats and passes them to ``kmeans.kmeans_building``
    with three clusters.
    """

    # These lists live on the class, so they are shared by every instance.
    result = []      # raw spreadsheet rows, six cells each
    list_four = []   # column 4 of every row as float
    list_five = []   # column 5 of every row as float

    def __getData(self):
        """Append every data row of ``DATA.xlsx`` (first sheet) to ``result``."""
        workbook = xlrd.open_workbook('DATA.xlsx')
        sheet = workbook.sheet_by_index(0)
        # Row 0 is skipped. Iterating up to sheet.nrows instead of a
        # hard-coded 10001 avoids an IndexError on shorter sheets.
        for row in range(1, sheet.nrows):
            self.result.append([sheet.cell_value(row, col) for col in range(6)])
        print('从Excel中读取数据成功')

    @staticmethod
    def __to_float(value):
        """Convert a cell value to float, treating an empty cell as 0."""
        return float(0 if value == '' else value)

    def __fillColumns(self):
        """Copy columns 4 and 5 of ``result`` into ``list_four``/``list_five``."""
        for row in self.result:
            self.list_four.append(self.__to_float(row[4]))
            # The original appended float(four) here, so list_five was just
            # a copy of list_four and column 5 was never used.
            self.list_five.append(self.__to_float(row[5]))

    def __getList(self):
        """Load the spreadsheet and extract the two numeric columns."""
        self.__getData()
        self.__fillColumns()

    def julei(self):
        """Run k-means with three clusters and print the returned values."""
        self.__getList()
        x2 = self.list_four  # y coordinates
        print(len(x2))

        x1 = self.list_five  # x coordinates
        print(len(x1))

        plt.figure(figsize=(8, 6))

        # Three clusters are requested, so each list has three entries.
        colors = ['b', 'g', 'r']  # one colour per cluster
        shapes = ['o', 's', 'D']  # one marker shape per cluster
        labels = ['A', 'B', 'C']  # one legend label per cluster
        kmeans_model, x1_result, x2_result = kmeans.kmeans_building(x1, x2, 3, labels, colors, shapes)
        print(kmeans_model)
        print(x1_result)
        print(x2_result)

# Load DATA.xlsx and run the three-cluster k-means on it.
ju = Ju()
ju.julei()

结果

在这里插入图片描述

3. 决策树

young	myope	no	reduced	no lenses
young	myope	no	normal	soft
young	myope	yes	reduced	no lenses
young	myope	yes	normal	hard
young	hyper	no	reduced	no lenses
young	hyper	no	normal	soft
young	hyper	yes	reduced	no lenses
young	hyper	yes	normal	hard
pre	myope	no	reduced	no lenses
pre	myope	no	normal	soft
pre	myope	yes	reduced	no lenses
pre	myope	yes	normal	hard
pre	hyper	no	reduced	no lenses
pre	hyper	no	normal	soft
pre	hyper	yes	reduced	no lenses
pre	hyper	yes	normal	no lenses
presbyopic	myope	no	reduced	no lenses
presbyopic	myope	no	normal	no lenses
presbyopic	myope	yes	reduced	no lenses
presbyopic	myope	yes	normal	hard
presbyopic	hyper	no	reduced	no lenses
presbyopic	hyper	no	normal	soft
presbyopic	hyper	yes	reduced	no lenses
presbyopic	hyper	yes	normal	no lenses

生成决策树的代码

# -*- coding: UTF-8 -*-
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.externals.six import StringIO
from sklearn import tree
import pandas as pd
import numpy as np
import pydotplus

if __name__ == '__main__':
    # Load the tab-separated data file: one sample per line, the last field
    # is the class label. Blank lines are skipped because they would yield a
    # single empty field and break the column indexing below.
    with open('lenses.txt', 'r') as fr:
        lenses = [line.strip().split('\t') for line in fr if line.strip()]

    # Class label of every sample.
    lenses_target = [row[-1] for row in lenses]

    # Feature names, in the same order as the leading fields of each line.
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    # One list of raw values per feature, used to build the DataFrame.
    lenses_dict = {
        label: [row[index] for row in lenses]
        for index, label in enumerate(lensesLabels)
    }
    lenses_pd = pd.DataFrame(lenses_dict)

    # Replace the string categories of every column with integer codes.
    le = LabelEncoder()
    for col in lenses_pd.columns:
        lenses_pd[col] = le.fit_transform(lenses_pd[col])

    clf = tree.DecisionTreeClassifier(max_depth=4)
    clf = clf.fit(lenses_pd.values.tolist(), lenses_target)

    # With out_file=None export_graphviz returns the DOT source as a string,
    # so no StringIO buffer is needed.
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=list(lenses_pd.columns),
                                    class_names=list(clf.classes_),
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("tree.pdf")  # save the rendered tree as a PDF

    # Predict the class of one sample given as encoded feature values, in
    # the order of lensesLabels.
    print(clf.predict([[1, 0, 0, 0]]))

生成的决策树

在这里插入图片描述

明天请叫我早点起床

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
第三次实验

1. 数据挖掘：将房价网站的数据爬取下来保存成xlsx文件。 import re; import numpy as np; import os; import pandas as pd; from urllib import request; import openpyxl; import xlsxwriter; import random; class Get(): result_list = [] ...
复制链接

扫一扫

专栏目录