Task2

import numpy as np
import pandas as pd

test_data=pd.read_csv('car_testA_0110.csv',sep=' ')
train_data=pd.read_csv('car_train_0110.csv',sep=' ')

print('test.shape:',test_data.shape)
print('train.shape:',train_data.shape)
test.shape: (50000, 39)
train.shape: (250000, 40)

简略浏览数据

train_data.head()
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_14v_15v_16v_17v_18v_19v_20v_21v_22v_23
01348907342016000213.09NaN0.01.0015.0...0.0921390.00000018.763832-1.512063-1.008718-12.100623-0.9470529.0772970.5812143.945923
13066481969732008030772.097.05.01.017315.0...0.0010700.122335-5.685612-0.489963-2.223693-0.226865-0.658246-3.9496214.593618-1.145653
2340675253472002031218.0123.00.01.05012.5...0.0644100.003345-3.2957001.8164993.554439-0.6836750.9714952.625318-0.851922-1.246135
35733253822000061138.087.00.01.05415.0...0.0692310.000000-3.4055211.4978264.7826360.0391011.2276463.040629-0.801854-1.251894
42652351731742003010987.005.05.01.01313.0...0.0000990.001655-4.4754290.1241381.364567-0.319848-1.131568-3.303424-1.998466-1.279368

5 rows × 40 columns

总览数据概况

  1. describe中有每列的统计量,个数count、平均值mean、方差std、最小值min、中位数25% 50% 75% 、以
    及最大值 看这个信息主要是瞬间掌握数据的大概的范围以及每个值的异常值的判断,比如有的时候会发现
    999 9999 -1 等值这些其实都是nan的另外一种表达方式,有的时候需要注意下
  2. info 通过info来了解数据每列的type,有助于了解是否存在除了nan以外的特殊符号异常
train_data.describe()
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_14v_15v_16v_17v_18v_19v_20v_21v_22v_23
count250000.000000250000.0000002.500000e+05250000.000000250000.000000224620.000000227510.000000236487.000000250000.000000250000.000000...250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000
mean185351.79076883153.3621722.003401e+0744.9114807.7852364.5632711.6650080.780783115.52841212.577418...0.0324890.0304080.0147250.0009150.0062730.006604-0.0013740.000609-0.0040250.001834
std107121.18876372540.7999647.770250e+0450.6400817.6940101.9125152.3396460.413717196.1418283.990632...0.0387920.0493338.7791635.7710814.8809814.1247223.8036263.5553532.8647132.323680
min1.0000000.0000001.910000e+070.0000000.0000000.0000000.0000000.0000000.0000000.500000...0.0000000.000000-10.412444-15.538236-21.009214-13.989955-9.599285-11.181255-7.671327-2.350888
25%92501.75000014500.0000001.999061e+076.0000001.0000003.0000000.0000001.00000070.00000012.500000...0.0001290.000000-5.552269-0.901181-3.150385-0.478173-1.727237-3.067073-2.092178-1.402804
50%185264.50000065314.5000002.003111e+0727.0000006.0000004.0000000.0000001.000000105.00000015.000000...0.0019610.002567-3.8217700.223181-0.0585020.038427-0.995044-0.880587-1.199807-1.145588
75%278128.500000143761.2500002.008081e+0770.00000011.0000007.0000005.0000001.000000150.00000015.000000...0.0756720.0565683.5997471.2637372.8004750.5691981.5633823.2699872.7376140.044865
max370946.000000233044.0000002.019121e+07250.00000039.0000007.0000006.0000001.00000020000.00000015.000000...0.1307850.18434036.75687826.13456123.05566016.57602720.32457214.0394228.7645978.574730

8 rows × 40 columns

test_data.describe()
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_14v_15v_16v_17v_18v_19v_20v_21v_22v_23
count50000.00000050000.0000005.000000e+0450000.00000050000.00000044890.00000045598.00000047287.00000050000.00000050000.000000...50000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.000000
mean556029.05338082878.2514202.003441e+0744.9228407.7794204.5562261.6811920.781081114.11606012.555210...0.0325700.030773-0.0248190.007051-0.008488-0.0301040.014609-0.0033530.013125-0.011936
std106952.40256572292.0769367.788055e+0450.5762557.6616671.9082912.3448290.413518177.2741544.034901...0.0387790.0495218.7596635.7842994.8252614.1005613.8126673.5489442.8667742.316144
min370951.0000000.0000001.910000e+070.0000000.0000000.0000000.0000000.0000000.0000000.500000...0.0000000.000000-10.196998-15.167961-21.925773-13.682825-9.282567-11.117367-6.365723-2.394516
25%463258.50000014121.2500001.999061e+076.0000001.0000003.0000000.0000001.00000069.00000012.500000...0.0001350.000000-5.575131-0.891030-3.105073-0.481952-1.697763-3.069575-2.089326-1.402958
50%556296.00000065359.0000002.003111e+0727.0000006.0000004.0000000.0000001.000000105.00000015.000000...0.0019490.002593-3.8375720.221379-0.0818360.039376-0.971210-0.877377-1.192502-1.146398
75%648862.250000143083.7500002.008091e+0770.00000011.0000007.0000005.0000001.000000150.00000015.000000...0.0758260.0620633.5312691.2576872.7845380.5600461.5725083.2769182.772742-0.010769
max741887.000000233028.0000002.019040e+07248.00000039.0000007.0000006.0000001.00000017700.00000015.000000...0.1359000.18009136.36498626.04357222.59844116.33305120.27363311.6918517.9703038.749647

8 rows × 39 columns

train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 40 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             250000 non-null  int64  
 1   name               250000 non-null  int64  
 2   regDate            250000 non-null  int64  
 3   model              250000 non-null  float64
 4   brand              250000 non-null  int64  
 5   bodyType           224620 non-null  float64
 6   fuelType           227510 non-null  float64
 7   gearbox            236487 non-null  float64
 8   power              250000 non-null  int64  
 9   kilometer          250000 non-null  float64
 10  notRepairedDamage  201464 non-null  float64
 11  regionCode         250000 non-null  int64  
 12  seller             250000 non-null  int64  
 13  offerType          250000 non-null  int64  
 14  creatDate          250000 non-null  int64  
 15  price              250000 non-null  int64  
 16  v_0                250000 non-null  float64
 17  v_1                250000 non-null  float64
 18  v_2                250000 non-null  float64
 19  v_3                250000 non-null  float64
 20  v_4                250000 non-null  float64
 21  v_5                250000 non-null  float64
 22  v_6                250000 non-null  float64
 23  v_7                250000 non-null  float64
 24  v_8                250000 non-null  float64
 25  v_9                250000 non-null  float64
 26  v_10               250000 non-null  float64
 27  v_11               250000 non-null  float64
 28  v_12               250000 non-null  float64
 29  v_13               250000 non-null  float64
 30  v_14               250000 non-null  float64
 31  v_15               250000 non-null  float64
 32  v_16               250000 non-null  float64
 33  v_17               250000 non-null  float64
 34  v_18               250000 non-null  float64
 35  v_19               250000 non-null  float64
 36  v_20               250000 non-null  float64
 37  v_21               250000 non-null  float64
 38  v_22               250000 non-null  float64
 39  v_23               250000 non-null  float64
dtypes: float64(30), int64(10)
memory usage: 76.3 MB
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             50000 non-null  int64  
 1   name               50000 non-null  int64  
 2   regDate            50000 non-null  int64  
 3   model              50000 non-null  float64
 4   brand              50000 non-null  int64  
 5   bodyType           44890 non-null  float64
 6   fuelType           45598 non-null  float64
 7   gearbox            47287 non-null  float64
 8   power              50000 non-null  int64  
 9   kilometer          50000 non-null  float64
 10  notRepairedDamage  40372 non-null  float64
 11  regionCode         50000 non-null  int64  
 12  seller             50000 non-null  int64  
 13  offerType          50000 non-null  int64  
 14  creatDate          50000 non-null  int64  
 15  v_0                50000 non-null  float64
 16  v_1                50000 non-null  float64
 17  v_2                50000 non-null  float64
 18  v_3                50000 non-null  float64
 19  v_4                50000 non-null  float64
 20  v_5                50000 non-null  float64
 21  v_6                50000 non-null  float64
 22  v_7                50000 non-null  float64
 23  v_8                50000 non-null  float64
 24  v_9                50000 non-null  float64
 25  v_10               50000 non-null  float64
 26  v_11               50000 non-null  float64
 27  v_12               50000 non-null  float64
 28  v_13               50000 non-null  float64
 29  v_14               50000 non-null  float64
 30  v_15               50000 non-null  float64
 31  v_16               50000 non-null  float64
 32  v_17               50000 non-null  float64
 33  v_18               50000 non-null  float64
 34  v_19               50000 non-null  float64
 35  v_20               50000 non-null  float64
 36  v_21               50000 non-null  float64
 37  v_22               50000 non-null  float64
 38  v_23               50000 non-null  float64
dtypes: float64(30), int64(9)
memory usage: 14.9 MB

判断数据缺失和异常

#查看每列的存在nan情况
train_data.isnull().sum()
SaleID                   0
name                     0
regDate                  0
model                    0
brand                    0
bodyType             25380
fuelType             22490
gearbox              13513
power                    0
kilometer                0
notRepairedDamage    48536
regionCode               0
seller                   0
offerType                0
creatDate                0
price                    0
v_0                      0
v_1                      0
v_2                      0
v_3                      0
v_4                      0
v_5                      0
v_6                      0
v_7                      0
v_8                      0
v_9                      0
v_10                     0
v_11                     0
v_12                     0
v_13                     0
v_14                     0
v_15                     0
v_16                     0
v_17                     0
v_18                     0
v_19                     0
v_20                     0
v_21                     0
v_22                     0
v_23                     0
dtype: int64
test_data.isnull().sum()
SaleID                  0
name                    0
regDate                 0
model                   0
brand                   0
bodyType             5110
fuelType             4402
gearbox              2713
power                   0
kilometer               0
notRepairedDamage    9628
regionCode              0
seller                  0
offerType               0
creatDate               0
v_0                     0
v_1                     0
v_2                     0
v_3                     0
v_4                     0
v_5                     0
v_6                     0
v_7                     0
v_8                     0
v_9                     0
v_10                    0
v_11                    0
v_12                    0
v_13                    0
v_14                    0
v_15                    0
v_16                    0
v_17                    0
v_18                    0
v_19                    0
v_20                    0
v_21                    0
v_22                    0
v_23                    0
dtype: int64
nan可视化
# nan可视化
missing = train_data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x2219108f6d0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-w7gOc4BI-1618582949849)(output_13_1.png)]

通过以上两句可以很直观的了解哪些列存在 “nan”, 并可以把nan的个数打印,主要的目的在于 nan存在的个数是
否真的很大,如果很小一般选择填充,如果使用lgb等树模型可以直接空缺,让树自己去优化,但如果nan存在的
过多、可以考虑删掉

sort_values()是pandas中比较常用的排序方法,其主要涉及以下三个参数:
by : str or list of str(字符或者字符列表) 当需要按照多个列排序时,可使用列表
ascending : bool or list of bool, default True
(是否升序排序,默认为true,降序则为false。如果是列表,则需和by指定的列表数量相同,指明每一列的排序方式)
na_position : {‘first’, ‘last’}, default ‘last’。
(如果指定排序的列中有nan值,则指定nan值放在第一个还是最后一个)

import pandas as pd
import numpy as np
df = pd.read_excel(r’D:/myExcel/1.xlsx’)
df
id name score grade
0 a bog 45.0 A
1 c jiken 67.0 B
2 d bob 23.0 A
3 b jiken 34.0 B
4 f lucy NaN A
5 e tidy 75.0 B
按照score升序排列,nan值放在最后

df.sort_values(‘score’)
id name score grade
2 d bob 23.0 A
3 b jiken 34.0 B
0 a bog 45.0 A
1 c jiken 67.0 B
5 e tidy 75.0 B
4 f lucy NaN A
按照score降序排列,nan值放在最后

df.sort_values(‘score’, ascending=False)
id name score grade
5 e tidy 75.0 B
1 c jiken 67.0 B
0 a bog 45.0 A
3 b jiken 34.0 B
2 d bob 23.0 A
4 f lucy NaN A

nan值放在第一位

df.sort_values(‘score’, na_position=‘first’)
id name score grade
4 f lucy NaN A
2 d bob 23.0 A
3 b jiken 34.0 B
0 a bog 45.0 A
1 c jiken 67.0 B
5 e tidy 75.0 B

按照’name列降序排序,id升序排列’

df.sort_values(by=[‘name’, ‘id’], ascending=[False, True])
id name score grade
5 e tidy 75.0 B
4 f lucy NaN A
3 b jiken 34.0 B
1 c jiken 67.0 B
0 a bog 45.0 A
2 d bob 23.0 A

————————————————
版权声明:本文为CSDN博主「python小工具」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/weixin_45144170/article/details/106305532

# 可视化看下缺省值
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
msno.matrix(train_data.sample(250))
<matplotlib.axes._subplots.AxesSubplot at 0x221929ddbe0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-HxKDUSkH-1618582949851)(output_16_1.png)]

数据挖掘-数据预处理之缺失值可视化处理
每次处理数据时,缺失值是必须要考虑的问题。
但是手工查看每个变量的缺失值是非常麻烦的一件事情。
missingno提供了一个灵活且易于使用的缺失数据可视化和实用程序的小工具集,使您可以快速直观地总结数据集的完整性。
我们使用python来进行演练
1、首先安装程序包并加载:
pip install missingno
import missingno as msno
2、导入训练数据集
import pandas as pd
import numpy as ny
data=pd.read_csv(“model.csv”)
3、无效矩阵的数据密集显示
msno.matrix(data, labels=True)
可以快速直观地挑选出图案的数据完成

我们可以一目了然的看到每个变量的缺失情况,
变量y,X9数据是完整的,其他变量都有不同程度的缺失,
尤其是X3,X5,X7等的缺失非常严重

4、条形图
msno.bar(data)

msno.bar 是列的无效的简单可视化:

利用条形图可以更直观的看出每个变量缺失的比例和数量情况。
5、热图相关性
msno.heatmap(data)

missingno相关性热图措施无效的相关性:一个变量的存在或不存在如何强烈影响的另一个的存在:

我们看到X5与X1.1的缺失相关性为1,说明X5只要发生了缺失,那么X1.1也会缺失,
X7和X8的相关性为-1,说明X7缺失的值,那么X8没有缺失;而X7没有缺失时,X8为缺失。
6、树状图
msno.dendrogram(data)

树形图使用层次聚类算法通过它们的无效性相关性(根据二进制距离测量)将变量彼此相加。在树的每个步骤,基于哪个组合最小化剩余簇的距离来分割变量。变量集越单调,它们的总距离越接近零,并且它们的平均距离(y轴)越接近零。

总体上,图标分为两个大类,一类是数据比较完整的,一类是缺失值比较多的。
要解释此图表,要从上往下的角度阅读。
左边数据是比较完整的一类,Y和X9是完整的数据,没有缺失值,所以他们的距离为0;相对于其他变量X11也是比较完整的,距离要比其他变量小,所以先把X11加进来。其他变量以此类推。
右边是缺失值比较严重的,热图相关性里面我们看到了X5和X1.1的相关性系数为1,所以他们的距离为0,首先聚在一起;之后再跟其他进行计算距离,把距离较近的X7加进来,以此类推。
————————————————
版权声明:本文为CSDN博主「Andy_shenzl」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/Andy_shenzl/article/details/81633356

msno.bar(test_data.sample(1000))
<matplotlib.axes._subplots.AxesSubplot at 0x22193cd0940>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-d1sGxOD3-1618582949852)(output_18_1.png)]

sample(序列a,n)
功能:从序列a中随机抽取n个元素,并将n个元素生以list形式返回。
例:
from random import randint, sample
date = [randint(10,20) for _ in range(10)]
c = sample(date, 5)
print©
输出:[12, 17, 10, 12, 17]

#查看异常值检测

train_data.info()#没有异常值
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 40 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             250000 non-null  int64  
 1   name               250000 non-null  int64  
 2   regDate            250000 non-null  int64  
 3   model              250000 non-null  float64
 4   brand              250000 non-null  int64  
 5   bodyType           224620 non-null  float64
 6   fuelType           227510 non-null  float64
 7   gearbox            236487 non-null  float64
 8   power              250000 non-null  int64  
 9   kilometer          250000 non-null  float64
 10  notRepairedDamage  201464 non-null  float64
 11  regionCode         250000 non-null  int64  
 12  seller             250000 non-null  int64  
 13  offerType          250000 non-null  int64  
 14  creatDate          250000 non-null  int64  
 15  price              250000 non-null  int64  
 16  v_0                250000 non-null  float64
 17  v_1                250000 non-null  float64
 18  v_2                250000 non-null  float64
 19  v_3                250000 non-null  float64
 20  v_4                250000 non-null  float64
 21  v_5                250000 non-null  float64
 22  v_6                250000 non-null  float64
 23  v_7                250000 non-null  float64
 24  v_8                250000 non-null  float64
 25  v_9                250000 non-null  float64
 26  v_10               250000 non-null  float64
 27  v_11               250000 non-null  float64
 28  v_12               250000 non-null  float64
 29  v_13               250000 non-null  float64
 30  v_14               250000 non-null  float64
 31  v_15               250000 non-null  float64
 32  v_16               250000 non-null  float64
 33  v_17               250000 non-null  float64
 34  v_18               250000 non-null  float64
 35  v_19               250000 non-null  float64
 36  v_20               250000 non-null  float64
 37  v_21               250000 non-null  float64
 38  v_22               250000 non-null  float64
 39  v_23               250000 non-null  float64
dtypes: float64(30), int64(10)
memory usage: 76.3 MB
test_data.info()#没有异常值
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             50000 non-null  int64  
 1   name               50000 non-null  int64  
 2   regDate            50000 non-null  int64  
 3   model              50000 non-null  float64
 4   brand              50000 non-null  int64  
 5   bodyType           44890 non-null  float64
 6   fuelType           45598 non-null  float64
 7   gearbox            47287 non-null  float64
 8   power              50000 non-null  int64  
 9   kilometer          50000 non-null  float64
 10  notRepairedDamage  40372 non-null  float64
 11  regionCode         50000 non-null  int64  
 12  seller             50000 non-null  int64  
 13  offerType          50000 non-null  int64  
 14  creatDate          50000 non-null  int64  
 15  v_0                50000 non-null  float64
 16  v_1                50000 non-null  float64
 17  v_2                50000 non-null  float64
 18  v_3                50000 non-null  float64
 19  v_4                50000 non-null  float64
 20  v_5                50000 non-null  float64
 21  v_6                50000 non-null  float64
 22  v_7                50000 non-null  float64
 23  v_8                50000 non-null  float64
 24  v_9                50000 non-null  float64
 25  v_10               50000 non-null  float64
 26  v_11               50000 non-null  float64
 27  v_12               50000 non-null  float64
 28  v_13               50000 non-null  float64
 29  v_14               50000 non-null  float64
 30  v_15               50000 non-null  float64
 31  v_16               50000 non-null  float64
 32  v_17               50000 non-null  float64
 33  v_18               50000 non-null  float64
 34  v_19               50000 non-null  float64
 35  v_20               50000 non-null  float64
 36  v_21               50000 non-null  float64
 37  v_22               50000 non-null  float64
 38  v_23               50000 non-null  float64
dtypes: float64(30), int64(9)
memory usage: 14.9 MB
对于有异常值的,可以看出来‘ - ’也为空缺值,因为很多模型对nan有直接的处理,这里我们先不做处理,先替换成nan
Train_data['notRepairedDamage'].replace('-', np.nan, inplace=True)
train_data["seller"].value_counts()
1    249999
0         1
Name: seller, dtype: int64
train_data['offerType'].value_counts()
0    249991
1         9
Name: offerType, dtype: int64
#del train_data["seller"]
#del test_data["seller"]
#del train_data["offerType"]
#del test_data["offerType"]
#删除会报错,暂时没有懂原因

了解预测值的分布

train_data['price']
0           520
1          5500
2          1100
3          1200
4          3300
          ...  
249995     1200
249996     1200
249997    16500
249998    31950
249999     1990
Name: price, Length: 250000, dtype: int64
train_data['price'].value_counts()
0        7312
500      3815
1500     3587
1000     3149
1200     3071
         ... 
22985       1
10695       1
20680       1
2243        1
46888       1
Name: price, Length: 4585, dtype: int64
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值