2021-10-25

最新推荐文章于 2024-10-13 10:05:58 发布

大佬喝可乐

最新推荐文章于 2024-10-13 10:05:58 发布

阅读量138

点赞数

分类专栏：数据分析文章标签： python

本文链接：https://blog.csdn.net/weixin_44322171/article/details/120960682

版权

数据分析专栏收录该内容

23 篇文章 1 订阅

订阅专栏

行业类别

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

import seaborn as sns
import scipy.stats as st
import os
import re

warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
# 显示所有info，默认100
pd.set_option("display.max_info_columns", 2000)

df = pd.read_excel('数据分析助理笔试题.xls', skiprows=1)

df.head().append(df.tail())

	数据期	行业代码	经纬度	注册资本(人民币元)	企业经营日期	企业登记设立日期	企业缴纳社保人员数
0	201512	K7010	113.857559,22.581071	500000.0	2013-12-01 00:00:00	2012-07-05 00:00:00	59.0
1	201512	F5132	113.844955,22.715052	8175400.0	2012-11-01 00:00:00	2012-09-06 00:00:00	285.0
2	201512	F5261	113.857559,22.581071	25000000.0	2013-04-01 00:00:00	2012-07-27 00:00:00	178.0
3	201512	C2231	113.856368,22.736860	4000000.0	2012-12-01 00:00:00	2012-08-22 00:00:00	28.0
4	201512	L7212	113.803833,22.760572	8000000.0	2014-01-01 00:00:00	2012-08-02 00:00:00	31.0
22216	201912	C3971	NaN	20000000.0	2019-12-01 00:00:00	2019-10-11 00:00:00	NaN
22217	201912	N7810	113.883831,22.554986	NaN	00:00:00	00:00:00	NaN
22218	201912	S9221	113.830132,22.742925	NaN	2018-03-01 00:00:00	00:00:00	NaN
22219	201912	N7810	113.859368,22.618116	NaN	2018-04-01 00:00:00	00:00:00	NaN
22220	201912	P8321	113.811647,22.733314	NaN	00:00:00	00:00:00	NaN

df.shape

(22221, 7)

df.drop_duplicates(subset=None, keep='first', inplace=True)
df.shape

(21204, 7)

df['类别'] = df['行业代码'].apply(lambda x:x[:1])

df.head().append(df.tail())

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21204 entries, 0 to 22220
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   数据期         21204 non-null  int64  
 1   行业代码        21204 non-null  object 
 2   经纬度         18491 non-null  object 
 3   注册资本(人民币元)  18928 non-null  float64
 4   企业经营日期      21204 non-null  object 
 5   企业登记设立日期    21204 non-null  object 
 6   企业缴纳社保人员数   20106 non-null  float64
 7   类别          21204 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.5+ MB

industry = df.copy()

industry_group = industry.groupby(['类别'])

#声明一个读写对象
writer = pd.ExcelWriter('industry.xls',engine='xlsxwriter')
for name, group in industry_group:
    #分别将表写入Excel中的Sheet1、Sheet2、Sheet3
    group.to_excel(writer,sheet_name=name)
#保存读写的内容
writer.save()

date_group = industry.groupby(['数据期'])

date_num_industry = []
date = []
num_industry = []
# 查看行业随着日期公司数量的变化
for name, group in date_group:
    date.append(name//100)
    num_industry.append(group.shape[0])
    date_num_industry.append([name, group.shape[0]])
    print('{}年{}月,行业数量为{}'.format(str(name)[:4], str(name)[-2:], group.shape[0]))

2015年12月,行业数量为1957
2016年12月,行业数量为3476
2017年12月,行业数量为4418
2018年12月,行业数量为5250
2019年12月,行业数量为6103

date_num_industry, date, num_industry

([[201512, 1957],
  [201612, 3476],
  [201712, 4418],
  [201812, 5250],
  [201912, 6103]],
 [2015, 2016, 2017, 2018, 2019],
 [1957, 3476, 4418, 5250, 6103])

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(6, 4))
title = plt.suptitle('产业总量的变化', fontsize=16)
fig.subplots_adjust(top=0.92, wspace=0.3)
ax = fig.add_subplot(111)
ax.set_xlabel('数据期', fontsize=12)
ax.set_ylabel('企业数量', fontsize=12)
ax.bar(date, num_industry, color='yellowgreen')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-dWSroCv1-1635240656601)(output_16_0.png)]

s_industry = industry.groupby(['数据期', '类别'])

date_type = []
cate_ind = []
num_ind = []
for inx, gp in s_industry:
    date_type.append(inx[0])
    cate_ind.append(inx[1])
    num_ind.append(gp.shape[0])

date_ind_change = pd.DataFrame()
date_ind_change['数据期'] = date_type
date_ind_change['类别'] = cate_ind
date_ind_change['数量'] = num_ind
date_ind_change

plt.figure(figsize=(16, 10))
cp = sns.countplot(x='类别', hue='数据期', data=industry)
plt.xlabel('类别',fontsize=14)   #X轴的名称
plt.ylabel('产业数量',fontsize=14)    #Y轴名称
plt.title('不同时期下各类别产业的数量统计图',fontsize=18)        #图的名称

Text(0.5, 1.0, '不同时期下各类别产业的数量统计图')

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vANRudNo-1635240656607)(output_20_1.png)]

for idx, gp in s_industry:
    print(idx, gp.shape[0])

(201512, 'C') 1208
(201512, 'D') 5
(201512, 'E') 50
(201512, 'F') 281
(201512, 'G') 39
(201512, 'H') 60
(201512, 'I') 18
(201512, 'K') 85
(201512, 'L') 142
(201512, 'M') 24
(201512, 'N') 1
(201512, 'O') 13
(201512, 'P') 18
(201512, 'Q') 7
(201512, 'R') 6
(201612, 'C') 2356
(201612, 'D') 7
(201612, 'E') 66
(201612, 'F') 418
(201612, 'G') 119
(201612, 'H') 69
(201612, 'I') 37
(201612, 'J') 1
(201612, 'K') 114
(201612, 'L') 189
(201612, 'M') 41
(201612, 'N') 4
(201612, 'O') 16
(201612, 'P') 22
(201612, 'Q') 9
(201612, 'R') 8
(201712, 'C') 2793
(201712, 'D') 6
(201712, 'E') 77
(201712, 'F') 705
(201712, 'G') 152
(201712, 'H') 96
(201712, 'I') 64
(201712, 'K') 157
(201712, 'L') 221
(201712, 'M') 58
(201712, 'N') 9
(201712, 'O') 20
(201712, 'P') 28
(201712, 'Q') 9
(201712, 'R') 12
(201712, 'S') 11
(201812, 'C') 3233
(201812, 'D') 6
(201812, 'E') 90
(201812, 'F') 912
(201812, 'G') 204
(201812, 'H') 138
(201812, 'I') 92
(201812, 'K') 185
(201812, 'L') 242
(201812, 'M') 61
(201812, 'N') 6
(201812, 'O') 18
(201812, 'P') 38
(201812, 'Q') 9
(201812, 'R') 16
(201912, 'C') 3698
(201912, 'D') 10
(201912, 'E') 139
(201912, 'F') 1049
(201912, 'G') 218
(201912, 'H') 170
(201912, 'I') 90
(201912, 'J') 1
(201912, 'K') 245
(201912, 'L') 293
(201912, 'M') 73
(201912, 'N') 13
(201912, 'O') 21
(201912, 'P') 44
(201912, 'Q') 10
(201912, 'R') 16
(201912, 'S') 13

industry.head()

	数据期	行业代码	经纬度	注册资本(人民币元)	企业经营日期	企业登记设立日期	企业缴纳社保人员数	类别
0	201512	K7010	113.857559,22.581071	500000.0	2013-12-01 00:00:00	2012-07-05 00:00:00	59.0	K
1	201512	F5132	113.844955,22.715052	8175400.0	2012-11-01 00:00:00	2012-09-06 00:00:00	285.0	F
2	201512	F5261	113.857559,22.581071	25000000.0	2013-04-01 00:00:00	2012-07-27 00:00:00	178.0	F
3	201512	C2231	113.856368,22.736860	4000000.0	2012-12-01 00:00:00	2012-08-22 00:00:00	28.0	C
4	201512	L7212	113.803833,22.760572	8000000.0	2014-01-01 00:00:00	2012-08-02 00:00:00	31.0	L

industry['经度'] = industry['经纬度'].apply(lambda x:str(x)[:10])
industry['纬度'] = industry['经纬度'].apply(lambda x:str(x)[11:])

del industry['经纬度']

	数据期	行业代码	注册资本(人民币元)	企业经营日期	企业登记设立日期	企业缴纳社保人员数	类别	经度	纬度
0	201512	K7010	500000.0	2013-12-01 00:00:00	2012-07-05 00:00:00	59.0	K	113.857559	22.581071
1	201512	F5132	8175400.0	2012-11-01 00:00:00	2012-09-06 00:00:00	285.0	F	113.844955	22.715052
2	201512	F5261	25000000.0	2013-04-01 00:00:00	2012-07-27 00:00:00	178.0	F	113.857559	22.581071
3	201512	C2231	4000000.0	2012-12-01 00:00:00	2012-08-22 00:00:00	28.0	C	113.856368	22.736860
4	201512	L7212	8000000.0	2014-01-01 00:00:00	2012-08-02 00:00:00	31.0	L	113.803833	22.760572

industry['数据期'] = industry['数据期'].apply(lambda x:x//100)

industry.head()

	数据期	行业代码	注册资本(人民币元)	企业经营日期	企业登记设立日期	企业缴纳社保人员数	类别	经度	纬度
0	2015	K7010	500000.0	2013-12-01 00:00:00	2012-07-05 00:00:00	59.0	K	113.857559	22.581071
1	2015	F5132	8175400.0	2012-11-01 00:00:00	2012-09-06 00:00:00	285.0	F	113.844955	22.715052
2	2015	F5261	25000000.0	2013-04-01 00:00:00	2012-07-27 00:00:00	178.0	F	113.857559	22.581071
3	2015	C2231	4000000.0	2012-12-01 00:00:00	2012-08-22 00:00:00	28.0	C	113.856368	22.736860
4	2015	L7212	8000000.0	2014-01-01 00:00:00	2012-08-02 00:00:00	31.0	L	113.803833	22.760572

industry.to_csv('industry.csv', index=False)

china = ['D', 'J', 'N', 'Q', 'S']
n_china = ['C', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'O', 'P', 'R']
data_china = industry.copy()
data_china = data_china[~data_china['类别'].isin(n_china)]

plt.figure(figsize=(10, 8))
cp = sns.countplot(x='类别', hue='数据期', data=data_china)
plt.xlabel('类别',fontsize=14)   #X轴的名称
plt.ylabel('产业数量',fontsize=14)    #Y轴名称
plt.title('不同时期下国家控制类产业的数量统计图',fontsize=18)        #图的名称

data_life = industry.copy()
data_life = data_life[data_life['类别'].isin(['C', 'F'])]

plt.figure(figsize=(6, 4))
cp = sns.countplot(x='类别', hue='数据期', data=data_life)
plt.xlabel('类别',fontsize=14)   #X轴的名称
plt.ylabel('产业数量',fontsize=14)    #Y轴名称
plt.title('人们日常消费相关产业的数量统计图',fontsize=18)        #图的名称

data_other = industry.copy()
data_other = data_other[data_other['类别'].isin(['E', 'G', 'H', 'I', 'K', 'L', 'M', 'O', 'P', 'R'])]

plt.figure(figsize=(10, 8))
cp = sns.countplot(x='类别', hue='数据期', data=data_other)
plt.xlabel('类别',fontsize=14)   #X轴的名称
plt.ylabel('产业数量',fontsize=14)    #Y轴名称
plt.title('其他产业的数量统计图',fontsize=18)        #图的名称```