2021-10-25

行业类别

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

import seaborn as sns
import scipy.stats as st
import os
import re

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Lift the pandas display limits so frames are shown without truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
# Let DataFrame.info() list every column (the pandas default limit is 100).
pd.set_option("display.max_info_columns", 2000)
# skiprows=1 skips the first row of the sheet before the header is read.
df = pd.read_excel('数据分析助理笔试题.xls', skiprows=1)
# Preview the first and last five rows together.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames and gives the same result here.
pd.concat([df.head(), df.tail()])
数据期行业代码经纬度注册资本(人民币元)企业经营日期企业登记设立日期企业缴纳社保人员数
0201512K7010113.857559,22.581071500000.02013-12-01 00:00:002012-07-05 00:00:0059.0
1201512F5132113.844955,22.7150528175400.02012-11-01 00:00:002012-09-06 00:00:00285.0
2201512F5261113.857559,22.58107125000000.02013-04-01 00:00:002012-07-27 00:00:00178.0
3201512C2231113.856368,22.7368604000000.02012-12-01 00:00:002012-08-22 00:00:0028.0
4201512L7212113.803833,22.7605728000000.02014-01-01 00:00:002012-08-02 00:00:0031.0
22216201912C3971NaN20000000.02019-12-01 00:00:002019-10-11 00:00:00NaN
22217201912N7810113.883831,22.554986NaN00:00:0000:00:00NaN
22218201912S9221113.830132,22.742925NaN2018-03-01 00:00:0000:00:00NaN
22219201912N7810113.859368,22.618116NaN2018-04-01 00:00:0000:00:00NaN
22220201912P8321113.811647,22.733314NaN00:00:0000:00:00NaN
# (rows, columns) of the table as loaded, before duplicates are dropped.
df.shape
(22221, 7)
# Remove fully duplicated rows in place. The pandas defaults already compare
# every column and keep the first occurrence, so no extra arguments are needed.
df.drop_duplicates(inplace=True)
# (rows, columns) after de-duplication.
df.shape
(21204, 7)
# The category is the first character of the industry code (e.g. 'K7010' -> 'K').
# The vectorized .str accessor leaves missing codes as NaN instead of raising
# the way slicing inside a lambda would.
df['类别'] = df['行业代码'].str[:1]
# Preview the first and last five rows; DataFrame.append was removed in
# pandas 2.0, so the two slices are stacked with pd.concat.
pd.concat([df.head(), df.tail()])
# Print column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21204 entries, 0 to 22220
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   数据期         21204 non-null  int64  
 1   行业代码        21204 non-null  object 
 2   经纬度         18491 non-null  object 
 3   注册资本(人民币元)  18928 non-null  float64
 4   企业经营日期      21204 non-null  object 
 5   企业登记设立日期    21204 non-null  object 
 6   企业缴纳社保人员数   20106 non-null  float64
 7   类别          21204 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.5+ MB
# Work on a copy so later column edits do not touch `df`.
industry = df.copy()
# Group by a scalar key rather than a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples such as ('C',) as group names, which cannot be
# used as sheet names.
industry_group = industry.groupby('类别')
# Write one worksheet per category. The context manager saves and closes the
# workbook on exit; ExcelWriter.save() was removed in pandas 2.0.
# NOTE(review): the xlsxwriter engine produces .xlsx content although the file
# is named .xls -- confirm which extension is actually wanted.
with pd.ExcelWriter('industry.xls', engine='xlsxwriter') as writer:
    for name, group in industry_group:
        group.to_excel(writer, sheet_name=name)
# Group by a scalar key rather than a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples as group names, and `name // 100` below would
# raise a TypeError.
date_group = industry.groupby('数据期')
date_num_industry = []  # [period (YYYYMM), row count] pairs
date = []               # year part of each period
num_industry = []       # row count of each period
# Show how the number of companies changes from one data period to the next.
for name, group in date_group:
    count = group.shape[0]
    date.append(name // 100)  # YYYYMM -> YYYY
    num_industry.append(count)
    date_num_industry.append([name, count])
    print('{}年{}月,行业数量为{}'.format(str(name)[:4], str(name)[-2:], count))
    
2015年12月,行业数量为1957
2016年12月,行业数量为3476
2017年12月,行业数量为4418
2018年12月,行业数量为5250
2019年12月,行业数量为6103
# Display the three accumulators filled by the loop above:
# [period, count] pairs, the years, and the counts.
date_num_industry, date, num_industry
([[201512, 1957],
  [201612, 3476],
  [201712, 4418],
  [201812, 5250],
  [201912, 6103]],
 [2015, 2016, 2017, 2018, 2019],
 [1957, 3476, 4418, 5250, 6103])
# Select the SimHei font so the Chinese labels can render, and turn off the
# Unicode minus sign (a glyph CJK fonts commonly lack).
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# One figure with a single axes: total company count per data period.
fig = plt.figure(figsize=(6, 4))
title = fig.suptitle('产业总量的变化', fontsize=16)
fig.subplots_adjust(top=0.92, wspace=0.3)
ax = fig.add_subplot(1, 1, 1)
ax.bar(date, num_industry, color='yellowgreen')
ax.set_xlabel('数据期', fontsize=12)
ax.set_ylabel('企业数量', fontsize=12)
# Hide the top and right frame lines for a cleaner look.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)


![产业总量的变化](output_16_0.png)

# Group by (data period, category); each group key is a 2-tuple.
s_industry = industry.groupby(['数据期', '类别'])

# Collect every key together with its row count once, then split the parts
# into the three parallel lists.
key_sizes = [(key, rows.shape[0]) for key, rows in s_industry]
date_type = [key[0] for key, _ in key_sizes]
cate_ind = [key[1] for key, _ in key_sizes]
num_ind = [size for _, size in key_sizes]

# Long-format table with one row per (period, category) pair and its count.
date_ind_change = pd.DataFrame({
    '数据期': date_type,
    '类别': cate_ind,
    '数量': num_ind,
})
date_ind_change
# Grouped bar chart: one cluster per category, one bar per data period.
plt.figure(figsize=(16, 10))
cp = sns.countplot(data=industry, x='类别', hue='数据期')
# Label through the Axes returned by countplot (the current axes).
cp.set_xlabel('类别', fontsize=14)      # x-axis label
cp.set_ylabel('产业数量', fontsize=14)   # y-axis label
cp.set_title('不同时期下各类别产业的数量统计图', fontsize=18)  # chart title

Text(0.5, 1.0, '不同时期下各类别产业的数量统计图')


![不同时期下各类别产业的数量统计图](output_20_1.png)

# Print each (period, category) key followed by the number of rows in that group.
for group_key, group_rows in s_industry:
    print(group_key, len(group_rows))
(201512, 'C') 1208
(201512, 'D') 5
(201512, 'E') 50
(201512, 'F') 281
(201512, 'G') 39
(201512, 'H') 60
(201512, 'I') 18
(201512, 'K') 85
(201512, 'L') 142
(201512, 'M') 24
(201512, 'N') 1
(201512, 'O') 13
(201512, 'P') 18
(201512, 'Q') 7
(201512, 'R') 6
(201612, 'C') 2356
(201612, 'D') 7
(201612, 'E') 66
(201612, 'F') 418
(201612, 'G') 119
(201612, 'H') 69
(201612, 'I') 37
(201612, 'J') 1
(201612, 'K') 114
(201612, 'L') 189
(201612, 'M') 41
(201612, 'N') 4
(201612, 'O') 16
(201612, 'P') 22
(201612, 'Q') 9
(201612, 'R') 8
(201712, 'C') 2793
(201712, 'D') 6
(201712, 'E') 77
(201712, 'F') 705
(201712, 'G') 152
(201712, 'H') 96
(201712, 'I') 64
(201712, 'K') 157
(201712, 'L') 221
(201712, 'M') 58
(201712, 'N') 9
(201712, 'O') 20
(201712, 'P') 28
(201712, 'Q') 9
(201712, 'R') 12
(201712, 'S') 11
(201812, 'C') 3233
(201812, 'D') 6
(201812, 'E') 90
(201812, 'F') 912
(201812, 'G') 204
(201812, 'H') 138
(201812, 'I') 92
(201812, 'K') 185
(201812, 'L') 242
(201812, 'M') 61
(201812, 'N') 6
(201812, 'O') 18
(201812, 'P') 38
(201812, 'Q') 9
(201812, 'R') 16
(201912, 'C') 3698
(201912, 'D') 10
(201912, 'E') 139
(201912, 'F') 1049
(201912, 'G') 218
(201912, 'H') 170
(201912, 'I') 90
(201912, 'J') 1
(201912, 'K') 245
(201912, 'L') 293
(201912, 'M') 73
(201912, 'N') 13
(201912, 'O') 21
(201912, 'P') 44
(201912, 'Q') 10
(201912, 'R') 16
(201912, 'S') 13
# Preview the first five rows of the working copy.
industry.head()
数据期行业代码经纬度注册资本(人民币元)企业经营日期企业登记设立日期企业缴纳社保人员数类别
0201512K7010113.857559,22.581071500000.02013-12-01 00:00:002012-07-05 00:00:0059.0K
1201512F5132113.844955,22.7150528175400.02012-11-01 00:00:002012-09-06 00:00:00285.0F
2201512F5261113.857559,22.58107125000000.02013-04-01 00:00:002012-07-27 00:00:00178.0F
3201512C2231113.856368,22.7368604000000.02012-12-01 00:00:002012-08-22 00:00:0028.0C
4201512L7212113.803833,22.7605728000000.02014-01-01 00:00:002012-08-02 00:00:0031.0L
# Split the "longitude,latitude" text on the comma. Fixed character offsets
# only work when the longitude is exactly 10 characters long, and they turn
# missing values into the literal strings 'nan' and ''.
coords = industry['经纬度'].str.split(',', n=1, expand=True)
# Guarantee both columns exist even when no value contains a comma.
coords = coords.reindex(columns=[0, 1])
# Missing coordinates stay NaN; surrounding whitespace is trimmed.
industry['经度'] = coords[0].str.strip()
industry['纬度'] = coords[1].str.strip()
# The combined column is redundant once it has been split.
del industry['经纬度']

数据期行业代码注册资本(人民币元)企业经营日期企业登记设立日期企业缴纳社保人员数类别经度纬度
0201512K7010500000.02013-12-01 00:00:002012-07-05 00:00:0059.0K113.85755922.581071
1201512F51328175400.02012-11-01 00:00:002012-09-06 00:00:00285.0F113.84495522.715052
2201512F526125000000.02013-04-01 00:00:002012-07-27 00:00:00178.0F113.85755922.581071
3201512C22314000000.02012-12-01 00:00:002012-08-22 00:00:0028.0C113.85636822.736860
4201512L72128000000.02014-01-01 00:00:002012-08-02 00:00:0031.0L113.80383322.760572
# Reduce the YYYYMM period to its year with a vectorized floor division.
industry['数据期'] = industry['数据期'] // 100
# Preview the result.
industry.head()
数据期行业代码注册资本(人民币元)企业经营日期企业登记设立日期企业缴纳社保人员数类别经度纬度
02015K7010500000.02013-12-01 00:00:002012-07-05 00:00:0059.0K113.85755922.581071
12015F51328175400.02012-11-01 00:00:002012-09-06 00:00:00285.0F113.84495522.715052
22015F526125000000.02013-04-01 00:00:002012-07-27 00:00:00178.0F113.85755922.581071
32015C22314000000.02012-12-01 00:00:002012-08-22 00:00:0028.0C113.85636822.736860
42015L72128000000.02014-01-01 00:00:002012-08-02 00:00:0031.0L113.80383322.760572
# Save the working table as CSV; index=False leaves the row index out of the file.
industry.to_csv('industry.csv', index=False)
def _plot_category_counts(data, title, figsize):
    """Draw a count plot of rows per category, split by data period.

    Args:
        data: DataFrame with the columns '类别' and '数据期'.
        title: Chart title.
        figsize: (width, height) of the new figure in inches.

    Returns:
        The matplotlib Axes the chart was drawn on.
    """
    plt.figure(figsize=figsize)
    ax = sns.countplot(x='类别', hue='数据期', data=data)
    plt.xlabel('类别', fontsize=14)      # x-axis label
    plt.ylabel('产业数量', fontsize=14)   # y-axis label
    plt.title(title, fontsize=18)        # chart title
    return ax


# Category letters in the first subset, and every other category seen in the data.
china = ['D', 'J', 'N', 'Q', 'S']
n_china = ['C', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'O', 'P', 'R']

# NOTE: the first subset is selected as "everything not in n_china", so any
# category letter missing from both lists also ends up in this chart.
data_china = industry[~industry['类别'].isin(n_china)]
cp = _plot_category_counts(data_china, '不同时期下国家控制类产业的数量统计图', (10, 8))

# Categories C and F only.
data_life = industry[industry['类别'].isin(['C', 'F'])]
cp = _plot_category_counts(data_life, '人们日常消费相关产业的数量统计图', (6, 4))

# The remaining categories.
data_other = industry[industry['类别'].isin(['E', 'G', 'H', 'I', 'K', 'L', 'M', 'O', 'P', 'R'])]
cp = _plot_category_counts(data_other, '其他产业的数量统计图', (10, 8))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值