pandas 100题

qq_40999093?

已于 2023-05-25 16:17:30 修改

阅读量1k

点赞数 1

分类专栏： Python 文章标签： pandas python 数据分析

于 2022-12-01 08:28:18 首次发布

本文链接：https://blog.csdn.net/qq_35911309/article/details/127463049

版权

Python 专栏收录该内容

30 篇文章 0 订阅

订阅专栏

文章目录

1.将下面字典创建为DataFrame
2.提取含有字符串“python”的行
3.输出df所有列名
4.修改列名
5.统计grame列中每种编程语言出现的次数
6 将空值用上下值的平均值填充
7 列值大于3的数
8 去重列
9 计算列平均值
10 将列转换列表
11 保存到excel
12 查询行列
13 列值大于3小于7的值
14 交换两列位置
15 列值最大所在行
16 查最后5行数据
17 删最后一行数据
18 增加最后一行数据
19 排序
20 统计列的长度
21 读取Excel文件,csv文件
22 查看df前5行
24 分组并计算平均值
26 createtime提取为月-日
27 查看数值型列的统计
28 根据id将数据分为三组
29 按降序排列
30 取第30行数据
31 取中位数
32 水平频率分布直方图
33 水平密度曲线
34 删列:
35 合并为新的一列
37.计算最大值与最小值之差
38.将第一行与最后一行拼接
39.将第8行数据添加至末尾
40.查看每列的数据类型
附
数据类型转换
随机森林

import pandas as pd
import numpy as np

1.将下面字典创建为DataFrame

# Build the first sample frame column by column; one language entry is
# deliberately missing (NaN) and the scores are kept as strings.
languages = ['python', 'java', 'c', np.nan, 'python']
scores = ['22', '33', '66', '00', '66']
data = {'grame': languages, 'score': scores}
df = pd.DataFrame(data)
print(df)

2.提取含有字符串“python”的行

# Keep the rows whose 'grame' value contains the substring "python"
# (an == comparison would only match the exact string).
# na=False treats missing values as non-matching so the mask stays boolean.
print(df[df['grame'].str.contains('python', na=False)])

3.输出df所有列名

# Show the column labels of df.
print(df.columns)

4.修改列名

# About the inplace parameter when modifying an object:
#   inplace=True  - no new object is created; the original is modified directly
#   inplace=False - the change is applied to a new object, which is returned
old_names = ('grame', 'score')
new_names = ('内容', '分数')
colNameDict = dict(zip(old_names, new_names))
df.rename(colNameDict, axis='columns', inplace=True)
print(df)

5.统计grame列中每种编程语言出现的次数

# Count how often each language occurs. The result is printed explicitly,
# because a bare expression shows nothing when the file is run as a script.
print(df['内容'].value_counts())

# Second sample: one missing language name and one missing score.
DataFramedata_dict = {}
DataFramedata_dict["Grammer"] = ["Python", "C", "Java", "GO", np.nan, "SQL", "PHP", "python"]
DataFramedata_dict["Score"] = [1, 2, np.nan, 4, 5, 6, 7, 10]
df = pd.DataFrame(data=DataFramedata_dict)

6 将空值用上下值的平均值填充

# Fill each missing score by linear interpolation between its neighbours;
# existing values are left untouched.
df['Score'] = df['Score'].interpolate()

7 列值大于3的数

# Rows whose score is strictly greater than 3.
above_three = df['Score'] > 3
print(df.loc[above_three])

8 去重列

# Keep only the first row for every distinct 'Grammer' value.
df = df.drop_duplicates(subset=['Grammer'])
print(df)

9 计算列平均值

# Mean of the 'Score' column.
print(df['Score'].mean())

10 将列转换列表

# The 'Grammer' column as a plain Python list.
grammer_values = df['Grammer'].tolist()
print(grammer_values)

11 保存到excel

# Write df to an Excel workbook in the current directory.
# to_excel returns None, so wrapping it in print() only printed "None".
df.to_excel(r'test.xlsx')

12 查询行列

# (number of rows, number of columns)
print(df.shape)

13 列值大于3小于7的值

# Rows whose score lies strictly between 3 and 7.
over_three = df['Score'] > 3
under_seven = df['Score'] < 7
print(df[over_three & under_seven])

14 交换两列位置

# Method 1: reorder by selecting the columns in the new order.
# (Assigning df[['Score', 'Grammer']] into df[['Grammer', 'Score']] swaps the
# values underneath the labels instead of moving the columns, which leaves
# numbers under 'Grammer' and breaks the later string steps.)
# .copy() makes the result an independent frame for the in-place edits below.
df = df[['Score', 'Grammer']].copy()
print(df)
# Method 2: pick the column labels by position to view the reversed order
# without modifying df.
col = df.columns[[1, 0]]
print(df[col])

15 列值最大所在行

# Row(s) holding the largest score.
top_score = df['Score'].max()
print(df[df['Score'] == top_score])

16 查最后5行数据

# Last five rows (five is also the default of tail()).
print(df.tail(5))

17 删最后一行数据

# Drop the last row by its actual index label. len(df)-1 is only the last
# label while the index is still a gap-free 0..n-1 range, which stops being
# true as soon as rows have been removed or the frame has been sorted.
df.drop(df.index[-1], inplace=True)
print(df)

18 增加最后一行数据

DataFramedata_dict2 = {'Grammer': ['php'], 'Score': [9]}
df2 = pd.DataFrame(DataFramedata_dict2)
# DataFrame.append was replaced by pd.concat in newer pandas:
#   df.append(df2, ignore_index=True)
# concat returns a new frame, so the result has to be assigned back to df;
# ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df, df2], ignore_index=True)
print(df)

19 排序

# Sort df in place by ascending score.
df.sort_values(by='Score', ascending=True, inplace=True)
print(df)

20 统计列的长度

# Replace missing names with an empty string so every entry has a length.
df['Grammer'] = df['Grammer'].fillna('')
# Apply len to each name, element by element.
df['len_G'] = df['Grammer'].map(len)
print(df)

21 读取Excel文件,csv文件

# Read an Excel workbook: header=0 takes the first row as column names,
# sheet_name=1 selects a sheet by position rather than by name.
dfe = pd.read_excel(r'C:\Users\Administrator\Desktop\目标.xlsx',
header =0,sheet_name=1)
# Read a GBK-encoded CSV; this rebinds df, which the following steps use.
df = pd.read_csv(r'..\..\lh_common_org_region.csv',encoding="GBK")

22 查看df前5行

# First five rows (five is also the default of head()).
print(df.head(5))

24 分组并计算平均值

# Mean of every numeric column per region. Since pandas 2.0 non-numeric
# columns are no longer dropped silently, so numeric_only=True is required
# when the frame also holds text columns.
print(df.groupby('ORG_REGION_NAME').mean(numeric_only=True))
# Mean of one chosen column per region.
print(df.groupby('ORG_REGION_NAME')['id'].mean())

26 createtime提取为月-日

在这里插入图片描述

# Replace missing timestamps with a placeholder so that parsing cannot fail.
# The result is assigned back: calling fillna(inplace=True) on a selected
# column is not guaranteed to update df itself.
df['CREATE_DATE'] = df['CREATE_DATE'].fillna('01/01/2000 00:00:00')
df['time'] = pd.to_datetime(df['CREATE_DATE'], format='%d/%m/%Y %H:%M:%S')
# Build "<month>月-<day>". The expression is parenthesised so it can span two
# lines, and the second part uses the day (the original repeated the month).
df['月-日'] = (df['time'].dt.month.astype(int).astype(str)
             + "月-" + df['time'].dt.day.astype(int).astype(str))
print(df)

27 查看数值型列的统计

# Summary statistics produced by DataFrame.describe().
print(df.describe())

28 根据id将数据分为三组

# Bin edges and one label per interval between consecutive edges.
bins = [0, 2000, 5000, 10000]
group_names = ['低', '中', '高']
id_bucket = pd.cut(df['id'], bins=bins, labels=group_names)
df['categories'] = id_bucket
print(df)

29 按降序排列

# A copy of df ordered by 'id', largest first; df itself is not modified.
by_id_desc = df.sort_values(by='id', ascending=False)
print(by_id_desc)

30 取第30行数据

# 30th row by position (iloc is zero-based, so position 29).
print(df.iloc[29])

31 取中位数

# Median of the 'id' column, computed with NumPy.
# NOTE(review): np.median does not skip NaN while df['id'].median() does -
# confirm whether 'id' can contain missing values.
print(np.median(df['id']))

32 水平频率分布直方图

import pandas as pd
# The plotting API lives in matplotlib.pyplot; the top-level matplotlib
# package has no show()/figure helpers.
import matplotlib.pyplot as plt

df = pd.read_csv(r'..\..\lh_common_org_region.csv', encoding="GBK")
# Frequency histogram of ORG_ID.
df.ORG_ID.plot(kind="hist")
# Display the figure when running as a plain script.
plt.show()

33 水平密度曲线

# Kernel density estimate curve of ORG_ID.
df['ORG_ID'].plot.kde()

34 删列:

# Remove the 'id' column from df in place.
df.drop(columns=['id'], inplace=True)
print(df)

35 合并为新的一列

# New column 'test': the region name followed by ORG_ID converted to text.
df['test'] = df['ORG_REGION_NAME']+df['ORG_ID'].astype(str)
# Equivalent alternative using map(str):
#df['test'] = df['ORG_REGION_NAME']+df['ORG_ID'].map(str)
print(df)

37.计算最大值与最小值之差

# Difference between the largest and smallest value of a column.
# The 'id' column was deleted in step 34 (del df['id']), so selecting it here
# raises KeyError; ORG_ID is the numeric column still present in df.
# NOTE(review): switch back to 'id' if step 34 is skipped.
m = df[['ORG_ID']].apply(lambda x: x.max() - x.min())
print(m)

38.将第一行与最后一行拼接

# Stack the first row on top of the last row.
first_row = df.head(1)
last_row = df.tail(1)
print(pd.concat([first_row, last_row]))

39.将第8行数据添加至末尾

# Append a copy of the 8th row (position 7) to the end of df.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# iloc[[7]] (with a list) keeps the row as a one-row DataFrame, and the
# result is assigned back because concat returns a new frame.
df = pd.concat([df, df.iloc[[7]]], ignore_index=True)

40.查看每列的数据类型

# dtype of every column (only displayed in an interactive session, because
# the expression is not printed).
df.dtypes

附

# Show the current working directory.
import os
print(os.getcwd())

 # 读取文件
 dd= pd.read_csv(r'C:\Users\admin\Desktop\df_training.csv',encoding="GBK")
#train_data = pd.read_csv(r'C:\Users\admin\Desktop\df_training.csv',encoding="utf-8")

# Basic information
# Number of rows and columns:
print(df.shape)
# Index, dtypes and memory usage. info() writes its report itself and
# returns None, so wrapping it in print() only added a stray "None".
df.info()
# Summary statistics of the numeric columns:
print(df.describe())

# Missing-value overview: the full boolean mask, the count per column, and
# finally only the columns that have at least one missing value.
missing_mask = dd.isnull()
print(missing_mask)
print(missing_mask.sum())
print(dd.isnull().sum()[dd.isnull().sum() > 0])

# Distinct values of the column
print(dd['REGION_ID'].unique())
# How often each value occurs
print(dd['REGION_ID'].value_counts())

# Remove duplicated rows, keeping the last occurrence of each; the result is
# only printed, dd itself is not modified.
print(dd.drop_duplicates(keep='last'))

# Fill missing values with the mean of each numeric column.
# numeric_only=True is needed on pandas 2.x when dd also has text columns.
# In-place calls return None, so the frame is printed separately.
dd.fillna(dd.mean(numeric_only=True), inplace=True)
print(dd)
# Other statistics that can be used for filling: min / max / mean / median / mode
# Without inplace, replace() returns a new frame and leaves dd unchanged.
print(dd.replace(1, 'one', inplace=False))
# Replace several values at once, in place ('tree' was a typo for 'three').
dd.replace([1, 3], ['one', 'three'], inplace=True)
print(dd)

# Convert column dtypes; the converted columns are only printed, dd keeps
# its original dtypes.
print(dd['id'].astype(float))
print(dd['ORG_ID'].astype(int))

# Export to a GBK-encoded CSV file.
# NOTE(review): this writes df, while the surrounding steps work on dd -
# confirm which frame is meant.
df.to_csv(r'C:\Users\admin\Desktop\df_training00.csv',encoding="GBK")

# Encoding: take the first four characters of each ORG_ID, parse them as an
# integer and subtract that from 2022.
# NOTE(review): presumably the four-character prefix is a year - verify.
print(dd['ORG_ID'].apply(lambda x:2022-int(str(x)[0:4])))

# One-hot encoding with get_dummies()
# Build a frame that holds only the 'id' column of dd, then encode it.
data = pd.DataFrame(dd,columns=['id'])
dummies = pd.get_dummies(data)
print(dummies)

# #### Combine train_data and test_data into all_data
#all_data = pd.____3____
# NOTE(review): train_data and test_data are not assigned anywhere in the
# visible part of this file (train_data only appears in a commented-out line).
all_data = pd.concat([train_data, test_data])

数据类型转换

# Force a dtype conversion with astype()
df.dtypes  # dtypes of all columns
# Several columns at once through a {column: dtype} mapping
# (the closing parenthesis was missing here).
df.astype(dtype={'工资': 'float', '时间': 'string'}, errors='ignore')
# A failed conversion raises by default; errors='ignore' returns the data unchanged.
df['dept 1'].astype('int', errors='ignore')
df.工资.astype(str)       # convert to object (Python str)
df.工资.astype('string')  # convert to the pandas string dtype
# pd.to_numeric
pd.to_numeric(df.工资)
# downcast picks the smallest suitable target type; see the official docs for valid values.
pd.to_numeric(df.工资, downcast='float')
# On failure, turn the offending values into NaN.
pd.to_numeric(df.company, errors='coerce')
# On failure, keep the original data unchanged.
# NOTE(review): errors='ignore' is deprecated in recent pandas releases - confirm before relying on it.
pd.to_numeric(df.company, errors='ignore')
# On failure, raise (this is the default).
pd.to_numeric(df.dept, errors='raise')

随机森林

（1）boosting：它组合多个弱学习器形成一个强学习器，且各个弱学习器之间有依赖关系。
（2）bagging：同样的，它也是组合多个弱学习器形成一个强学习器，但它各个弱学习器之间没有依赖关系，而且可以并行拟合。

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor  # random forest model (regression)
from sklearn.datasets import load_diabetes  # regression dataset bundled with sklearn
from sklearn.model_selection import train_test_split  # automatic train/test split
from sklearn.metrics import mean_squared_error, r2_score  # regression metrics

# load_boston was removed from scikit-learn, so another bundled regression
# dataset is used. The model built below is a regressor, hence the
# regression metrics instead of classification_report.
data, target = load_diabetes(return_X_y=True)

# Hold out 30% of the samples for testing; the call is wrapped in
# parentheses so the statement can span two lines.
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=0)

# Define the model
regr_rf = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=2)
# Fit on the training split (same lower-case names as returned by the split)
regr_rf.fit(x_train, y_train)
# Predict on the held-out split
y_rf = regr_rf.predict(x_test)

print(mean_squared_error(y_test, y_rf))
print(r2_score(y_test, y_rf))

hive

-- DDL (Data Definition Language)
-- 1.1 Databases
-- 1.1.1 Create a database
create database db_hive1;
create database db_hive2 location '/db_hive2';
create database db_hive3 with dbproperties ('create_date' = '2023-11-18');
-- 1.1.2 List databases
show databases;
show databases like 'db_hive*';
-- Show database details
describe database db_hive3;
desc database extended db_hive3;
-- 1.1.3 Alter a database
-- Change dbproperties
alter database db_hive3 set dbproperties ('create_date' = '2023-11-20');

-- 1.1.4 Drop a database
-- Syntax: DROP DATABASE [IF EXISTS] name [RESTRICT | CASCADE];
-- (the square brackets mark optional parts and are not typed)
-- RESTRICT is the default: the drop fails if the database is not empty
Drop DataBase if exists db_hive1 restrict;
-- CASCADE also drops the tables inside a non-empty database
Drop DataBase if exists db_hive1 cascade;

-- 1.2 Tables
-- select cast('111' as int);
-- show create table stu
-- 1.2.1 Create a table
create table if not exists student
(
    id   int comment 'id',
    name string
) row format delimited fields terminated by '\t';
--location '/user/hive/warehouse/db_hive1.db/student';

create table student5 like student;

-- 1.2.2 Inspect tables
select *
from student;
show tables;
show create table student;
desc student;
desc formatted student;

-- 1.2.3 Alter a table
-- Turn a managed (internal) table into an external table
alter table student
    set tblproperties ('EXTERNAL' = 'TRUE');
-- Rename a table
alter table student2
    rename to student;
-- Add a column
alter table student
    add columns (gender string);
-- Change a column (the setting below allows the incompatible type change)
set hive.metastore.disallow.incompatible.col.type.changes =false;
alter table student
    change column gender gender int after id;
-- Drop a table
drop table student5;
-- Remove all rows from a table
truncate table stu;

-- DML (Data Manipulation Language)
-- Load from the local filesystem (upload):
load data local inpath '/opt/modules/apache-hive-3.1.3-bin/student.txt' overwrite into table student;
-- Load from HDFS (the file is moved, like cut & paste): load data inpath '/scott/emp.csv' into table emp;
-- Upload data from the shell:
-- hdfs dfs -put student2.txt /user/hive/warehouse/db_hive1.db/student
create table student2 like student;

-- insert into appends the query result to the table.
-- The two insert statements were left unfinished; a source query and the
-- terminating semicolon are required.
-- NOTE(review): student2 is used as the source here - confirm the intended query.
insert into table student
select * from student2;

-- insert overwrite replaces the table content with the query result.
insert overwrite table student
select * from student2;

insert into student2 values (5, 5, "hh"),(6, 6, "ll");

-- Write a query result to a local directory as JSON
insert overwrite local directory '/opt/modules/apache-hive-3.1.3-bin/lh_data'
    row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'
select id,name from student2;

-- Export and import
-- (the paths are HDFS paths)
export table db_hive1.student2 to '/opt/modules/apache-hive-3.1.3-bin/lh_data';
import table student3 from '/opt/modules/apache-hive-3.1.3-bin/lh_data';

-- 2 Queries
select id as s_id, name as s_name from student;

select * from student order by name limit 2;
select * from student sort by name;
select * from student distribute by name sort by name;

show functions;
desc function substring;

select round(3.5);-- round to the nearest integer
select ceil(3.6);-- round up
select floor(3.6);-- round down
select replace("abcd", "a", "A");
select regexp_replace("abb-123-ddd", "\\d+", "*");
select "string" regexp ".*st.*";

-- Date and time functions
select unix_timestamp();
select from_unixtime(1682346879, 'yyyy-MM-dd HH:mm:ss');

select current_date();
select current_timestamp();
select datediff("2023-03-15", "2023-03-19");-- result: -4
select date_add("2023-03-15", 4);
select year(replace("2023/03/15", "/", "-"));
select substring(current_date, 1, 7);

-- Flow control
select *
from student;

-- Searched CASE: every branch has its own condition
select id,
       name,
       case
           when gender = 1 then "nv"
           when gender = 0 then "nan"
           else "kon"
           end
from student;


-- Simple CASE: equality tests against one and the same column
select id,
       name,
       case gender
           when 1 then "nv"
           when 0 then "nan"
           else "kon"
           end
from student;


-- Conditional aggregation with if(): rows that fail the condition add 0 to
-- the sums and are skipped by count (null is not counted)
select gender,
       sum(if(id > 1, id, 0)) s,
       sum(if(id > 3, id, 0)) s2,
       count(if(id > 3, id, null))
from student
group by gender;


select if(10 > 5, "正", "负");

-- collect_list keeps duplicates, collect_set removes them.
-- The terminating semicolon was missing, which merged this statement with
-- the next one.
select collect_list(name), collect_set(gender)
from student s;

-- Inspect and release table locks
SHOW LOCKS;

SHOW LOCKS LH_OFFER_INST;

UNLOCK table LH_OFFER_INST;

SELECT * from LH_OFFER_INST;