panda处理表格类信息

删除某列或某些列

merge_new = merge.drop(columns=['acc.version','acc','taxid','ig'])

#Drop rows
>>>df.drop([0, 1])

   A  B   C   D

2  8  9  10  11

>>> df.drop(index=[0, 1])

   A  B   C   D
   
2  8  9  10  11

获取行列数

print(df.shape)
# (891, 12)

print(df.shape[0])
# 891

print(df.shape[1])
# 12

列求和

import pandas as pd
frame = pd.read_excel("/home/huangdan/10.Defense/1.xlsx")
#所有列求和
frame_sum = frame.sum()

#p.s.开始的时候并没有加入列头:(pandas事实上是默认第一行是列头的)
frame = pd.read_excel("/home/huangdan/10.Defense/1.xlsx",header=None, names= ["system","C1-1-1",...,"D5-4-4"])
或者:
frame  = frame[["system", "C1-1-1", "C1-1-2",。。。。"D5-4-4"]] #附列名
group = frame[[ "C1-1-1", "C1-1-2",。。。。"D5-4-4"]].groupby(frame["system"]).sum() #根据system求和

#列求占比,比如相对丰度
# 1. 选择数据的数值部分(排除第一列)
numeric_cols = frame.iloc[:, 1:]
# 2. 计算相对丰度
rel_abun = numeric_cols.div(numeric_cols.sum(axis=0), axis=1) * 100
# 3. 将相对丰度和第一列(Contig)重新组合
rel_abun = pd.concat([frame.iloc[:, [0]], rel_abun], axis=1)
# 输出相对丰度表格
print(rel_abun)


#p.s.报错想查看列名:
print([column for column in df_split])

group.head() #检查
group.to_excel("/home/huangdan/10.Defense/p.xlsx")
或者:
group.to_csv("/home/huangdan/10.Defense/p.csv")

列统计个数

 kegg_pathway = pd.read_table( "/public/home/xiaoxuan/mydata/hnmeta/04-annotation_gene/all.annotations.KEGG_Pathway.txt",header=None,names=["geneID", "ko"])
kegg_pathway.loc[:, ["geneID", "ko"]].groupby(by="ko").count()
	Out[18]:
	          geneID
	ko
	ko00010   368609
	ko00020   287531
	ko00030   207436
	ko00040    98180
	ko00051   141445
	...          ...
	map05410    2672
	map05412      29
	map05414     373
	map05416    5148
	map05418   39290

vlookup

import pandas as pd

# defence
df_defence = pd.read_table( "/public/home/xiaoxuan/mydata/hnmeta/04-annotation/DB_clu_tax.txt", header=None,names=["#ID"])
df_defence.head()
 
# abundance
df_abundance = pd.read_csv("/public/home/xiaoxuan/mydata/hnmeta/05-abundance/abundance-tpmean.csv")
df_abundance.head()

df_merge = pd.merge(df_abundance, df_defence, on="#ID", how="inner")

df_merge.head()

分列

#分裂成单独表:
df_split = df_merge["Tax"].str.split(";", expand=True)
#加入原来的列:
pd.concat([datas'].str.split(',', expand=True)], axis=1)


列去重

import pandas as pd
c = pd.read_table( "/public/home/xiaoxuan/mydata/hnmeta/07-hostpre/trna-pro/v-b_match_100.out")
c.duplicated() #查看是否有重复
	#输出可见:Out[4]:
	0        False
	1        False
	2        False
	3         True
	4        False
	         ...
	23999    False
	24000     True
	24001    False
	24002    False
	24003     True
c.drop_duplicates() #去重

#Duplicated( )和drop_duplicates( )方法是以默认的方式判断全部的列,
#特定的列进行重复项判断与去重
c.duplicated(['a'])      c.drop_duplicates(['a'])



norepeat_df = df.drop_duplicates(subset=['A_ID', 'B_ID'], keep='first')

#上面的命令去掉UNIT_ID和KPI_ID列中重复的行,并保留重复出现的行中第一次出现的行

补充: 
当keep=False时,就是去掉所有的重复行 
当keep=‘first’时,就是保留第一次出现的重复行 
当keep=’last’时就是保留最后一次出现的重复行。 
(注意,这里的参数是字符串,要加引号!!!)

 
————————————————
版权声明:本文为CSDN博主「大白羊的进阶之路」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/qq_40981268/article/details/86498762

分列2

>>> import pandas as pd
>>> data = pd.read_table("/public/home/xiaoxuan/mydata/hnmeta/04-annotation/23redo_contig/acc_taxid_taxon_filter_all.xls")
>>> data.head()
         contig     acc.version           acc    taxid          ig                                              taxon
0   Unicontig65  WP_028651014.1  WP_028651014   433660   655239724  k__Bacteria\tp__Actinomycetota\tc__Actinomycet...
1   Unicontig97      OFW01544.1      OFW01544  1797185  1082151309  k__Bacteria\tp__Acidobacteriota\tc__unclassifi...
2  Unicontig289      TML78276.1      TML78276  2900548  1671928630  k__Bacteria\tp__Actinomycetota\tc__unclassifie...
3  Unicontig321      OLB53802.1      OLB53802  1805204  1125201363  k__Bacteria\tp__Gemmatimonadota\tc__unclassifi...
4  Unicontig353      PYM69815.1      PYM69815  2053607  1400225245  k__Bacteria\tp__Candidatus Rokubacteria\tc__un...


>>> new = pd.concat([data, data['taxon'].str.split('\t', expand=True)], axis=1)
>>> new.head()
         contig     acc.version  ...                                              5                                                  6
0   Unicontig65  WP_028651014.1  ...                                g__Nocardioides                       s__Nocardioides halotolerans
1   Unicontig97      OFW01544.1  ...          g__unclassified Acidobacteriota genus  s__Acidobacteria bacterium RIFCSPLOWO2_02_FULL...
2  Unicontig289      TML78276.1  ...           g__unclassified Actinomycetota genus                        s__Actinomycetota bacterium
3  Unicontig321      OLB53802.1  ...          g__unclassified Gemmatimonadota genus    s__Gemmatimonadetes bacterium 13_2_20CM_2_69_23
4  Unicontig353      PYM69815.1  ...  g__unclassified Candidatus Rokubacteria genus               s__Candidatus Rokubacteria bacterium

[5 rows x 13 columns]

```>>> new.rename(columns={0:'kingdom',1:'phylum',2:'class',3:'order',4:'family',5:'genus',6:'species'},inplace=True)
>>> new.head()
         contig     acc.version  ...                                          genus                                            species
0   Unicontig65  WP_028651014.1  ...                                g__Nocardioides                       s__Nocardioides halotolerans
1   Unicontig97      OFW01544.1  ...          g__unclassified Acidobacteriota genus  s__Acidobacteria bacterium RIFCSPLOWO2_02_FULL...
2  Unicontig289      TML78276.1  ...           g__unclassified Actinomycetota genus                        s__Actinomycetota bacterium
3  Unicontig321      OLB53802.1  ...          g__unclassified Gemmatimonadota genus    s__Gemmatimonadetes bacterium 13_2_20CM_2_69_23
4  Unicontig353      PYM69815.1  ...  g__unclassified Candidatus Rokubacteria genus               s__Candidatus Rokubacteria bacterium

[5 rows x 13 columns]

>>> new.to_csv("/public/home/xiaoxuan/mydata/hnmeta/04-annotation/23redo_contig/results.xls")

#查看列名
>>> df_tax = pd.read_table("/public/home/xiaoxuan/bxdata/04.annotation/contig-kraken2/contig_taxon_result.xls")
>>> df_tax.head()
                          gene                         accession.version  ...                genus                                 species
0  ular organisms superkingdom  c__unclassified cellular organisms class  ...                  NaN                                     NaN
1      b73l-remix_k127_8424726                                   2017486  ...      g__Nocardioides                  s__Nocardioides sp. S5
2      b73l-remix_k127_8023556                                   1639348  ...  g__Magnetospirillum            s__Magnetospirillum sp. ME-1
3      b73l-remix_k127_4814159                                   2712222  ...          g__Nordella                  s__Nordella sp. HKS 07
4      b73l-remix_k127_5081620                                   2609290  ...    g__Microbacterium  s__unclassified Microbacterium species

[5 rows x 10 columns]

>>> print([column for column in df_tax])
['gene', 'accession.version', 'taxid', 'kindom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

#修改列名
import pandas as pd
df = pd.DataFrame({'a':[1,2,3],'b':[1,2,3]})
如下:
   a  b
0  1  1
1  2  2
2  3  3
1、修改列名a,b为A、B。
df.columns = ['A','B']
2、只修改列名a为A
df.rename(columns={'a':'A'})
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值