删除某列或某些列
merge_new = merge.drop(columns=['acc.version','acc','taxid','ig'])
#Drop rows
>>>df.drop([0, 1])
A B C D
2 8 9 10 11
>>> df.drop(index=[0, 1])
A B C D
2 8 9 10 11
获取行列数
print(df.shape)
# (891, 12)
print(df.shape[0])
# 891
print(df.shape[1])
# 12
列求和
import pandas as pd
frame = pd.read_excel("/home/huangdan/10.Defense/1.xlsx")
#所有列求和
frame_sum = frame.sum()
#p.s.开始的时候并没有加入列头:(pandas事实上是默认第一行是列头的)
frame = pd.read_excel("/home/huangdan/10.Defense/1.xlsx",header=None, names= ["system","C1-1-1",...,"D5-4-4"])
或者:
frame = frame[["system", "C1-1-1", "C1-1-2",。。。。"D5-4-4"]] #附列名
group = frame[[ "C1-1-1", "C1-1-2",。。。。"D5-4-4"]].groupby(frame["system"]).sum() #根据system求和
#列求占比,比如相对丰度
# 1. 选择数据的数值部分(排除第一列)
numeric_cols = frame.iloc[:, 1:]
# 2. 计算相对丰度
rel_abun = numeric_cols.div(numeric_cols.sum(axis=0), axis=1) * 100
# 3. 将相对丰度和第一列(Contig)重新组合
rel_abun = pd.concat([frame.iloc[:, [0]], rel_abun], axis=1)
# 输出相对丰度表格
print(rel_abun)
#p.s.报错想查看列名:
print([column for column in df_split])
group.head() #检查
group.to_excel("/home/huangdan/10.Defense/p.xlsx")
或者:
group.to_csv("/home/huangdan/10.Defense/p.csv")
列统计个数
kegg_pathway = pd.read_table( "/public/home/xiaoxuan/mydata/hnmeta/04-annotation_gene/all.annotations.KEGG_Pathway.txt",header=None,names=["geneID", "ko"])
kegg_pathway.loc[:, ["geneID", "ko"]].groupby(by="ko").count()
Out[18]:
geneID
ko
ko00010 368609
ko00020 287531
ko00030 207436
ko00040 98180
ko00051 141445
... ...
map05410 2672
map05412 29
map05414 373
map05416 5148
map05418 39290
vlookup
import pandas as pd
# defence
df_defence = pd.read_table( "/public/home/xiaoxuan/mydata/hnmeta/04-annotation/DB_clu_tax.txt", header=None,names=["#ID"])
df_defence.head()
# abundance
df_abundance = pd.read_csv("/public/home/xiaoxuan/mydata/hnmeta/05-abundance/abundance-tpmean.csv")
df_abundance.head()
df_merge = pd.merge(df_abundance, df_defence, on="#ID", how="inner")
df_merge.head()
分列
#分裂成单独表:
df_split = df_merge["Tax"].str.split(";", expand=True)
#加入原来的列:
pd.concat([datas'].str.split(',', expand=True)], axis=1)
列去重
import pandas as pd
c = pd.read_table( "/public/home/xiaoxuan/mydata/hnmeta/07-hostpre/trna-pro/v-b_match_100.out")
c.duplicated() #查看是否有重复
#输出可见:Out[4]:
0 False
1 False
2 False
3 True
4 False
...
23999 False
24000 True
24001 False
24002 False
24003 True
c.drop_duplicates() #去重
#Duplicated( )和drop_duplicates( )方法是以默认的方式判断全部的列,
#特定的列进行重复项判断与去重
c.duplicated(['a']) c.drop_duplicates(['a'])
norepeat_df = df.drop_duplicates(subset=['A_ID', 'B_ID'], keep='first')
#上面的命令去掉UNIT_ID和KPI_ID列中重复的行,并保留重复出现的行中第一次出现的行
补充:
当keep=False时,就是去掉所有的重复行
当keep=‘first’时,就是保留第一次出现的重复行
当keep=’last’时就是保留最后一次出现的重复行。
(注意,这里的参数是字符串,要加引号!!!)
————————————————
版权声明:本文为CSDN博主「大白羊的进阶之路」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/qq_40981268/article/details/86498762
分列2
>>> import pandas as pd
>>> data = pd.read_table("/public/home/xiaoxuan/mydata/hnmeta/04-annotation/23redo_contig/acc_taxid_taxon_filter_all.xls")
>>> data.head()
contig acc.version acc taxid ig taxon
0 Unicontig65 WP_028651014.1 WP_028651014 433660 655239724 k__Bacteria\tp__Actinomycetota\tc__Actinomycet...
1 Unicontig97 OFW01544.1 OFW01544 1797185 1082151309 k__Bacteria\tp__Acidobacteriota\tc__unclassifi...
2 Unicontig289 TML78276.1 TML78276 2900548 1671928630 k__Bacteria\tp__Actinomycetota\tc__unclassifie...
3 Unicontig321 OLB53802.1 OLB53802 1805204 1125201363 k__Bacteria\tp__Gemmatimonadota\tc__unclassifi...
4 Unicontig353 PYM69815.1 PYM69815 2053607 1400225245 k__Bacteria\tp__Candidatus Rokubacteria\tc__un...
>>> new = pd.concat([data, data['taxon'].str.split('\t', expand=True)], axis=1)
>>> new.head()
contig acc.version ... 5 6
0 Unicontig65 WP_028651014.1 ... g__Nocardioides s__Nocardioides halotolerans
1 Unicontig97 OFW01544.1 ... g__unclassified Acidobacteriota genus s__Acidobacteria bacterium RIFCSPLOWO2_02_FULL...
2 Unicontig289 TML78276.1 ... g__unclassified Actinomycetota genus s__Actinomycetota bacterium
3 Unicontig321 OLB53802.1 ... g__unclassified Gemmatimonadota genus s__Gemmatimonadetes bacterium 13_2_20CM_2_69_23
4 Unicontig353 PYM69815.1 ... g__unclassified Candidatus Rokubacteria genus s__Candidatus Rokubacteria bacterium
[5 rows x 13 columns]
```>>> new.rename(columns={0:'kingdom',1:'phylum',2:'class',3:'order',4:'family',5:'genus',6:'species'},inplace=True)
>>> new.head()
contig acc.version ... genus species
0 Unicontig65 WP_028651014.1 ... g__Nocardioides s__Nocardioides halotolerans
1 Unicontig97 OFW01544.1 ... g__unclassified Acidobacteriota genus s__Acidobacteria bacterium RIFCSPLOWO2_02_FULL...
2 Unicontig289 TML78276.1 ... g__unclassified Actinomycetota genus s__Actinomycetota bacterium
3 Unicontig321 OLB53802.1 ... g__unclassified Gemmatimonadota genus s__Gemmatimonadetes bacterium 13_2_20CM_2_69_23
4 Unicontig353 PYM69815.1 ... g__unclassified Candidatus Rokubacteria genus s__Candidatus Rokubacteria bacterium
[5 rows x 13 columns]
>>> new.to_csv("/public/home/xiaoxuan/mydata/hnmeta/04-annotation/23redo_contig/results.xls")
#查看列名
>>> df_tax = pd.read_table("/public/home/xiaoxuan/bxdata/04.annotation/contig-kraken2/contig_taxon_result.xls")
>>> df_tax.head()
gene accession.version ... genus species
0 ular organisms superkingdom c__unclassified cellular organisms class ... NaN NaN
1 b73l-remix_k127_8424726 2017486 ... g__Nocardioides s__Nocardioides sp. S5
2 b73l-remix_k127_8023556 1639348 ... g__Magnetospirillum s__Magnetospirillum sp. ME-1
3 b73l-remix_k127_4814159 2712222 ... g__Nordella s__Nordella sp. HKS 07
4 b73l-remix_k127_5081620 2609290 ... g__Microbacterium s__unclassified Microbacterium species
[5 rows x 10 columns]
>>> print([column for column in df_tax])
['gene', 'accession.version', 'taxid', 'kindom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
#修改列名
import pandas as pd
df = pd.DataFrame({'a':[1,2,3],'b':[1,2,3]})
如下:
a b
0 1 1
1 2 2
2 3 3
1、修改列名a,b为A、B。
df.columns = ['A','B']
2、只修改列名a为A
df.rename(columns={'a':'A'})