目录
对dataframe列处理后增加一列,lambda函数应用在dataframe
创建dataframe
# Build a DataFrame from existing data, labelling the columns "a" and "b".
# NOTE(review): `data1` is not defined in this snippet — presumably two-field row data; verify.
df1 = pd.DataFrame(data1, columns = ["a", "b"])
# Build an empty DataFrame that has only the column labels (this rebinds df1).
df1 = pd.DataFrame(columns = ["a", "b"])
ndarray转化为DataFrame
In:type(y)
Out:numpy.ndarray
# Option 1: import the DataFrame class directly and wrap `y` in a single
# column named 'pre'.
from pandas import DataFrame
pre = DataFrame(y,columns=['pre'])
# Option 2: the same construction through the conventional `pd` alias.
import pandas as pd
pre = pd.DataFrame(y,columns=['pre'])
In:type(pre)
Out:pandas.core.frame.DataFrame
在Dataframe中添加列
# Add a blank column: every row of 'q' is None.
df1['q'] = None
# Add a constant column: every row of 'p' is the string '1' (not the integer 1).
df1['p'] = '1'
Dataframe中一列转为list
# Option 1: take the column's underlying array first, then convert it to a list.
df['a'].values.tolist()
# Option 2: convert the column to a list directly.
df['a'].tolist()
Dataframe重置索引
import pandas as pd
# Discard the old index: drop=True replaces it with a fresh default index.
df_new = df.reset_index(drop=True)
# Keep the old index: without drop=True it is moved into a regular column.
df_new2 = df.reset_index()
nan处理
检查有多少NAN
# Count the missing values in each column and print the per-column totals.
print (rawdata.isnull().sum())
Patent Number 0
Main Target-based Actions 3607
Target-based Actions 3607
dtype: int64
将含有NAN的行去掉
# axis=0 operates on rows: every row containing a missing value is dropped.
data_without_NaN =rawdata.dropna(axis=0)
改变列格式
# Convert column 'a' to another dtype and write it back in place.
df['a'] = df['a'].astype('str')# 'str' is the target dtype
值替换:replace() 及部分替换(参考:https://zhuanlan.zhihu.com/p/30829387)
# Replace the values for A, B and C with '1', '2' and '3'.
# The dict maps each old value to its replacement; the result is a new
# DataFrame bound to df3.
# NOTE(review): the keys carry HTML residue (quote, tag and newline
# characters), and the 'A' key lacks the '/' that the 'B' and 'C' keys
# have — confirm these match the actual cell contents.
df3 = df.replace({'A">\n</td>\n':'1',
'B"/>\n</td>\n':'2',
'C"/>\n</td>\n':'3'})
对dataframe列处理后增加一列,lambda函数应用在dataframe
# Column processing: strip leading/trailing whitespace from every 'id' value.
# NOTE(review): the result is written to `df00` while the input is read
# from `df0` — confirm this is intentional and not a typo.
df00['id'] = df0['id'].apply(lambda x :x.strip())
# Regex filtering: delete every character outside the range U+4E00–U+9FA5,
# so only characters in that range remain in the new 'ch' column.
chapter_test_ch['ch'] = chapter_test.EntryDescription.apply(lambda x:re.sub(r'[^\u4e00-\u9fa5]', "",x))
# Post-process values with a user-defined helper.
def split_apro(x):
    """Return the second space-delimited token of *x* with '(' removed.

    The string is split on single spaces (at most three splits), the
    token at index 1 is taken, every '(' in it is deleted and the result
    is stripped of surrounding whitespace.

    Raises:
        IndexError: if *x* contains no space at all.
    """
    tokens = x.split(' ', 3)
    second_token = tokens[1]
    without_paren = second_token.replace('(', '')
    return without_paren.strip()
# Apply split_apro to every value of the Dname column and store the
# results in the 'AproveNo' column.
df0['AproveNo'] = df0.Dname.apply(lambda x:split_apro(x))
## Apply a function to every row: with axis=1 the lambda receives one row at a time.
# NOTE(review): `sim_l_com_name` is not defined in the visible part of this
# file — confirm where it comes from and what it returns.
df_cot['匹配结果PJB'] = df_cot.apply(lambda x:sim_l_com_name(x,df_PJB,'Citeline Drug ID'),axis=1)
排序
## Sort a DataFrame
# Sort the rows in descending order by the columns listed in `by`.
# NOTE(review): 'colmn1' looks like a misspelled placeholder for
# 'column1' — substitute the real column names.
df_sort = df_hsa.sort_values(axis=0,ascending=False,by=['colmn1','column2'])
## Sort a list of lists
def numeric_sort_key(row):
    """Return the sort key for *row*: (field 3, field 2) as numbers.

    Both fields are stored as numeric strings. Comparing them as strings
    is lexicographic ('8990' > '28450'), so they are converted to float
    to get the intended numeric ordering.
    """
    return (float(row[3]), float(row[2]))


s1 = [['hjh', 'ghg', '8990', '0.8'], ['hgfgh', 'dgg', '28450', '0.5']]
# Highest score (index 3) first; ties are broken by the larger count (index 2).
s2 = sorted(s1, key=numeric_sort_key, reverse=True)
文本、拆分、大小写统一、去重
def m_split3(intxt):
    """Split *intxt* into unique, lower-cased, whitespace-stripped items.

    Text inside parentheses (including the parentheses) is removed first,
    then the string is split on commas, semicolons and newlines.

    Args:
        intxt: the raw text to split.

    Returns:
        A list of the distinct items in first-seen order.
    """
    # Also drop any parenthesised text before splitting.
    without_parens = re.sub(r'\(.*?\)', '', intxt)
    items = [part.strip().lower() for part in re.split(r'[,;\n]', without_parens)]
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(items))
# Company-name suffixes and generic words to remove when cleaning names.
# Several entries appear in more than one spelling (case, trailing dot,
# leading space); the order and the duplicates are kept as they were.
stoplist_com = [
    # "Inc" variants
    ' Inc.', 'Inc', 'Inc.', ' Inc', 'inc', 'inc.',
    # "Ltd" variants
    'Ltd.', 'Ltd', ' Ltd.', 'Ltd', 'ltd', 'ltd.',
    # "S.A." / "S.p.A." variants
    ' S.A.', ' S.A', 'S.A.', 'S.A', 'sa', 's.a.', 'sa.', 's.p.a.',
    # "Corp" variants
    'corp', 'corp.',
    # "Plc" variants
    ' Plc.', ' Plc', 'Plc.', 'Plc', 'plc', 'plc.',
    # "LLC"
    'LLC', 'LLC',
    # Other legal-form abbreviations and generic industry words
    'l.p.', 'nv', 'n.v.', 'co', 'co.', 'ag', 'sa', 'pharma gmbh', 'llc',
    'pharmaceuticals', 'pharmaceutical', 'pharma', 'spa', 'nordisk', 'a/s',
    'ab', 'development', 'aps', 'laboratories', '&', 'health companies',
    'holding', 'eli', 'life', 'science', 'limited.', 'limited',
    'therapeutics', 'therapeutic',
]
def m_split4(intxt):
    """Split a company-name string into cleaned, de-duplicated names.

    Intended for company names. Parenthesised text is removed, the string
    is split on commas, semicolons and newlines, and raw segments found in
    ``stoplist_com`` are skipped. Each remaining segment is stripped and
    lower-cased, and its space-separated words that appear in
    ``stoplist_com`` are dropped.

    Args:
        intxt: the raw company-name text.

    Returns:
        A list of the distinct cleaned names (order is not guaranteed,
        because duplicates are removed through a set).
    """
    without_parens = re.sub('\\(.*?\\)', '', intxt)
    segments = [
        piece.strip().lower()
        for piece in re.split(',|;|\n', without_parens)
        # The stop-list check runs on the raw segment, before strip/lower.
        if piece not in stoplist_com
    ]
    cleaned_names = [
        ' '.join([word for word in segment.split(' ') if word not in stoplist_com])
        for segment in segments
    ]
    return list(set(cleaned_names))
找出两个list中最相似的
import Levenshtein
# Match on the two best pairs: return the sum of the two highest similarities.
def sim_str(l1, l2):
    """Return the sum of the two highest pairwise similarities of *l1* x *l2*.

    Every string in *l1* is compared with every string in *l2* using
    ``Levenshtein.ratio``.

    Args:
        l1: iterable of strings.
        l2: iterable of strings.

    Returns:
        The sum of the two largest ratios. When there is only one pair,
        its ratio is counted twice so the result stays on the same scale.
        When there are no pairs at all (either input is empty), 0.0 is
        returned instead of raising IndexError.
    """
    scores = sorted(
        (Levenshtein.ratio(a, b) for a in l1 for b in l2),
        reverse=True,
    )
    if not scores:
        return 0.0
    if len(scores) == 1:
        return scores[0] + scores[0]
    return scores[0] + scores[1]
# Match on the single best pair: return the highest similarity.
def sim_str_1(l1, l2):
    """Return the highest pairwise similarity between *l1* and *l2*.

    Every string in *l1* is compared with every string in *l2* using
    ``Levenshtein.ratio``.

    Args:
        l1: iterable of strings.
        l2: iterable of strings.

    Returns:
        The largest ratio found, or 0.0 when there are no pairs to
        compare (either input is empty) instead of raising IndexError.
    """
    return max(
        (Levenshtein.ratio(a, b) for a in l1 for b in l2),
        default=0.0,
    )
## l1 is a list; it is compared against every row of the DataFrame.
def sim_l_df(l1, df_2, ID_columnname):
    """Find the row of *df_2* whose names are most similar to *l1*.

    For each row, the value of the 'Names_clean' column is scored against
    *l1* with ``sim_str``.

    Args:
        l1: list of strings to match.
        df_2: DataFrame with a 'Names_clean' column and an ID column.
            Presumably 'Names_clean' holds a list of cleaned names per
            row — verify against the caller.
        ID_columnname: label of the column holding the row identifier.

    Returns:
        ``[score, id, names]`` for the best row, or ``None`` when *df_2*
        has no rows (previously this raised IndexError).
    """
    candidates = [
        [sim_str(l1, row['Names_clean']), row[ID_columnname], row['Names_clean']]
        for _, row in df_2.iterrows()
    ]
    if not candidates:
        return None
    # max() picks the same entry as a descending sort followed by [0],
    # without sorting the whole list.
    return max(candidates)
pdf解析
提取表
2、pdf另存为excel
3、python包
import tabula
# Extract the tables from every page of test.pdf.
# NOTE(review): depending on the installed tabula-py version, read_pdf may
# return a list of DataFrames rather than a single DataFrame — confirm
# before using `df` as a DataFrame.
df = tabula.read_pdf("test.pdf", pages='all')