pandas dataframe merge 合并两个df
data = pd.merge(data1, data2, how='left', on=['id'])
更改某列数据类型
data['result'] = data['result'].astype(int)
data['result'] = data['result'].astype(float)
取某几列给新的df
result = data[['id', 'result']]
更改列名
a.rename(columns={'A':'a', 'B':'b', 'C':'c'}, inplace = True)
显示所有列
# 显示所有列
# pd.set_option('display.max_columns', False)
# pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)
# 这两个参数的默认设置都是False
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
获取第几行第几列
data.iloc[1:-1, [3, 4]]  # .ix was removed in pandas 1.0; use .iloc for positional indexing
遍历行(不能修改数据)
for index, row in df.iterrows():
print(row["c1"], row["c2"])
遍历行并修改数据
def valuation_formula(x, y):
    """Return half of the product of x and y (price = x * y * 0.5)."""
    half = 0.5
    return half * x * y
df['price'] = df.apply(lambda row: valuation_formula(row['x'], row['y']), axis=1)
或者
for index, row in dataset.iterrows():
a, b, c, d = func(row[:-4])
# .ix was removed in pandas 1.0; use .loc with a plain column label
dataset.loc[index, '2018-1'] = int(a)
dataset.loc[index, '2018-2'] = int(b)
dataset.loc[index, '2018-3'] = int(c)
dataset.loc[index, '2018-4'] = int(d)
修改满足条件的行
evaluation.loc[(evaluation['province'] == row['province']) & (evaluation['model'] == row['model']) & (
evaluation['regMonth'] == 1), 'forecastVolum'] = row['2018-1']
某列最大 /小值
df['row'].max()
df最大最小值
df.max().max()
查看各列数据类型
df.dtypes
按条件取行
df = df[df['STATUS'] == "ACTIVE"]
两个df合并
result = left.join(right, on='key')
将某一列转化成list
df['a'].tolist()
字符串与时间格式相互转化
import datetime
# str转时间格式:
dd = '2019-03-17 11:00:00'
dd = datetime.datetime.strptime(dd, "%Y-%m-%d %H:%M:%S")
print(dd,type(dd))
# 时间格式转str:
dc = dd.strftime("%Y-%m-%d %H:%M:%S")
print(dc,type(dc))
创建df
df = pd.DataFrame(columns=['a', 'b', 'c'])
逐行追加,增加一行
df.loc[len(df)] = [16, 17, 18, 19]
统计某列元素出现的个数(唯一)
df['a'].value_counts()  # counts occurrences of each unique value in column 'a'
df排序
log_df = log_df.sort_values(['datetime', 'blockID'])
df分组,并进行迭代
for blockID, group in df.groupby('key1'):
print (blockID)
print (group)
df某列转化成list
# 方法1
df['a'].values.tolist()
# 方法2
df['a'].tolist()
# 方法3(不重复元素)
df['a'].drop_duplicates().values.tolist()
list去除重复元素,不改变顺序
word_list = sorted(set(word_list), key=word_list.index)
写文件
with open(path + 'log_seq.txt', 'w', encoding='utf-8') as f:
f.write(log_seq)
获取某列的唯一值
data.drop_duplicates(['name'])['name']
将秒转化成时分秒
import time
print(time.strftime("%H:%M:%S", time.gmtime(666)))
读txt文件
# 读txt文件
def read_data(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Each line is stripped of surrounding whitespace (including the
    trailing newline).
    """
    content = []
    # `with` closes the file even if iteration raises; the original
    # open()/close() pair leaked the handle on error.
    with open(path, 'r', encoding='UTF-8') as file:
        for line in file:
            content.append(line.strip())
    return content
读csv文件
# 读csv文件
def read_data_csv(path):
    """Read a UTF-8 CSV file, skip the header row, and return the
    remaining rows as a list of lists of strings."""
    with open(path, encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # discard the header row
        return [record for record in reader]
写入csv文件
# 写入csv文件
def write_data_csv(path, data):
    """Append one row to a CSV file.

    `data` is a comma-separated string; each comma-delimited field
    becomes one cell of the appended row.
    """
    with open(path, "a", encoding="utf-8", newline='') as csvfile:
        writer = csv.writer(csvfile)
        # The original copied `data.split(',')` element by element into a
        # second list (a no-op) and called writerows([...]); writerow on
        # the split list writes the identical single row.
        writer.writerow(data.split(','))
文本预处理:中文分词,去停用词
# 文本预处理:中文分词,去除停用词
import jieba
def pre_work(string):
    """Tokenize a Chinese string with jieba and remove stopwords.

    Reads the stopword list from "stopwords.txt" (relative to the
    current working directory) on every call, then returns the tokens
    of `string` that are not stopwords, as a list.
    """
    # Segment with jieba; the join/split round-trip mirrors the original
    # tokenization (it can yield empty strings if the input contains spaces).
    words = ' '.join(jieba.cut(string)).split(' ')
    # Load the stopword set; `with` guarantees the file handle is closed
    # even if reading raises (the original open()/close() leaked on error).
    stopwords = set()
    with open("stopwords.txt", 'r', encoding='UTF-8') as file:
        for line in file:
            stopwords.add(line.strip())
    # Keep only tokens that are not stopwords, preserving order.
    return [word for word in words if word not in stopwords]
pandas归一化(放到另外一列)
loss_df['loss_normalization'] = 0
d = loss_df['loss'].apply(lambda x: (x - loss_df['loss'].min()) / (loss_df['loss'].max() - loss_df['loss'].min()))
print(d)
loss_df['loss_normalization'] = d