pandas编程技巧3
-
np.nan_to_num(x)
使用0替代数组x中的nan,使用有限的数字代替inf元素
使用背景:在使用numpy数组时,常常会出现nan或者inf的元素,可能会造成数值计算的一些错误。可以使用numpy库的函数nan_to_num(),使得nan和inf能够最简单地转换成相应的数值。
# Demo: np.nan_to_num() replaces NaN with 0.0 and +/-inf with the largest
# representable finite float, so downstream numeric code does not blow up.
# (Removed an unused `from pandas import DataFrame, Series` import.)
import numpy as np

a = np.array([
    [np.nan, np.inf, 0, 10],
    [np.inf, np.nan, np.nan, 3]
])
print(a)
print("nan_to_num()转换后的结果:")  # fixed typo: message previously said "nan_tp_num()"
print(np.nan_to_num(a))

# ----------------------------result----------------------
# [[nan inf  0. 10.]
#  [inf nan nan  3.]]
# nan_to_num()转换后的结果:
# [[0.00000000e+000 1.79769313e+308 0.00000000e+000 1.00000000e+001]
#  [1.79769313e+308 0.00000000e+000 0.00000000e+000 3.00000000e+000]]
-
SparkSession.createDataFrame()
可以获得从 rdd、 python list 和 pandas df 创建 df 的能力
在 Pyspark 操纵 spark-SQL 的世界里借助 session 这个客户端来对内容进行操作和计算
spark sql的相关链接请参考:https://www.cnblogs.com/piperck/p/10446720.html
# Demo: SQLContext.createDataFrame() builds a Spark DataFrame from an RDD
# plus an explicit schema; toPandas() then converts it to a pandas DataFrame.
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType, FloatType


def parse_line(line):
    """Split one comma-separated text line into an (id, user_name) pair."""
    fields = line.strip().split(",")
    return fields[0], fields[1]


if __name__ == "__main__":
    conf = SparkConf()
    conf.setAppName("test createDataFrame function...")
    sc = SparkContext(conf=conf)
    spark = SQLContext(sc)

    data_rdd = sc.textFile("/user/xxxx/test/test.txt").map(parse_line)
    # print(data_rdd)

    # Declare the column layout explicitly instead of letting Spark infer it.
    schema = StructType([
        StructField("id", StringType(), True),
        StructField("user_name", StringType(), True),
    ])
    data_df = spark.createDataFrame(data_rdd, schema=schema)
    # data_df.printSchema
    # data_df.show(false)
    print("*"*30, data_df)
    print("*"*30, data_df.collect())

    # Convert the Spark DataFrame into a pandas DataFrame.
    data_df_pandas = data_df.toPandas()
    print("-"*30, data_df_pandas)

# -------------------------------result------------------
# ('******************************', DataFrame[id: string, user_name: string])
# ('******************************', [Row(id=u'aa', user_name=u'11'),
#  Row(id=u'bb', user_name=u'22'), Row(id=u'cc', user_name=u'33'),
#  Row(id=u'dd', user_name=u'44')])
# ------------------------------
#    id user_name
# 0  aa        11
# 1  bb        22
# 2  cc        33
# 3  dd        44
#
# 附件: text.txt的内容:
# aa,11
# bb,22
# cc,33
# dd,44
-
loc和iloc的区别
loc():只能用index行标和columns列标来筛选数据,即不能出现数字下标索引
iloc():只能用数字下标索引来筛选字段,即不能出现字符串
# Demo: the difference between DataFrame.loc (label-based) and
# DataFrame.iloc (integer-position-based) indexing:
#   loc  -- selects only by index/column *labels* (no integer positions)
#   iloc -- selects only by integer positions (no labels)
# Fixed: the import line was duplicated in the original snippet.
from pandas import DataFrame, Series

df = DataFrame([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12]
])
df.columns = list("abcd")
df.index = list("123")
print(df)
print("*"*10)
# Label slicing with loc is *inclusive* of the end label ("2" is kept).
print(df.loc["1":"2", "a"])
print("*"*10)
print(df["a"])
print("*"*10)
# Position slicing with iloc is end-exclusive, like ordinary Python slices.
print(df.iloc[:2, :2])
print("*"*30)
print(df["a"][:3])         # select column "a" first, then its first 3 rows
print(df[:3][["a", "b"]])  # select the first 3 rows, then columns "a" and "b"

# ----------------------------result-------------------
#    a   b   c   d
# 1  1   2   3   4
# 2  5   6   7   8
# 3  9  10  11  12
# **********
# 1    1
# 2    5
# Name: a, dtype: int64
# **********
# 1    1
# 2    5
# 3    9
# Name: a, dtype: int64
# **********
#    a  b
# 1  1  2
# 2  5  6
# ******************************
# 1    1
# 2    5
# 3    9
# Name: a, dtype: int64
#    a   b
# 1  1   2
# 2  5   6
# 3  9  10
-
rename()
# Demo: DataFrame.rename() -- relabel columns with a mapping, in place.
from pandas import DataFrame, Series

df = DataFrame([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12]
])
df.columns = list("abcd")
df.index = list("123")
print(df)
print("*"*10)
# axis=1 applies the mapping to column labels; inplace=True mutates df itself.
df.rename({"a": "aa"}, axis=1, inplace=True)
print(df)

# ------------------------result------------------
#    a   b   c   d
# 1  1   2   3   4
# 2  5   6   7   8
# 3  9  10  11  12
# **********
#    aa   b   c   d
# 1   1   2   3   4
# 2   5   6   7   8
# 3   9  10  11  12
-
apply()
# Demo: DataFrame.apply(axis=1) -- derive new columns row by row.
from pandas import DataFrame, Series

info = [
    {"order_id": 1001, "name": "wangwu", "score_1": 90, "score_2": 85},
    {"order_id": 1002, "name": "zhangsan", "score_1": 60, "score_2": 75},
    {"order_id": 1003, "name": "lisi", "score_1": 96, "score_2": 95},
    {"order_id": 1001, "name": "wang wu", "score_1": 89, "score_2": 90},
]
df = DataFrame(info)


def _level(score):
    """Map a score to a level: 1 if > 90, 0 if > 60, otherwise -1."""
    if score > 90:
        return 1
    if score > 60:
        return 0
    return -1


df["score1_level"] = df.apply(lambda row: _level(row.score_1), axis=1)
df["score2_level"] = df.apply(lambda row: _level(row.score_2), axis=1)
df["score_total"] = df.apply(lambda row: row.score_1 + row.score_2, axis=1)
print(df)

# --------------------------result---------------------
#    order_id      name  score_1  score_2  score1_level  score2_level  score_total
# 0      1001    wangwu       90       85             0             0          175
# 1      1002  zhangsan       60       75            -1             0          135
# 2      1003      lisi       96       95             1             1          191
# 3      1001   wang wu       89       90             0             0          179
-