// How to cast an Object to an int
// https://stackoverflow.com/questions/3661413/how-to-cast-an-object-to-an-int
@Deprecated
public static int toInt(Object obj) {
    if (obj instanceof String) {
        return Integer.parseInt((String) obj);
    } else if (obj instanceof Number) {
        return ((Number) obj).intValue();
    } else {
        String toString = obj.toString();
        if (toString.matches("-?\\d+")) { // backslash must be escaped in a Java string literal
            return Integer.parseInt(toString);
        }
        throw new IllegalArgumentException("This Object doesn't represent an int");
    }
}

// Convert object to int in Java
// http://www.java2s.com/Tutorials/Java/Data_Type/Integer/Convert_object_to_int_in_Java.htm
public class Main {
    public static int toInt(Object object, int defaultValue) {
        if (object == null) {
            return defaultValue;
        } else if (object instanceof Number) {
            return ((Number) object).intValue();
        } else if (object instanceof String) {
            try {
                return Integer.parseInt((String) object);
            } catch (NumberFormatException ex) {
                return defaultValue;
            }
        }
        return defaultValue;
    }

    public static void main(String[] argv) {
        // note: a char literal like '1' would autobox to Character and fall
        // through to the default; pass a String to get the parsed value
        System.out.println(toInt("1", 0)); // prints 1
    }
}
# Have numpy argsort return an array of 2D indices
# https://stackoverflow.com/questions/30577375/have-numpy-argsort-return-an-array-of-2d-indices
>>> import numpy as np
>>> arr = np.array([[5, 2, 4], [3, 3, 3], [6, 1, 2]])
>>> np.dstack(np.unravel_index(np.argsort(arr.ravel()), (3, 3)))
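# Usage sketch: each row of the stacked result is a (row, col) pair, ordered by
# ascending value (the relative order of ties depends on the sort algorithm):
>>> idx = np.dstack(np.unravel_index(np.argsort(arr.ravel()), arr.shape))[0]
>>> idx[0]                  # position of the minimum
array([2, 1])
>>> arr[tuple(idx[0])]
1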

# PySpark: create new column with mapping from a dict
# https://stackoverflow.com/questions/42980704/pyspark-create-new-column-with-mapping-from-a-dict
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
def translate(mapping):
    def translate_(col):
        return mapping.get(col)
    return udf(translate_, StringType())
df = sc.parallelize([('DS',),('G',),('INVALID',)]).toDF(['key'])
mapping ={'A':'S','B':'S','C':'S','DS':'S','DNS':'S','E':'NS','F':'NS','G':'NS','H':'NS'}
df.withColumn("value", translate(mapping)("key"))## https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframefrom pyspark.ml import Pipeline

# Apply StringIndexer to several columns in a PySpark Dataframe
# https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframe
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
    for column in list(set(df.columns) - set(['date']))
]
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(df).transform(df)
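# Usage sketch: keeping the fitted PipelineModel lets the exact same index
# mapping be applied to later data (df_new is hypothetical, same columns as df):
indexer_model = pipeline.fit(df)
df_r = indexer_model.transform(df)
# df_new_r = indexer_model.transform(df_new)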

// How to map variable names to features after pipeline (Scala)
// https://stackoverflow.com/questions/36122559/how-to-map-variable-names-to-features-after-pipeline
val transformedDF = pipelineModel.transform(df)
val meta: org.apache.spark.sql.types.Metadata = transformedDF
.schema(transformedDF.schema.fieldIndex("features")).metadata
meta.getMetadata("ml_attr").getMetadata("attrs")
// org.apache.spark.sql.types.Metadata = {"binary": [
//   {"idx": 0, "name": "e"}, {"idx": 1, "name": "f"}, {"idx": 2, "name": "a"},
//   {"idx": 3, "name": "b"}, {"idx": 4, "name": "c"}]}
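# The same lookup from PySpark is a plain dict access on the field's metadata
# (a sketch; assumes the assembled column is named "features"):
meta = transformedDF.schema["features"].metadata
meta["ml_attr"]["attrs"]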

# Getting labels from StringIndexer stages within pipeline in Spark (pyspark)
# https://stackoverflow.com/questions/45885044/getting-labels-from-stringindexer-stages-within-pipeline-in-spark-pyspark
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, StringIndexerModel

df = spark.createDataFrame([("a", "foo"), ("b", "bar")], ("x1", "x2"))
pipeline = Pipeline(stages=[
    StringIndexer(inputCol=c, outputCol='{}_index'.format(c)) for c in df.columns
])
model = pipeline.fit(df)

# Accessing _java_obj shouldn't be necessary in Spark 2.3+
{x._java_obj.getOutputCol(): x.labels
 for x in model.stages if isinstance(x, StringIndexerModel)}
indexed = model.transform(df)
{c.name: c.metadata["ml_attr"]["vals"]
 for c in indexed.schema.fields if c.name.endswith("_index")}
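# Round-trip sketch: IndexToString can invert an indexed column; with no
# labels argument it reads them from the column metadata written above:
from pyspark.ml.feature import IndexToString
converter = IndexToString(inputCol="x1_index", outputCol="x1_orig")
converter.transform(indexed).show()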

# Pyspark ML - How to save pipeline and RandomForestClassificationModel
# https://stackoverflow.com/questions/44981407/pyspark-ml-how-to-save-pipeline-and-randomforestclassificationmodel
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
labels = label_indexer.fit(data).labels
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
early_stages = [label_indexer, feature_indexer]

# Split the data into training and test sets (30% held out for testing)
(train, test) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

# Convert indexed labels back to original labels.
label_converter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=early_stages + [rf, label_converter])

# Train model. This also runs the indexers.
model = pipeline.fit(train)
model.save("/tmp/rf")>>> rf_model = model.stages[2]>>>print(rf_model)
RandomForestClassificationModel (uid=rfc_b368678f4122) with 10 trees
>>> rf_model.save("/tmp/rf_2")# Split Spark Dataframe string column into multiple columns# https://stackoverflow.com/questions/39235704/split-spark-dataframe-string-column-into-multiple-columns

# Split Spark Dataframe string column into multiple columns
# https://stackoverflow.com/questions/39235704/split-spark-dataframe-string-column-into-multiple-columns
import pyspark.sql.functions
split_col = pyspark.sql.functions.split(df['my_str_col'], '-')
df = df.withColumn('NAME1', split_col.getItem(0))
df = df.withColumn('NAME2', split_col.getItem(1))
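# Sketch for a variable number of parts: element_at accepts a negative index
# to take the last piece (available in pyspark.sql.functions since Spark 2.4):
from pyspark.sql.functions import element_at
df = df.withColumn('LAST_PART', element_at(split_col, -1))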
data =[(("ID1",3,5,78)),(("ID2",4,12,45)),(("ID3",70,3,67))]
df = spark.createDataFrame(data,["ID","colA","colB","colC"])
df.show()
cols = df.columns
# to get max of values in a row
maxcol = F.udf(lambda row: max(row), IntegerType())
maxDF = df.withColumn("maxval", maxcol(F.struct([df[x] for x in df.columns[1:]])))
maxDF.show()
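# Sketch matching the question title: recover the *name* of the max column
# with greatest() plus chained when()/coalesce(), avoiding the Python UDF
# (ties resolve to the first matching column):
max_val = F.greatest(*[F.col(c) for c in cols[1:]])
max_name = F.coalesce(*[F.when(F.col(c) == max_val, F.lit(c)) for c in cols[1:]])
df.withColumn("maxval", max_val).withColumn("maxcol", max_name).show()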