// How to cast an Object to an int
// https://stackoverflow.com/questions/3661413/how-to-cast-an-object-to-an-int
@Deprecated
public static int toInt(Object obj) {
    if (obj instanceof String) {
        return Integer.parseInt((String) obj);
    } else if (obj instanceof Number) {
        return ((Number) obj).intValue();
    } else {
        String toString = obj.toString();
        if (toString.matches("-?\\d+")) { // backslash must be escaped in a Java string literal
            return Integer.parseInt(toString);
        }
        throw new IllegalArgumentException("This Object doesn't represent an int");
    }
}

// Convert object to int in Java
// http://www.java2s.com/Tutorials/Java/Data_Type/Integer/Convert_object_to_int_in_Java.htm
public class Main {
    public static int toInt(Object object, int defaultValue) {
        if (object == null) {
            return defaultValue;
        } else if (object instanceof Number) {
            return ((Number) object).intValue();
        } else if (object instanceof String) {
            try {
                return Integer.parseInt((String) object);
            } catch (NumberFormatException ex) {
                return defaultValue;
            }
        }
        return defaultValue;
    }

    public static void main(String[] argv) {
        // note: a char literal like '1' would autobox to Character and fall
        // through to the default; pass a String to get the parsed value
        System.out.println(toInt("1", 0)); // prints 1
    }
}
# Have numpy argsort return an array of 2D indices
# https://stackoverflow.com/questions/30577375/have-numpy-argsort-return-an-array-of-2d-indices
>>> import numpy as np
>>> arr = np.array([[5, 2, 4], [3, 3, 3], [6, 1, 2]])
>>> np.dstack(np.unravel_index(np.argsort(arr.ravel()), (3, 3)))
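# Usage sketch: each row of the stacked result is a (row, col) pair, ordered by
# ascending value (the relative order of ties depends on the sort algorithm):
>>> idx = np.dstack(np.unravel_index(np.argsort(arr.ravel()), arr.shape))[0]
>>> idx[0]                  # position of the minimum
array([2, 1])
>>> arr[tuple(idx[0])]
1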

# PySpark: create new column with mapping from a dict
# https://stackoverflow.com/questions/42980704/pyspark-create-new-column-with-mapping-from-a-dict
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
def translate(mapping):
    def translate_(col):
        return mapping.get(col)
    return udf(translate_, StringType())
df = sc.parallelize([('DS',),('G',),('INVALID',)]).toDF(['key'])
mapping ={'A':'S','B':'S','C':'S','DS':'S','DNS':'S','E':'NS','F':'NS','G':'NS','H':'NS'}
df.withColumn("value", translate(mapping)("key"))## https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframefrom pyspark.ml import Pipeline

# Apply StringIndexer to several columns in a PySpark Dataframe
# https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframe
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
    for column in list(set(df.columns) - set(['date']))
]
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(df).transform(df)
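# Usage sketch: keeping the fitted PipelineModel lets the exact same index
# mapping be applied to later data (df_new is hypothetical, same columns as df):
indexer_model = pipeline.fit(df)
df_r = indexer_model.transform(df)
# df_new_r = indexer_model.transform(df_new)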

// How to map variable names to features after pipeline (Scala)
// https://stackoverflow.com/questions/36122559/how-to-map-variable-names-to-features-after-pipeline
val transformedDF = pipelineModel.transform(df)
val meta: org.apache.spark.sql.types.Metadata = transformedDF
.schema(transformedDF.schema.fieldIndex("features")).metadata
meta.getMetadata("ml_attr").getMetadata("attrs")
// org.apache.spark.sql.types.Metadata = {"binary": [
//   {"idx": 0, "name": "e"}, {"idx": 1, "name": "f"}, {"idx": 2, "name": "a"},
//   {"idx": 3, "name": "b"}, {"idx": 4, "name": "c"}]}
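# The same lookup from PySpark is a plain dict access on the field's metadata
# (a sketch; assumes the assembled column is named "features"):
meta = transformedDF.schema["features"].metadata
meta["ml_attr"]["attrs"]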

# Getting labels from StringIndexer stages within pipeline in Spark (pyspark)
# https://stackoverflow.com/questions/45885044/getting-labels-from-stringindexer-stages-within-pipeline-in-spark-pyspark
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, StringIndexerModel

df = spark.createDataFrame([("a", "foo"), ("b", "bar")], ("x1", "x2"))
pipeline = Pipeline(stages=[
    StringIndexer(inputCol=c, outputCol='{}_index'.format(c)) for c in df.columns
])
model = pipeline.fit(df)

# Accessing _java_obj shouldn't be necessary in Spark 2.3+
{x._java_obj.getOutputCol(): x.labels
 for x in model.stages if isinstance(x, StringIndexerModel)}
indexed = model.transform(df)
{c.name: c.metadata["ml_attr"]["vals"]
 for c in indexed.schema.fields if c.name.endswith("_index")}
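# Round-trip sketch: IndexToString can invert an indexed column; with no
# labels argument it reads them from the column metadata written above:
from pyspark.ml.feature import IndexToString
converter = IndexToString(inputCol="x1_index", outputCol="x1_orig")
converter.transform(indexed).show()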

# Pyspark ML - How to save pipeline and RandomForestClassificationModel
# https://stackoverflow.com/questions/44981407/pyspark-ml-how-to-save-pipeline-and-randomforestclassificationmodel
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
labels = label_indexer.fit(data).labels
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
early_stages = [label_indexer, feature_indexer]

# Split the data into training and test sets (30% held out for testing)
(train, test) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

# Convert indexed labels back to original labels.
label_converter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=early_stages + [rf, label_converter])

# Train model. This also runs the indexers.
model = pipeline.fit(train)
model.save("/tmp/rf")>>> rf_model = model.stages[2]>>>print(rf_model)
RandomForestClassificationModel (uid=rfc_b368678f4122) with 10 trees
>>> rf_model.save("/tmp/rf_2")# Split Spark Dataframe string column into multiple columns# https://stackoverflow.com/questions/39235704/split-spark-dataframe-string-column-into-multiple-columns

# Split Spark Dataframe string column into multiple columns
# https://stackoverflow.com/questions/39235704/split-spark-dataframe-string-column-into-multiple-columns
import pyspark.sql.functions
split_col = pyspark.sql.functions.split(df['my_str_col'], '-')
df = df.withColumn('NAME1', split_col.getItem(0))
df = df.withColumn('NAME2', split_col.getItem(1))
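# Sketch for a variable number of parts: element_at accepts a negative index
# to take the last piece (available in pyspark.sql.functions since Spark 2.4):
from pyspark.sql.functions import element_at
df = df.withColumn('LAST_PART', element_at(split_col, -1))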
data =[(("ID1",3,5,78)),(("ID2",4,12,45)),(("ID3",70,3,67))]
df = spark.createDataFrame(data,["ID","colA","colB","colC"])
df.show()
cols = df.columns
# to get max of values in a row
maxcol = F.udf(lambda row: max(row), IntegerType())
maxDF = df.withColumn("maxval", maxcol(F.struct([df[x] for x in df.columns[1:]])))
maxDF.show()
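# Sketch matching the question title: recover the *name* of the max column
# with greatest() plus chained when()/coalesce(), avoiding the Python UDF
# (ties resolve to the first matching column):
max_val = F.greatest(*[F.col(c) for c in cols[1:]])
max_name = F.coalesce(*[F.when(F.col(c) == max_val, F.lit(c)) for c in cols[1:]])
df.withColumn("maxval", max_val).withColumn("maxcol", max_name).show()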