Custom transformer with cross-validation in a PySpark pipeline

I have written a custom transformer, like the one described here.
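In outline it looks something like the following: a Transformer with input/output column params plus an extra stopwords param. This is a minimal sketch following the usual pyspark.ml custom-Transformer pattern, written against a recent PySpark API (the post itself targets Spark 1.4, where module paths differ slightly); the names and details are illustrative, not the exact code from the linked post:

    import nltk

    from pyspark import keyword_only
    from pyspark.ml import Transformer
    from pyspark.ml.param import Param
    from pyspark.ml.param.shared import HasInputCol, HasOutputCol
    from pyspark.sql.functions import udf
    from pyspark.sql.types import ArrayType, StringType

    class NLTKWordPunctTokenizer(Transformer, HasInputCol, HasOutputCol):
        """Tokenizes a string column with NLTK and drops the given stopwords."""

        @keyword_only
        def __init__(self, inputCol=None, outputCol=None, stopwords=None):
            super(NLTKWordPunctTokenizer, self).__init__()
            # Extra Param beyond the shared input/output column mixins.
            self.stopwords = Param(self, "stopwords", "words to filter out")
            self._setDefault(stopwords=set())
            kwargs = self._input_kwargs
            self.setParams(**kwargs)

        @keyword_only
        def setParams(self, inputCol=None, outputCol=None, stopwords=None):
            kwargs = self._input_kwargs
            return self._set(**kwargs)

        def getStopwords(self):
            return self.getOrDefault(self.stopwords)

        def _transform(self, dataset):
            stopwords = self.getStopwords()

            def tokenize(s):
                tokens = nltk.tokenize.wordpunct_tokenize(s)
                return [t for t in tokens if t.lower() not in stopwords]

            # Wrap the tokenizer in a UDF and append the result column.
            tokenize_udf = udf(tokenize, ArrayType(StringType()))
            return dataset.withColumn(self.getOutputCol(),
                                      tokenize_udf(dataset[self.getInputCol()]))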

When I build a pipeline with my transformer, I can train a (logistic regression) model for classification.

However, when I want to perform cross-validation with such a pipeline:

    import nltk  # the stopword set below comes from NLTK

    from pyspark.ml.feature import HashingTF
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

    # sqlContext is the SQLContext provided by the Spark shell;
    # NLTKWordPunctTokenizer is the custom transformer sketched above.
    sentenceDataFrame = sqlContext.createDataFrame([
        (1.0, "Hi I heard about Spark"),
        (1.0, "Spark is awesome"),
        (0.0, "But there seems to be a problem"),
        (0.0, "And I don't know why...")
    ], ["label", "sentence"])

    tokenizer = NLTKWordPunctTokenizer(
        inputCol="sentence", outputCol="words",
        stopwords=set(nltk.corpus.stopwords.words('english')))

    hasher = HashingTF(inputCol="words", outputCol="features")
    lr = LogisticRegression()

    pipeline = Pipeline(stages=[tokenizer, hasher, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, (0.01, 0.1)) \
        .addGrid(lr.tol, (1e-5, 1e-6)) \
        .build()

    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=BinaryClassificationEvaluator(),
                        numFolds=4)

    model = cv.fit(sentenceDataFrame)

I get a Py4JJavaError, with the following Python stack trace:

    Py4JJavaError Traceback (most recent call last)
    in <module>()
         22                     numFolds=4)
         23
    ---> 24 model = cv.fit(sentenceDataFrame)

    ~/spark-1.4.1-bin-hadoop2.6/python/pyspark/ml/pipeline.pyc in fit(self, dataset, params)
         63                 return self.copy(params)._fit(dataset)
         64             else:
    ---> 65                 return self._fit(dataset)
         66         else:
         67             raise ValueError("Params must be either a param map or a list/tuple of param maps, "

    ~/spark-1.4.1-bin-hadoop2.6/python/pyspark/ml/tuning.pyc in _fit(self, dataset)
        220             train = df.filter(~condition)
        221             for j in range(numModels):
    --> 222                 model = est.fit(train, epm[j])
        223                 # TODO: duplicate evaluator to take extra params from input
        224                 metric = eva.evaluate(model.transform(validation, epm[j]))

    ~/spark-1.4.1-bin-hadoop2.6/python/pyspark/ml/pipeline.pyc in fit(self, dataset, params)
         61         elif isinstance(params, dict):
         62             if params:
    ---> 63                 return self.copy(params)._fit(dataset)
         64             else:
         65                 return self._fit(dataset)

    ~/spark-1.4.1-bin-hadoop2.6/python/pyspark/ml/pipeline.pyc in _fit(self, dataset)
        196                 dataset = stage.transform(dataset)
        197             else:  # must be an Estimator
    --> 198                 model = stage.fit(dataset)
        199                 transformers.append(model)
        200                 if i < indexOfLastEstimator:

    ~/spark-1.4.1-bin-hadoop2.6/python/pyspark/ml/pipeline.pyc in fit(self, dataset, params)
         63                 return self.copy(params)._fit(dataset)
         64             else:
    ---> 65                 return self._fit(dataset)
         66         else:
         67             raise ValueError("Params must be either a param map or a list/tuple of param maps, "

    ~/spark-1.4.1-bin-hadoop2.6/python/pyspark/ml/wrapper.pyc in _fit(self, dataset)
        129
        130     def _fit(self, dataset):
    --> 131         java_model = self._fit_java(dataset)
        132         return self._create_model(java_model)
        133

    ~/spark-1.4.1-bin-hadoop2.6/python/pyspark/ml/wrapper.pyc in _fit_java(self, dataset)
        126         """
        127         self._transfer_params_to_java()
    --> 128         return self._java_obj.fit(dataset._jdf)
        129
        130     def _fit(self, dataset):

    ~/spark-1.4.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
        536         answer = self.gateway_client.send_command(command)
        537         return_value = get_return_value(answer, self.gateway_client,
    --> 538                 self.target_id, self.name)
        539
        540         for temp_arg in temp_args:

    ~/spark-1.4.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
        298             raise Py4JJavaError(
        299                 'An error occurred while calling {0}{1}{2}.\n'.
    --> 300                 format(target_id, '.', name), value)
        301         else:
        302             raise Py4JError(

I worked around this error by transforming the DataFrame up front, i.e. by moving my transformer out of the pipeline.
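Concretely, the workaround looks roughly like this (a sketch with illustrative names, reusing the objects defined above):

    # Run the custom tokenizer once, up front, outside the pipeline...
    tokenizedDataFrame = tokenizer.transform(sentenceDataFrame)

    # ...so that cross-validation only ever fits built-in stages.
    pipelineWithoutTokenizer = Pipeline(stages=[hasher, lr])
    cv = CrossValidator(estimator=pipelineWithoutTokenizer,
                        estimatorParamMaps=paramGrid,
                        evaluator=BinaryClassificationEvaluator(),
                        numFolds=4)
    model = cv.fit(tokenizedDataFrame)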

But I would really like to keep all steps in one processing pipeline, so that I can classify unseen data without re-running any of the earlier steps by hand, and so that I can also tune the feature-extraction parameters. Any help is appreciated.
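For instance, with the tokenizer and hasher inside the cross-validated pipeline, the grid could cover the feature-extraction stage as well, something like this (illustrative values):

    paramGrid = ParamGridBuilder() \
        .addGrid(hasher.numFeatures, (1 << 18, 1 << 20)) \
        .addGrid(lr.regParam, (0.01, 0.1)) \
        .addGrid(lr.tol, (1e-5, 1e-6)) \
        .build()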
