Feature Selection with PySpark

from pyspark.ml.feature import VectorSlicer

# Keep only the entries at indices 1 and 4 of the "features" vector column;
# the reduced vector is written to a new "sliced" column.
vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
output = vs.transform(df)

# Show the slicer's input column next to its output column so the effect of
# the slicing is visible.
output.select("features", "sliced").show()
from pyspark.ml.feature import RFormula

# R-style model formula: "clicked" is the response, "country" and "hour" are
# the predictors. The assembled predictors go to "features" and the response
# to "label".
formula = RFormula(
    formula="clicked ~ country + hour",
    featuresCol="features",
    labelCol="label",
)

# RFormula is an estimator: fit it on the dataset first, then apply the
# fitted model to produce the features/label columns.
output = formula.fit(dataset).transform(dataset)
output.select("features", "label").show()
from pyspark.ml.feature import ChiSqSelector

# selectorType must be set to "percentile" for the percentile value to be
# used; the default selector type is "numTopFeatures", which ignores it.
selector = ChiSqSelector(
    selectorType="percentile",
    percentile=0.9,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="label",
)

# Fit on the training split only, then reuse the same fitted model for both
# splits so train and test end up with the same selected features.
model = selector.fit(train)

# Replace the original feature vector with the selected one, keeping the
# column name "features" for whatever consumes these DataFrames next.
result = model.transform(train)
train = result.select("label", "selectedFeatures").withColumnRenamed(
    "selectedFeatures", "features"
)

new_test = model.transform(test)
test = new_test.select("label", "selectedFeatures").withColumnRenamed(
    "selectedFeatures", "features"
)

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store