filter / where
The filter condition will filters rows based on multiple conditions. where() is an alias for filter(). In the below example you will see how to apply a filter and where on DataFrame columns.
Syntax : DataFrame.filter(condition) DataFrame.where(condition)
#Sample Code
from pyspark.sql import SparkSession
from pyspark.sql.types import MapType,ArrayType
from pyspark.sql.types import StructType,StructField, StringType
from pyspark.sql.functions import array_contains
#Input data
emp_data =[
("USA",("Sam","James","John"),("English","Spanish"),"Bachelors","Business"),
("UK",("Michael","David","William"),("English","Spanish","French"),"Masters","Business"),
("CHINA",("Richard","","John"),("English","Spanish","Chineese"),"Masters","Salaried"),
("INDIA",("Thomas","Christopher","Daniel"),("English","Spanish","Hindi"),"Bachelors","Salaried"),
("USA",("Donald","Kenneth","Kevin"),("English","Spanish","German"),"Masters","Business"),
]
#Defining Schema of the data
emp_data_schema = StructType([
StructField("country",StringType(),True),
StructField("name",StructType([
StructField("first_name",StringType(),True),
StructField("middle_name",StringType(),True),
StructField("last_name",StringType(),True)])),
StructField("known_language",ArrayType(StringType(),True)),
StructField("qualification",StringType(),True),
StructField("job_profile",StringType(),True)
])
#Creating the dataframe from data and schema above
emp_df = spark.createDataFrame(data=emp_data,schema=emp_data_schema)
emp_df.printSchema()
"""
root
|-- country: string (nullable = true)
|-- name: struct (nullable = true)
| |-- first_name: string (nullable = true)
| |-- middle_name: string (nullable = true)
| |-- last_name: string (nullable = true)
|-- known_language: array (nullable = true)
| |-- element: string (containsNull = true)
|-- qualification: string (nullable = true)
|-- job_profile: string (nullable = true)
"""
emp_df.show(20,False)
"""
+-------+-----------------------------+----------------------------+-------------+-----------+
|country|name |known_language |qualification|job_profile|
+-------+-----------------------------+----------------------------+-------------+-----------+
|USA |[Sam, James, John] |[English, Spanish] |Bachelors |Business |
|UK |[Michael, David, William] |[English, Spanish, French] |Masters |Business |
|CHINA |[Richard, , John] |[English, Spanish, Chineese]|Masters |Salaried |
|INDIA |[Thomas, Christopher, Daniel]|[English, Spanish, Hindi] |Bachelors |Salaried |
|USA |[Donald, Kenneth, Kevin] |[English, Spanish, German] |Masters |Business |
+-------+-----------------------------+----------------------------+-------------+-----------+
"""
#Where job_profile = "Business"
#filtering the data with filter condition
emp_df.filter(emp_df.job_profile == "Business").show(20,False)
"""
+-------+-------------------------+--------------------------+-------------+-----------+
|country|name |known_language |qualification|job_profile|
+-------+-------------------------+--------------------------+-------------+-----------+
|USA |[Sam, James, John] |[English, Spanish] |Bachelors |Business |
|UK |[Michael, David, William]|[English, Spanish, French]|Masters |Business |
|USA |[Donald, Kenneth, Kevin] |[English, Spanish, German]|Masters |Business |
+-------+-------------------------+--------------------------+-------------+-----------+
"""
emp_df.filter((emp_df.job_profile == "Business") | (emp_df.qualification == "Masters")).show(20,False)
"""
+-------+-------------------------+----------------------------+-------------+-----------+
|country|name |known_language |qualification|job_profile|
+-------+-------------------------+----------------------------+-------------+-----------+
|USA |[Sam, James, John] |[English, Spanish] |Bachelors |Business |
|UK |[Michael, David, William]|[English, Spanish, French] |Masters |Business |
|CHINA |[Richard, , John] |[English, Spanish, Chineese]|Masters |Salaried |
|USA |[Donald, Kenneth, Kevin] |[English, Spanish, German] |Masters |Business |
+-------+-------------------------+----------------------------+-------------+-----------+
"""
emp_df.filter((emp_df.job_profile == "Business") & (emp_df.qualification == "Masters")).show(20,False)
emp_df.where((emp_df.job_profile == "Business") & (emp_df.qualification == "Masters")).show(20,False)
"""
+-------+-------------------------+--------------------------+-------------+-----------+
|country|name |known_language |qualification|job_profile|
+-------+-------------------------+--------------------------+-------------+-----------+
|UK |[Michael, David, William]|[English, Spanish, French]|Masters |Business |
|USA |[Donald, Kenneth, Kevin] |[English, Spanish, German]|Masters |Business |
+-------+-------------------------+--------------------------+-------------+-----------+
"""
emp_df.filter((emp_df.job_profile == "Business") & (emp_df.qualification == "Masters") & array_contains(emp_df.known_language,"German")).show(20,False)
emp_df.where((emp_df.job_profile == "Business") & (emp_df.qualification == "Masters") & array_contains(emp_df.known_language,"German")).show(20,False)
"""
+-------+------------------------+--------------------------+-------------+-----------+
|country|name |known_language |qualification|job_profile|
+-------+------------------------+--------------------------+-------------+-----------+
|USA |[Donald, Kenneth, Kevin]|[English, Spanish, German]|Masters |Business |
+-------+------------------------+--------------------------+-------------+-----------+
"""
emp_df.filter(array_contains(emp_df.known_language,"French") | array_contains(emp_df.known_language,"German")).show(20,False)
emp_df.where(array_contains(emp_df.known_language,"French") | array_contains(emp_df.known_language,"German")).show(20,False)
"""
+-------+-------------------------+--------------------------+-------------+-----------+
|country|name |known_language |qualification|job_profile|
+-------+-------------------------+--------------------------+-------------+-----------+
|UK |[Michael, David, William]|[English, Spanish, French]|Masters |Business |
|USA |[Donald, Kenneth, Kevin] |[English, Spanish, German]|Masters |Business |
+-------+-------------------------+--------------------------+-------------+-----------+
"""
Reference