How to use filter or where condition in PySpark

PySpark @ Freshers.in

filter / where

The filter condition will filters rows based on multiple conditions. where() is an alias for filter(). In the below example you will see how to apply a filter and where on DataFrame columns.

Syntax : 
DataFrame.filter(condition)
DataFrame.where(condition)
#Sample Code
from pyspark.sql import SparkSession
from pyspark.sql.types import MapType,ArrayType
from pyspark.sql.types import StructType,StructField, StringType
from pyspark.sql.functions import array_contains
#Input data
emp_data =[
("USA",("Sam","James","John"),("English","Spanish"),"Bachelors","Business"),
("UK",("Michael","David","William"),("English","Spanish","French"),"Masters","Business"),
("CHINA",("Richard","","John"),("English","Spanish","Chineese"),"Masters","Salaried"),
("INDIA",("Thomas","Christopher","Daniel"),("English","Spanish","Hindi"),"Bachelors","Salaried"),
("USA",("Donald","Kenneth","Kevin"),("English","Spanish","German"),"Masters","Business"),
]
#Defining Schema of the data
emp_data_schema = StructType([
StructField("country",StringType(),True),
StructField("name",StructType([
StructField("first_name",StringType(),True),
StructField("middle_name",StringType(),True),
StructField("last_name",StringType(),True)])),
StructField("known_language",ArrayType(StringType(),True)),
StructField("qualification",StringType(),True), 
StructField("job_profile",StringType(),True)
])
#Creating the dataframe from data and schema above
emp_df = spark.createDataFrame(data=emp_data,schema=emp_data_schema)
emp_df.printSchema()
"""
root
 |-- country: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- known_language: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qualification: string (nullable = true)
 |-- job_profile: string (nullable = true)
""" 
emp_df.show(20,False)
"""
+-------+-----------------------------+----------------------------+-------------+-----------+
|country|name                         |known_language              |qualification|job_profile|
+-------+-----------------------------+----------------------------+-------------+-----------+
|USA    |[Sam, James, John]           |[English, Spanish]          |Bachelors    |Business   |
|UK     |[Michael, David, William]    |[English, Spanish, French]  |Masters      |Business   |
|CHINA  |[Richard, , John]            |[English, Spanish, Chineese]|Masters      |Salaried   |
|INDIA  |[Thomas, Christopher, Daniel]|[English, Spanish, Hindi]   |Bachelors    |Salaried   |
|USA    |[Donald, Kenneth, Kevin]     |[English, Spanish, German]  |Masters      |Business   |
+-------+-----------------------------+----------------------------+-------------+-----------+
"""
#Where job_profile = "Business"
#filtering the data with filter condition
emp_df.filter(emp_df.job_profile == "Business").show(20,False)
"""
+-------+-------------------------+--------------------------+-------------+-----------+
|country|name                     |known_language            |qualification|job_profile|
+-------+-------------------------+--------------------------+-------------+-----------+
|USA    |[Sam, James, John]       |[English, Spanish]        |Bachelors    |Business   |
|UK     |[Michael, David, William]|[English, Spanish, French]|Masters      |Business   |
|USA    |[Donald, Kenneth, Kevin] |[English, Spanish, German]|Masters      |Business   |
+-------+-------------------------+--------------------------+-------------+-----------+
"""
emp_df.filter((emp_df.job_profile == "Business") | (emp_df.qualification == "Masters")).show(20,False)
"""
+-------+-------------------------+----------------------------+-------------+-----------+
|country|name                     |known_language              |qualification|job_profile|
+-------+-------------------------+----------------------------+-------------+-----------+
|USA    |[Sam, James, John]       |[English, Spanish]          |Bachelors    |Business   |
|UK     |[Michael, David, William]|[English, Spanish, French]  |Masters      |Business   |
|CHINA  |[Richard, , John]        |[English, Spanish, Chineese]|Masters      |Salaried   |
|USA    |[Donald, Kenneth, Kevin] |[English, Spanish, German]  |Masters      |Business   |
+-------+-------------------------+----------------------------+-------------+-----------+
"""
emp_df.filter((emp_df.job_profile == "Business") & (emp_df.qualification == "Masters")).show(20,False)
emp_df.where((emp_df.job_profile == "Business") & (emp_df.qualification == "Masters")).show(20,False)
"""
+-------+-------------------------+--------------------------+-------------+-----------+
|country|name                     |known_language            |qualification|job_profile|
+-------+-------------------------+--------------------------+-------------+-----------+
|UK     |[Michael, David, William]|[English, Spanish, French]|Masters      |Business   |
|USA    |[Donald, Kenneth, Kevin] |[English, Spanish, German]|Masters      |Business   |
+-------+-------------------------+--------------------------+-------------+-----------+
"""
emp_df.filter((emp_df.job_profile == "Business") & (emp_df.qualification == "Masters") & array_contains(emp_df.known_language,"German")).show(20,False)
emp_df.where((emp_df.job_profile == "Business") & (emp_df.qualification == "Masters") & array_contains(emp_df.known_language,"German")).show(20,False)

"""
+-------+------------------------+--------------------------+-------------+-----------+
|country|name                    |known_language            |qualification|job_profile|
+-------+------------------------+--------------------------+-------------+-----------+
|USA    |[Donald, Kenneth, Kevin]|[English, Spanish, German]|Masters      |Business   |
+-------+------------------------+--------------------------+-------------+-----------+
"""
emp_df.filter(array_contains(emp_df.known_language,"French") | array_contains(emp_df.known_language,"German")).show(20,False)
emp_df.where(array_contains(emp_df.known_language,"French") | array_contains(emp_df.known_language,"German")).show(20,False)
"""
+-------+-------------------------+--------------------------+-------------+-----------+
|country|name                     |known_language            |qualification|job_profile|
+-------+-------------------------+--------------------------+-------------+-----------+
|UK     |[Michael, David, William]|[English, Spanish, French]|Masters      |Business   |
|USA    |[Donald, Kenneth, Kevin] |[English, Spanish, German]|Masters      |Business   |
+-------+-------------------------+--------------------------+-------------+-----------+
"""

Reference

  1. Spark Examples
  2. PySpark Blogs
  3. Bigdata Blogs
  4. Spark Interview Questions
  5. Official Page
Author: user

Leave a Reply