pyspark.sql.DataFrame.filter
The PySpark filter function selects the rows of a Spark DataFrame that satisfy a given condition, which makes it a basic building block for data cleansing. Applying filter returns a new DataFrame containing only the matching rows, and conditions can be combined to filter on multiple criteria at once.
Sample Code
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("Filter-Sample @ Freshers.in ").getOrCreate()

emp_schema = ["name", "id", "age", "gender", "dept", "state", "salary", "increment"]
emp_data = [
    ("Sam", 1010, 30, "Male", "IT", "CT", 125000, 8.1),
    ("Peter", 1020, 32, "Male", "HR", "NY", 90000, 7.0),
    ("Tom", 1030, 34, "Male", "IT", "CT", 85000, 5.0),
    ("Joe", 1040, 55, "Male", "HR", "GA", 60000, 4.0),
    ("Barry", 1050, 34, "Male", "IT", "OR", 95000, 6.0),
    ("Mike", 1060, 42, "Male", "SALES", "OR", 70000, 4.0),
    ("Suzy", 1070, 34, "Female", "IT", "GA", 99000, 6.0),
    ("Annie", 1080, 54, "Female", "SALES", "GA", 80000, 3.0),
    ("Bush", 1090, 34, "Male", "IT", "AL", 85000, 5.0),
    ("Evan", 1100, 44, "Female", "CRM", "GA", 75000, 8.0),
]
df = spark.createDataFrame(data=emp_data, schema=emp_schema)
df.show()
+-----+----+---+------+-----+-----+------+---------+
| name|  id|age|gender| dept|state|salary|increment|
+-----+----+---+------+-----+-----+------+---------+
|  Sam|1010| 30|  Male|   IT|   CT|125000|      8.1|
|Peter|1020| 32|  Male|   HR|   NY| 90000|      7.0|
|  Tom|1030| 34|  Male|   IT|   CT| 85000|      5.0|
|  Joe|1040| 55|  Male|   HR|   GA| 60000|      4.0|
|Barry|1050| 34|  Male|   IT|   OR| 95000|      6.0|
| Mike|1060| 42|  Male|SALES|   OR| 70000|      4.0|
| Suzy|1070| 34|Female|   IT|   GA| 99000|      6.0|
|Annie|1080| 54|Female|SALES|   GA| 80000|      3.0|
| Bush|1090| 34|  Male|   IT|   AL| 85000|      5.0|
| Evan|1100| 44|Female|  CRM|   GA| 75000|      8.0|
+-----+----+---+------+-----+-----+------+---------+
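As a side note, DataFrame.where is an alias of DataFrame.filter in PySpark, so the two are interchangeable; a minimal sketch on the df defined above (the salary threshold is just illustrative):
# where() is an alias of filter(); equivalent to df.filter(df.salary > 100000)
df.where(df.salary > 100000).show()  # keeps only Sam (125000)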
# PySpark filter using the & (and) operator
filter_by_sal1_a = df.filter((df.salary > 85000) & (df.salary < 99000))
filter_by_sal1_a.show()
+-----+----+---+------+----+-----+------+---------+
| name|  id|age|gender|dept|state|salary|increment|
+-----+----+---+------+----+-----+------+---------+
|Peter|1020| 32|  Male|  HR|   NY| 90000|      7.0|
|Barry|1050| 34|  Male|  IT|   OR| 95000|      6.0|
+-----+----+---+------+----+-----+------+---------+
# The same PySpark filter expressed as a SQL expression string
filter_by_sal1_b = df.filter("salary > 85000 and salary < 99000")
filter_by_sal1_b.show()
+-----+----+---+------+----+-----+------+---------+
| name|  id|age|gender|dept|state|salary|increment|
+-----+----+---+------+----+-----+------+---------+
|Peter|1020| 32|  Male|  HR|   NY| 90000|      7.0|
|Barry|1050| 34|  Male|  IT|   OR| 95000|      6.0|
+-----+----+---+------+----+-----+------+---------+
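A related sketch, not part of the original example: the ~ operator negates a column condition, so the complement of the rows matched above can be selected directly:
# ~ negates the condition: keeps every row whose salary is NOT strictly
# between 85000 and 99000 (Sam, Tom, Joe, Mike, Suzy, Annie, Bush, Evan)
df.filter(~((df.salary > 85000) & (df.salary < 99000))).show()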
# PySpark filter using the | (or) operator - syntax 1
filter_by_sal2 = df.filter((df.salary == 85000) | (df.salary == 99000))
filter_by_sal2.show()
+----+----+---+------+----+-----+------+---------+
|name|  id|age|gender|dept|state|salary|increment|
+----+----+---+------+----+-----+------+---------+
| Tom|1030| 34|  Male|  IT|   CT| 85000|      5.0|
|Suzy|1070| 34|Female|  IT|   GA| 99000|      6.0|
|Bush|1090| 34|  Male|  IT|   AL| 85000|      5.0|
+----+----+---+------+----+-----+------+---------+
# PySpark filter using | together with col() - syntax 2
filter_by_sal3_a = df.filter((col("salary") == 85000) | (col("salary") == 99000))
filter_by_sal3_a.show()
+----+----+---+------+----+-----+------+---------+
|name|  id|age|gender|dept|state|salary|increment|
+----+----+---+------+----+-----+------+---------+
| Tom|1030| 34|  Male|  IT|   CT| 85000|      5.0|
|Suzy|1070| 34|Female|  IT|   GA| 99000|      6.0|
|Bush|1090| 34|  Male|  IT|   AL| 85000|      5.0|
+----+----+---+------+----+-----+------+---------+
# PySpark filter using a SQL expression string
filter_by_sal3_b = df.filter("salary == 85000 or salary == 99000")
filter_by_sal3_b.show()
+----+----+---+------+----+-----+------+---------+
|name|  id|age|gender|dept|state|salary|increment|
+----+----+---+------+----+-----+------+---------+
| Tom|1030| 34|  Male|  IT|   CT| 85000|      5.0|
|Suzy|1070| 34|Female|  IT|   GA| 99000|      6.0|
|Bush|1090| 34|  Male|  IT|   AL| 85000|      5.0|
+----+----+---+------+----+-----+------+---------+
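When a filter only checks one column against a list of values, Column.isin gives a more compact equivalent to chaining | comparisons; a sketch producing the same three rows as above:
# isin() matches any value in the list; equivalent to the two OR filters above
df.filter(col("salary").isin(85000, 99000)).show()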
# PySpark filter using the between operator (both bounds are inclusive,
# which is why the 85000 and 99000 rows appear below)
filter_by_sal4_a = df.filter(df.salary.between(85000, 99000))
filter_by_sal4_a.show()
+-----+----+---+------+----+-----+------+---------+
| name|  id|age|gender|dept|state|salary|increment|
+-----+----+---+------+----+-----+------+---------+
|Peter|1020| 32|  Male|  HR|   NY| 90000|      7.0|
|  Tom|1030| 34|  Male|  IT|   CT| 85000|      5.0|
|Barry|1050| 34|  Male|  IT|   OR| 95000|      6.0|
| Suzy|1070| 34|Female|  IT|   GA| 99000|      6.0|
| Bush|1090| 34|  Male|  IT|   AL| 85000|      5.0|
+-----+----+---+------+----+-----+------+---------+
# PySpark filter using between, with orderBy descending
filter_by_sal4_b = df.filter(col("salary").between(85000, 99000)).orderBy(df.salary.desc())
filter_by_sal4_b.show()
+-----+----+---+------+----+-----+------+---------+
| name|  id|age|gender|dept|state|salary|increment|
+-----+----+---+------+----+-----+------+---------+
| Suzy|1070| 34|Female|  IT|   GA| 99000|      6.0|
|Barry|1050| 34|  Male|  IT|   OR| 95000|      6.0|
|Peter|1020| 32|  Male|  HR|   NY| 90000|      7.0|
| Bush|1090| 34|  Male|  IT|   AL| 85000|      5.0|
|  Tom|1030| 34|  Male|  IT|   CT| 85000|      5.0|
+-----+----+---+------+----+-----+------+---------+
# PySpark filter using between, with orderBy ascending
filter_by_sal4_c = df.filter(col("salary").between(85000, 99000)).orderBy(df.salary.asc())
filter_by_sal4_c.show()
+-----+----+---+------+----+-----+------+---------+
| name|  id|age|gender|dept|state|salary|increment|
+-----+----+---+------+----+-----+------+---------+
| Bush|1090| 34|  Male|  IT|   AL| 85000|      5.0|
|  Tom|1030| 34|  Male|  IT|   CT| 85000|      5.0|
|Peter|1020| 32|  Male|  HR|   NY| 90000|      7.0|
|Barry|1050| 34|  Male|  IT|   OR| 95000|      6.0|
| Suzy|1070| 34|Female|  IT|   GA| 99000|      6.0|
+-----+----+---+------+----+-----+------+---------+
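orderBy also accepts several sort keys, which helps when rows tie on the first key, as Tom and Bush do at 85000 above; a sketch (the tie-breaking column is just illustrative):
# Sort by salary descending, breaking the 85000 tie by name ascending
df.filter(col("salary").between(85000, 99000)) \
  .orderBy(col("salary").desc(), col("name").asc()) \
  .show()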
# PySpark filter using multiple conditions on different columns
filter_by_sal_dept = df.filter(df.salary.between(85000, 99000) & (df.dept == 'IT'))
filter_by_sal_dept.show()
+-----+----+---+------+----+-----+------+---------+
| name|  id|age|gender|dept|state|salary|increment|
+-----+----+---+------+----+-----+------+---------+
|  Tom|1030| 34|  Male|  IT|   CT| 85000|      5.0|
|Barry|1050| 34|  Male|  IT|   OR| 95000|      6.0|
| Suzy|1070| 34|Female|  IT|   GA| 99000|      6.0|
| Bush|1090| 34|  Male|  IT|   AL| 85000|      5.0|
+-----+----+---+------+----+-----+------+---------+
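One pitfall worth noting when combining conditions: Python's & and | bind more tightly than comparison operators, so each comparison must be wrapped in its own parentheses; a sketch:
# Wrong: without parentheses this parses as a chained comparison involving
# (85000 & df.dept) and raises an error instead of filtering
# df.filter(df.salary > 85000 & df.dept == 'IT')

# Right: parenthesize each comparison before combining with &
df.filter((df.salary > 85000) & (df.dept == 'IT')).show()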