Use case : Converting Map to multiple columns. There can be raw data with Maptype with multiple key value pair. To do the analysis we need to have each key to be in each column. PySpark supports multiple map functions to get the keys and values of the map columns. Here we will show the example of how to convert the Key value of Maptype to multiple columns.
Sample Code
from pyspark.sql import SparkSession
from pyspark.sql.types import MapType
from pyspark.sql.types import StructType,StructField
from pyspark.sql.types import IntegerType,StringType
from pyspark.sql.functions import col
student_data =[
("Princeton",{"name":"Mike John","age":21,"country":"USA"},101),
("Harvard",{"name":"Jim Morgan","age":20,"country":"UK"},102),
("MIT",{"name":"Wincent Tim","age":22,"country":"INDIA"},103),
("Princeton",{"name":"Doyle Cracker","age":21,"country":"USA"},103),
("Princeton",{"name":"Maria Bob","country":"USA"},103),
("Harvard",{"name":"Nancy Agre","age":21,"country":"USA"},102),
("Harvard",{"name":"Mathew JR.III","age":21},102),
("BITS",{"name":"Philip Walters","age":22,"country":"GERMANY"},104),
("Harvard",{"name":"Alicia Berg","age":21,"country":"USA"},101),
("MIT",{"name":"Wilcow Houdy","age":21},102),]
student_data_schema = StructType([
StructField("stu_ucty",StringType(),True),
StructField("students",MapType(StringType(),StringType()),True),
StructField("class_id",IntegerType(),True),])
student_df = spark.createDataFrame(data=student_data,schema=student_data_schema)
student_df.printSchema()
root
|-- stu_ucty: string (nullable = true)
|-- students: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
|-- class_id: integer (nullable = true)
student_df.show(20,False)
+---------+-------------------------------------------------------+--------+
|stu_ucty |students |class_id|
+---------+-------------------------------------------------------+--------+
|Princeton|[name -> Mike John, country -> USA, age -> 21] |101 |
|Harvard |[name -> Jim Morgan, country -> UK, age -> 20] |102 |
|MIT |[name -> Wincent Tim, country -> INDIA, age -> 22] |103 |
|Princeton|[name -> Doyle Cracker, country -> USA, age -> 21] |103 |
|Princeton|[name -> Maria Bob, country -> USA] |103 |
|Harvard |[name -> Nancy Agre, country -> USA, age -> 21] |102 |
|Harvard |[name -> Mathew JR.III, age -> 21] |102 |
|BITS |[name -> Philip Walters, country -> GERMANY, age -> 22]|104 |
|Harvard |[name -> Alicia Berg, country -> USA, age -> 21] |101 |
|MIT |[name -> Wilcow Houdy, age -> 21] |102 |
+---------+-------------------------------------------------------+--------+
student_df2 = student_df.select(col("stu_ucty").alias("University"),col("students").getItem("name").alias("Student_Name"),col("students").getItem("age").alias("Student_Age"),col("students").getItem("country").alias("Student_Country"))
student_df2.printSchema()
student_df2.show(20,False)
root
|-- University: string (nullable = true)
|-- Student_Name: string (nullable = true)
|-- Student_Age: string (nullable = true)
|-- Student_Country: string (nullable = true)
+----------+--------------+-----------+---------------+
|University|Student_Name |Student_Age|Student_Country|
+----------+--------------+-----------+---------------+
|Princeton |Mike John |21 |USA |
|Harvard |Jim Morgan |20 |UK |
|MIT |Wincent Tim |22 |INDIA |
|Princeton |Doyle Cracker |21 |USA |
|Princeton |Maria Bob |null |USA |
|Harvard |Nancy Agre |21 |USA |
|Harvard |Mathew JR.III |21 |null |
|BITS |Philip Walters|22 |GERMANY |
|Harvard |Alicia Berg |21 |USA |
|MIT |Wilcow Houdy |21 |null |
+----------+--------------+-----------+---------------+
Note : You can see that in Map if there is no Key in a record the value will be default set to Null.
Reference