A cube is a multi-dimensional generalization of a two- or three-dimensional spreadsheet. Cube is a shorthand for multidimensional dataset, given that data can have an arbitrary number of dimensions. cube function takes a list of column names and will return all possible combinations of grouping the columns after applying aggregate functions. We can use aggregation function for cube. Here I am having a dataframe with two rows and five columns. When I did cube mentioning all the 5 column , I got all the combination of 5 columns
>>> row = [("Name1","Country1","City1","Age1","Pincode1"),("Name2","Country2","City2","Age2","Pincode2")] >>> schema=["Name","Country","City","Age","Pincode"] >>> df=spark.createDataFrame(row,schema) >>> df.show() +-----+--------+-----+----+--------+ | Name| Country| City| Age| Pincode| +-----+--------+-----+----+--------+ |Name1|Country1|City1|Age1|Pincode1| |Name2|Country2|City2|Age2|Pincode2| +-----+--------+-----+----+--------+ >>> df.cube(df["Name"],df["Country"],df["City"],df["Age"],df["Pincode"]).count().show(100,False) +-----+--------+-----+----+--------+-----+ |Name |Country |City |Age |Pincode |count| +-----+--------+-----+----+--------+-----+ |null |Country1|null |null|Pincode1|1 | |null |null |null |null|null |2 | |null |Country1|null |null|null |1 | |null |Country1|null |Age1|Pincode1|1 | |Name2|null |null |null|Pincode2|1 | |Name2|null |City2|Age2|null |1 | |null |null |City1|Age1|null |1 | |null |Country2|City2|Age2|Pincode2|1 | |null |null |City1|null|null |1 | |Name1|null |null |Age1|Pincode1|1 | |Name1|Country1|null |null|Pincode1|1 | |null |null |null |Age1|null |1 | |null |Country2|null |null|Pincode2|1 | |Name1|null |null |null|Pincode1|1 | |null |Country1|City1|null|null |1 | |null |Country2|null |Age2|Pincode2|1 | |null |null |City2|Age2|Pincode2|1 | |null |null |City2|null|Pincode2|1 | |Name1|null |City1|null|Pincode1|1 | |Name1|Country1|City1|Age1|Pincode1|1 | |Name2|null |null |Age2|null |1 | |null |null |City1|null|Pincode1|1 | |Name1|null |null |Age1|null |1 | |Name2|Country2|null |null|Pincode2|1 | |null |null |City1|Age1|Pincode1|1 | |Name2|Country2|null |null|null |1 | |null |null |null |Age2|null |1 | |null |Country2|City2|Age2|null |1 | |null |Country1|City1|null|Pincode1|1 | |Name1|null |City1|null|null |1 | |null |Country1|City1|Age1|null |1 | |null |Country1|City1|Age1|Pincode1|1 | |Name1|null |City1|Age1|null |1 | |null |Country2|City2|null|Pincode2|1 | |Name1|Country1|City1|Age1|null |1 | |Name2|null |null |null|null |1 | |Name1|null |City1|Age1|Pincode1|1 | |null |null |null |null|Pincode1|1 | |null |null |City2|Age2|null |1 | |Name1|Country1|null |Age1|Pincode1|1 | |null |null |null |Age1|Pincode1|1 | |Name2|null |City2|null|null |1 | |Name2|Country2|City2|null|Pincode2|1 | |null |null |null |null|Pincode2|1 | |Name2|null |City2|Age2|Pincode2|1 | |Name2|Country2|City2|Age2|null |1 | |null |null |null |Age2|Pincode2|1 | |Name1|Country1|City1|null|Pincode1|1 | |null |Country2|City2|null|null |1 | |Name1|Country1|null |Age1|null |1 | |Name2|Country2|City2|null|null |1 | |Name2|Country2|City2|Age2|Pincode2|1 | |Name1|Country1|null |null|null |1 | |null |Country1|null |Age1|null |1 | |Name2|null |null |Age2|Pincode2|1 | |Name2|null |City2|null|Pincode2|1 | |Name2|Country2|null |Age2|null |1 | |null |null |City2|null|null |1 | |null |Country2|null |Age2|null |1 | |Name1|null |null |null|null |1 | |Name2|Country2|null |Age2|Pincode2|1 | |null |Country2|null |null|null |1 | |Name1|Country1|City1|null|null |1 | +-----+--------+-----+----+--------+-----+
Example 2
>>> df = spark.createDataFrame([("Tom", 400,50, "Teacher","IND"),("Jack", 420,60, "Finance","USA"),("Brack", 500,10, "Teacher","IND"),("Jim", 700,80, "Finance","JAPAN")],("name", "salary","cnt", "department","country")) >>> df.show() +-----+------+---+----------+-------+ | name|salary|cnt|department|country| +-----+------+---+----------+-------+ | Tom| 400| 50| Teacher| IND| | Jack| 420| 60| Finance| USA| |Brack| 500| 10| Teacher| IND| | Jim| 700| 80| Finance| JAPAN| +-----+------+---+----------+-------+ >>> df.cube(df["country"],df["department"]).sum().show(50,False) +-------+----------+-----------+--------+ |country|department|sum(salary)|sum(cnt)| +-------+----------+-----------+--------+ |JAPAN |null |700 |80 | |null |null |2020 |200 | |USA |null |420 |60 | |null |Finance |1120 |140 | |IND |Teacher |900 |60 | |null |Teacher |900 |60 | |JAPAN |Finance |700 |80 | |USA |Finance |420 |60 | |IND |null |900 |60 | +-------+----------+-----------+--------+ >>> df.cube(df["country"],df["department"]).count().show(50,False) +-------+----------+-----+ |country|department|count| +-------+----------+-----+ |JAPAN |null |1 | |null |null |4 | |USA |null |1 | |null |Finance |2 | |IND |Teacher |2 | |null |Teacher |2 | |JAPAN |Finance |1 | |USA |Finance |1 | |IND |null |2 | +-------+----------+-----+ >>> df.cube(df["country"],df["department"]).min().show(50,False) +-------+----------+-----------+--------+ |country|department|min(salary)|min(cnt)| +-------+----------+-----------+--------+ |JAPAN |null |700 |80 | |null |null |400 |10 | |USA |null |420 |60 | |null |Finance |420 |60 | |IND |Teacher |400 |10 | |null |Teacher |400 |10 | |JAPAN |Finance |700 |80 | |USA |Finance |420 |60 | |IND |null |400 |10 | +-------+----------+-----------+--------+ >>> df.cube(df["department"]).min().show(50,False) +----------+-----------+--------+ |department|min(salary)|min(cnt)| +----------+-----------+--------+ |null |400 |10 | |Teacher |400 |10 | |Finance |420 |60 | +----------+-----------+--------+ >>> df.cube(df["country"],df["department"]).max().show(50,False) +-------+----------+-----------+--------+ |country|department|max(salary)|max(cnt)| +-------+----------+-----------+--------+ |JAPAN |null |700 |80 | |null |null |700 |80 | |USA |null |420 |60 | |null |Finance |700 |80 | |IND |Teacher |500 |50 | |null |Teacher |500 |50 | |JAPAN |Finance |700 |80 | |USA |Finance |420 |60 | |IND |null |500 |50 | +-------+----------+-----------+--------+
>>> df.cube(df["country"],df["department"]).max().sort("country").show(50,False) +-------+----------+-----------+--------+ |country|department|max(salary)|max(cnt)| +-------+----------+-----------+--------+ |null |null |700 |80 | |null |Teacher |500 |50 | |null |Finance |700 |80 | |IND |Teacher |500 |50 | |IND |null |500 |50 | |JAPAN |null |700 |80 | |JAPAN |Finance |700 |80 | |USA |null |420 |60 | |USA |Finance |420 |60 | +-------+----------+-----------+--------+