In [1]:
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)
c:\Users\andre\.conda\envs\pyspark\python.exe
3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:34:57) [MSC v.1936 64 bit (AMD64)]
sys.version_info(major=3, minor=10, micro=12, releaselevel='final', serial=0)
In [2]:
import os
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import col, lit
In [4]:
spark = SparkSession.builder \
                    .master("local[1]") \
                    .appName("Intro") \
                    .getOrCreate()

Beispieldatensatz anlegen¶

In [5]:
data = [
    ('France', '01_23', 'Hardware', 120),
    ('France', '02_23', 'Hardware', 90),
    ('France', '03_23', 'Hardware', 140),
    ('France', '01_23', 'Accessories', 240),
    ('France', '02_23', 'Accessories', 360),
    ('France', '03_23', 'Accessories', 200),
    ('Italy',  '01_23', 'Hardware', 130),
    ('Italy',  '02_23', 'Hardware', 70),
    ('Italy',  '03_23', 'Hardware',  150),
    ('Italy',  '01_23', 'Accessories', 260),
    ('Italy',  '02_23', 'Accessories',  340),
    ('Italy',  '03_23', 'Accessories',  300),
    ('USA',  '01_23', 'Accessories',  280),
    ('USA',  '02_23', 'Accessories',  300),
    ('USA',  '03_23', 'Accessories',  320),
    ('Japan',  '01_23', 'Hardware',  180),
    ('Japan',  '02_23', 'Hardware',  100),
    ('Japan',  '03_23', 'Hardware',  120),


]
In [6]:
columns = ['country', 'month', 'category', 'sales' ]
In [7]:
df = spark.createDataFrame(data = data,
                           schema= columns)
In [8]:
df.printSchema()
root
 |-- country: string (nullable = true)
 |-- month: string (nullable = true)
 |-- category: string (nullable = true)
 |-- sales: long (nullable = true)

In [9]:
df.show()
+-------+-----+-----------+-----+
|country|month|   category|sales|
+-------+-----+-----------+-----+
| France|01_23|   Hardware|  120|
| France|02_23|   Hardware|   90|
| France|03_23|   Hardware|  140|
| France|01_23|Accessories|  240|
| France|02_23|Accessories|  360|
| France|03_23|Accessories|  200|
|  Italy|01_23|   Hardware|  130|
|  Italy|02_23|   Hardware|   70|
|  Italy|03_23|   Hardware|  150|
|  Italy|01_23|Accessories|  260|
|  Italy|02_23|Accessories|  340|
|  Italy|03_23|Accessories|  300|
|    USA|01_23|Accessories|  280|
|    USA|02_23|Accessories|  300|
|    USA|03_23|Accessories|  320|
|  Japan|01_23|   Hardware|  180|
|  Japan|02_23|   Hardware|  100|
|  Japan|03_23|   Hardware|  120|
+-------+-----+-----------+-----+

Explorative Datenanalyse¶

In [10]:
df.count()
Out[10]:
18
In [11]:
df.select("country").distinct().show()
+-------+
|country|
+-------+
| France|
|  Italy|
|    USA|
|  Japan|
+-------+

In [12]:
df.select("country").distinct().count()
Out[12]:
4
In [13]:
df.groupBy("country").agg(f.max("sales")).show()
+-------+----------+
|country|max(sales)|
+-------+----------+
| France|       360|
|  Italy|       340|
|    USA|       320|
|  Japan|       180|
+-------+----------+

In [14]:
df.groupBy("country").agg(f.avg("sales")).show()
+-------+------------------+
|country|        avg(sales)|
+-------+------------------+
| France|191.66666666666666|
|  Italy|208.33333333333334|
|    USA|             300.0|
|  Japan|133.33333333333334|
+-------+------------------+

Filter¶

In [15]:
df.filter(col("sales") > 200).show()
+-------+-----+-----------+-----+
|country|month|   category|sales|
+-------+-----+-----------+-----+
| France|01_23|Accessories|  240|
| France|02_23|Accessories|  360|
|  Italy|01_23|Accessories|  260|
|  Italy|02_23|Accessories|  340|
|  Italy|03_23|Accessories|  300|
|    USA|01_23|Accessories|  280|
|    USA|02_23|Accessories|  300|
|    USA|03_23|Accessories|  320|
+-------+-----+-----------+-----+

In [16]:
df.filter(col("country") == "Italy").show()
+-------+-----+-----------+-----+
|country|month|   category|sales|
+-------+-----+-----------+-----+
|  Italy|01_23|   Hardware|  130|
|  Italy|02_23|   Hardware|   70|
|  Italy|03_23|   Hardware|  150|
|  Italy|01_23|Accessories|  260|
|  Italy|02_23|Accessories|  340|
|  Italy|03_23|Accessories|  300|
+-------+-----+-----------+-----+

In [17]:
df.filter((col("country") == "Italy") & (col("category") == "Hardware")).show()
+-------+-----+--------+-----+
|country|month|category|sales|
+-------+-----+--------+-----+
|  Italy|01_23|Hardware|  130|
|  Italy|02_23|Hardware|   70|
|  Italy|03_23|Hardware|  150|
+-------+-----+--------+-----+

In [18]:
df.filter(col("country").isin(["USA", "Japan"])).show()
+-------+-----+-----------+-----+
|country|month|   category|sales|
+-------+-----+-----------+-----+
|    USA|01_23|Accessories|  280|
|    USA|02_23|Accessories|  300|
|    USA|03_23|Accessories|  320|
|  Japan|01_23|   Hardware|  180|
|  Japan|02_23|   Hardware|  100|
|  Japan|03_23|   Hardware|  120|
+-------+-----+-----------+-----+

In [19]:
df.filter(col("month").startswith("01")).show()
+-------+-----+-----------+-----+
|country|month|   category|sales|
+-------+-----+-----------+-----+
| France|01_23|   Hardware|  120|
| France|01_23|Accessories|  240|
|  Italy|01_23|   Hardware|  130|
|  Italy|01_23|Accessories|  260|
|    USA|01_23|Accessories|  280|
|  Japan|01_23|   Hardware|  180|
+-------+-----+-----------+-----+

Aggregierung¶

In [20]:
df.groupBy(["country", "category"]).agg(f.sum("sales")).orderBy(["country", "category"]).show()
+-------+-----------+----------+
|country|   category|sum(sales)|
+-------+-----------+----------+
| France|Accessories|       800|
| France|   Hardware|       350|
|  Italy|Accessories|       900|
|  Italy|   Hardware|       350|
|  Japan|   Hardware|       400|
|    USA|Accessories|       900|
+-------+-----------+----------+

In [21]:
df.groupBy(["country", "category"]).agg(f.sum("sales").alias("total_sales")).orderBy(f.desc("total_sales")).show()
+-------+-----------+-----------+
|country|   category|total_sales|
+-------+-----------+-----------+
|  Italy|Accessories|        900|
|    USA|Accessories|        900|
| France|Accessories|        800|
|  Japan|   Hardware|        400|
| France|   Hardware|        350|
|  Italy|   Hardware|        350|
+-------+-----------+-----------+

In [22]:
df.groupBy("country").agg(f.countDistinct("category")).show()
+-------+---------------+
|country|count(category)|
+-------+---------------+
| France|              2|
|  Italy|              2|
|    USA|              1|
|  Japan|              1|
+-------+---------------+

In [23]:
df.groupBy("country").agg(f.countDistinct("category").alias("Number_of_distinct_categories")).show()
+-------+-----------------------------+
|country|Number_of_distinct_categories|
+-------+-----------------------------+
| France|                            2|
|  Italy|                            2|
|    USA|                            1|
|  Japan|                            1|
+-------+-----------------------------+

Datenaufbereitung¶

In [24]:
countries_dict = {"France": "Europe", "Italy": "Europe", "USA": "Northamerica", "Japan": "Asia"}
In [25]:
df = df.withColumn("region", f.col("country"))
df.printSchema()
root
 |-- country: string (nullable = true)
 |-- month: string (nullable = true)
 |-- category: string (nullable = true)
 |-- sales: long (nullable = true)
 |-- region: string (nullable = true)

Werte mittels Dictionary ersetzen¶

In [26]:
df = df.replace(countries_dict, subset="region")
df.show()
+-------+-----+-----------+-----+------------+
|country|month|   category|sales|      region|
+-------+-----+-----------+-----+------------+
| France|01_23|   Hardware|  120|      Europe|
| France|02_23|   Hardware|   90|      Europe|
| France|03_23|   Hardware|  140|      Europe|
| France|01_23|Accessories|  240|      Europe|
| France|02_23|Accessories|  360|      Europe|
| France|03_23|Accessories|  200|      Europe|
|  Italy|01_23|   Hardware|  130|      Europe|
|  Italy|02_23|   Hardware|   70|      Europe|
|  Italy|03_23|   Hardware|  150|      Europe|
|  Italy|01_23|Accessories|  260|      Europe|
|  Italy|02_23|Accessories|  340|      Europe|
|  Italy|03_23|Accessories|  300|      Europe|
|    USA|01_23|Accessories|  280|Northamerica|
|    USA|02_23|Accessories|  300|Northamerica|
|    USA|03_23|Accessories|  320|Northamerica|
|  Japan|01_23|   Hardware|  180|        Asia|
|  Japan|02_23|   Hardware|  100|        Asia|
|  Japan|03_23|   Hardware|  120|        Asia|
+-------+-----+-----------+-----+------------+

In [27]:
df_avg_sales_by_country_category = df.groupBy(["country", "category"]).agg(f.avg("sales").alias("avg_sales")).orderBy(["country", "category"])
df_avg_sales_by_country_category.show() 
+-------+-----------+------------------+
|country|   category|         avg_sales|
+-------+-----------+------------------+
| France|Accessories| 266.6666666666667|
| France|   Hardware|116.66666666666667|
|  Italy|Accessories|             300.0|
|  Italy|   Hardware|116.66666666666667|
|  Japan|   Hardware|133.33333333333334|
|    USA|Accessories|             300.0|
+-------+-----------+------------------+

In [28]:
df_avg_sales_by_country_category = df_avg_sales_by_country_category.withColumn("avg_sales", f.round("avg_sales").cast("integer"))
df_avg_sales_by_country_category.show()
+-------+-----------+---------+
|country|   category|avg_sales|
+-------+-----------+---------+
| France|Accessories|      267|
| France|   Hardware|      117|
|  Italy|Accessories|      300|
|  Italy|   Hardware|      117|
|  Japan|   Hardware|      133|
|    USA|Accessories|      300|
+-------+-----------+---------+

JOIN von zwei Dataframes¶

In [29]:
df = df.join(df_avg_sales_by_country_category, on = ["country", "category"], how = "inner")
df.show()
+-------+-----------+-----+-----+------------+---------+
|country|   category|month|sales|      region|avg_sales|
+-------+-----------+-----+-----+------------+---------+
| France|   Hardware|01_23|  120|      Europe|      117|
| France|   Hardware|02_23|   90|      Europe|      117|
| France|   Hardware|03_23|  140|      Europe|      117|
| France|Accessories|01_23|  240|      Europe|      267|
| France|Accessories|02_23|  360|      Europe|      267|
| France|Accessories|03_23|  200|      Europe|      267|
|  Italy|Accessories|01_23|  260|      Europe|      300|
|  Italy|Accessories|02_23|  340|      Europe|      300|
|  Italy|Accessories|03_23|  300|      Europe|      300|
|    USA|Accessories|01_23|  280|Northamerica|      300|
|    USA|Accessories|02_23|  300|Northamerica|      300|
|    USA|Accessories|03_23|  320|Northamerica|      300|
|  Italy|   Hardware|01_23|  130|      Europe|      117|
|  Italy|   Hardware|02_23|   70|      Europe|      117|
|  Italy|   Hardware|03_23|  150|      Europe|      117|
|  Japan|   Hardware|01_23|  180|        Asia|      133|
|  Japan|   Hardware|02_23|  100|        Asia|      133|
|  Japan|   Hardware|03_23|  120|        Asia|      133|
+-------+-----------+-----+-----+------------+---------+

In [30]:
df = df.withColumn("difference_from_avg", col("sales") - col("avg_sales"))
df.show()
+-------+-----------+-----+-----+------------+---------+-------------------+
|country|   category|month|sales|      region|avg_sales|difference_from_avg|
+-------+-----------+-----+-----+------------+---------+-------------------+
| France|   Hardware|01_23|  120|      Europe|      117|                  3|
| France|   Hardware|02_23|   90|      Europe|      117|                -27|
| France|   Hardware|03_23|  140|      Europe|      117|                 23|
| France|Accessories|01_23|  240|      Europe|      267|                -27|
| France|Accessories|02_23|  360|      Europe|      267|                 93|
| France|Accessories|03_23|  200|      Europe|      267|                -67|
|  Italy|Accessories|01_23|  260|      Europe|      300|                -40|
|  Italy|Accessories|02_23|  340|      Europe|      300|                 40|
|  Italy|Accessories|03_23|  300|      Europe|      300|                  0|
|    USA|Accessories|01_23|  280|Northamerica|      300|                -20|
|    USA|Accessories|02_23|  300|Northamerica|      300|                  0|
|    USA|Accessories|03_23|  320|Northamerica|      300|                 20|
|  Italy|   Hardware|01_23|  130|      Europe|      117|                 13|
|  Italy|   Hardware|02_23|   70|      Europe|      117|                -47|
|  Italy|   Hardware|03_23|  150|      Europe|      117|                 33|
|  Japan|   Hardware|01_23|  180|        Asia|      133|                 47|
|  Japan|   Hardware|02_23|  100|        Asia|      133|                -33|
|  Japan|   Hardware|03_23|  120|        Asia|      133|                -13|
+-------+-----------+-----+-----+------------+---------+-------------------+

Werte ersetzen mit when()¶

In [31]:
df = df.withColumn('performance', f.when(col('difference_from_avg') > 0, "above average").otherwise("below average"))
df.show()
+-------+-----------+-----+-----+------------+---------+-------------------+-------------+
|country|   category|month|sales|      region|avg_sales|difference_from_avg|  performance|
+-------+-----------+-----+-----+------------+---------+-------------------+-------------+
| France|   Hardware|01_23|  120|      Europe|      117|                  3|above average|
| France|   Hardware|02_23|   90|      Europe|      117|                -27|below average|
| France|   Hardware|03_23|  140|      Europe|      117|                 23|above average|
| France|Accessories|01_23|  240|      Europe|      267|                -27|below average|
| France|Accessories|02_23|  360|      Europe|      267|                 93|above average|
| France|Accessories|03_23|  200|      Europe|      267|                -67|below average|
|  Italy|Accessories|01_23|  260|      Europe|      300|                -40|below average|
|  Italy|Accessories|02_23|  340|      Europe|      300|                 40|above average|
|  Italy|Accessories|03_23|  300|      Europe|      300|                  0|below average|
|    USA|Accessories|01_23|  280|Northamerica|      300|                -20|below average|
|    USA|Accessories|02_23|  300|Northamerica|      300|                  0|below average|
|    USA|Accessories|03_23|  320|Northamerica|      300|                 20|above average|
|  Italy|   Hardware|01_23|  130|      Europe|      117|                 13|above average|
|  Italy|   Hardware|02_23|   70|      Europe|      117|                -47|below average|
|  Italy|   Hardware|03_23|  150|      Europe|      117|                 33|above average|
|  Japan|   Hardware|01_23|  180|        Asia|      133|                 47|above average|
|  Japan|   Hardware|02_23|  100|        Asia|      133|                -33|below average|
|  Japan|   Hardware|03_23|  120|        Asia|      133|                -13|below average|
+-------+-----------+-----+-----+------------+---------+-------------------+-------------+

In [32]:
df = df.withColumn("performance", f.when(col("difference_from_avg") == 0, "average").otherwise(col("performance")))
df.show()
+-------+-----------+-----+-----+------------+---------+-------------------+-------------+
|country|   category|month|sales|      region|avg_sales|difference_from_avg|  performance|
+-------+-----------+-----+-----+------------+---------+-------------------+-------------+
| France|   Hardware|01_23|  120|      Europe|      117|                  3|above average|
| France|   Hardware|02_23|   90|      Europe|      117|                -27|below average|
| France|   Hardware|03_23|  140|      Europe|      117|                 23|above average|
| France|Accessories|01_23|  240|      Europe|      267|                -27|below average|
| France|Accessories|02_23|  360|      Europe|      267|                 93|above average|
| France|Accessories|03_23|  200|      Europe|      267|                -67|below average|
|  Italy|Accessories|01_23|  260|      Europe|      300|                -40|below average|
|  Italy|Accessories|02_23|  340|      Europe|      300|                 40|above average|
|  Italy|Accessories|03_23|  300|      Europe|      300|                  0|      average|
|    USA|Accessories|01_23|  280|Northamerica|      300|                -20|below average|
|    USA|Accessories|02_23|  300|Northamerica|      300|                  0|      average|
|    USA|Accessories|03_23|  320|Northamerica|      300|                 20|above average|
|  Italy|   Hardware|01_23|  130|      Europe|      117|                 13|above average|
|  Italy|   Hardware|02_23|   70|      Europe|      117|                -47|below average|
|  Italy|   Hardware|03_23|  150|      Europe|      117|                 33|above average|
|  Japan|   Hardware|01_23|  180|        Asia|      133|                 47|above average|
|  Japan|   Hardware|02_23|  100|        Asia|      133|                -33|below average|
|  Japan|   Hardware|03_23|  120|        Asia|      133|                -13|below average|
+-------+-----------+-----+-----+------------+---------+-------------------+-------------+

In [33]:
df = df.withColumnRenamed("month", "month_year")
df.printSchema()
root
 |-- country: string (nullable = true)
 |-- category: string (nullable = true)
 |-- month_year: string (nullable = true)
 |-- sales: long (nullable = true)
 |-- region: string (nullable = true)
 |-- avg_sales: integer (nullable = true)
 |-- difference_from_avg: long (nullable = true)
 |-- performance: string (nullable = false)

In [34]:
df = df.withColumn("month", f.substring("month_year", 1,2))\
        .withColumn("year", f.substring("month_year", 4,2))
df.show()
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+
|country|   category|month_year|sales|      region|avg_sales|difference_from_avg|  performance|month|year|
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+
| France|   Hardware|     01_23|  120|      Europe|      117|                  3|above average|   01|  23|
| France|   Hardware|     02_23|   90|      Europe|      117|                -27|below average|   02|  23|
| France|   Hardware|     03_23|  140|      Europe|      117|                 23|above average|   03|  23|
| France|Accessories|     01_23|  240|      Europe|      267|                -27|below average|   01|  23|
| France|Accessories|     02_23|  360|      Europe|      267|                 93|above average|   02|  23|
| France|Accessories|     03_23|  200|      Europe|      267|                -67|below average|   03|  23|
|  Italy|Accessories|     01_23|  260|      Europe|      300|                -40|below average|   01|  23|
|  Italy|Accessories|     02_23|  340|      Europe|      300|                 40|above average|   02|  23|
|  Italy|Accessories|     03_23|  300|      Europe|      300|                  0|      average|   03|  23|
|    USA|Accessories|     01_23|  280|Northamerica|      300|                -20|below average|   01|  23|
|    USA|Accessories|     02_23|  300|Northamerica|      300|                  0|      average|   02|  23|
|    USA|Accessories|     03_23|  320|Northamerica|      300|                 20|above average|   03|  23|
|  Italy|   Hardware|     01_23|  130|      Europe|      117|                 13|above average|   01|  23|
|  Italy|   Hardware|     02_23|   70|      Europe|      117|                -47|below average|   02|  23|
|  Italy|   Hardware|     03_23|  150|      Europe|      117|                 33|above average|   03|  23|
|  Japan|   Hardware|     01_23|  180|        Asia|      133|                 47|above average|   01|  23|
|  Japan|   Hardware|     02_23|  100|        Asia|      133|                -33|below average|   02|  23|
|  Japan|   Hardware|     03_23|  120|        Asia|      133|                -13|below average|   03|  23|
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+

In [35]:
df = df.withColumn("year", f.concat(lit("20"), col("year")))
df.show()
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+
|country|   category|month_year|sales|      region|avg_sales|difference_from_avg|  performance|month|year|
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+
| France|   Hardware|     01_23|  120|      Europe|      117|                  3|above average|   01|2023|
| France|   Hardware|     02_23|   90|      Europe|      117|                -27|below average|   02|2023|
| France|   Hardware|     03_23|  140|      Europe|      117|                 23|above average|   03|2023|
| France|Accessories|     01_23|  240|      Europe|      267|                -27|below average|   01|2023|
| France|Accessories|     02_23|  360|      Europe|      267|                 93|above average|   02|2023|
| France|Accessories|     03_23|  200|      Europe|      267|                -67|below average|   03|2023|
|  Italy|Accessories|     01_23|  260|      Europe|      300|                -40|below average|   01|2023|
|  Italy|Accessories|     02_23|  340|      Europe|      300|                 40|above average|   02|2023|
|  Italy|Accessories|     03_23|  300|      Europe|      300|                  0|      average|   03|2023|
|    USA|Accessories|     01_23|  280|Northamerica|      300|                -20|below average|   01|2023|
|    USA|Accessories|     02_23|  300|Northamerica|      300|                  0|      average|   02|2023|
|    USA|Accessories|     03_23|  320|Northamerica|      300|                 20|above average|   03|2023|
|  Italy|   Hardware|     01_23|  130|      Europe|      117|                 13|above average|   01|2023|
|  Italy|   Hardware|     02_23|   70|      Europe|      117|                -47|below average|   02|2023|
|  Italy|   Hardware|     03_23|  150|      Europe|      117|                 33|above average|   03|2023|
|  Japan|   Hardware|     01_23|  180|        Asia|      133|                 47|above average|   01|2023|
|  Japan|   Hardware|     02_23|  100|        Asia|      133|                -33|below average|   02|2023|
|  Japan|   Hardware|     03_23|  120|        Asia|      133|                -13|below average|   03|2023|
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+

In [36]:
df.printSchema()
root
 |-- country: string (nullable = true)
 |-- category: string (nullable = true)
 |-- month_year: string (nullable = true)
 |-- sales: long (nullable = true)
 |-- region: string (nullable = true)
 |-- avg_sales: integer (nullable = true)
 |-- difference_from_avg: long (nullable = true)
 |-- performance: string (nullable = false)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)

In [42]:
df = df.withColumn("year", col("year").cast("Integer"))
df.printSchema()
root
 |-- country: string (nullable = true)
 |-- category: string (nullable = true)
 |-- month_year: string (nullable = true)
 |-- sales: long (nullable = true)
 |-- region: string (nullable = true)
 |-- avg_sales: integer (nullable = true)
 |-- difference_from_avg: long (nullable = true)
 |-- performance: string (nullable = false)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)

In [40]:
df = df.withColumn("month", col("month").cast("Integer"))
df.printSchema()
root
 |-- country: string (nullable = true)
 |-- category: string (nullable = true)
 |-- month_year: string (nullable = true)
 |-- sales: long (nullable = true)
 |-- region: string (nullable = true)
 |-- avg_sales: integer (nullable = true)
 |-- difference_from_avg: long (nullable = true)
 |-- performance: string (nullable = false)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)

In [41]:
df.show()
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+
|country|   category|month_year|sales|      region|avg_sales|difference_from_avg|  performance|month|year|
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+
| France|   Hardware|     01_23|  120|      Europe|      117|                  3|above average|    1|   1|
| France|   Hardware|     02_23|   90|      Europe|      117|                -27|below average|    2|   2|
| France|   Hardware|     03_23|  140|      Europe|      117|                 23|above average|    3|   3|
| France|Accessories|     01_23|  240|      Europe|      267|                -27|below average|    1|   1|
| France|Accessories|     02_23|  360|      Europe|      267|                 93|above average|    2|   2|
| France|Accessories|     03_23|  200|      Europe|      267|                -67|below average|    3|   3|
|  Italy|Accessories|     01_23|  260|      Europe|      300|                -40|below average|    1|   1|
|  Italy|Accessories|     02_23|  340|      Europe|      300|                 40|above average|    2|   2|
|  Italy|Accessories|     03_23|  300|      Europe|      300|                  0|      average|    3|   3|
|    USA|Accessories|     01_23|  280|Northamerica|      300|                -20|below average|    1|   1|
|    USA|Accessories|     02_23|  300|Northamerica|      300|                  0|      average|    2|   2|
|    USA|Accessories|     03_23|  320|Northamerica|      300|                 20|above average|    3|   3|
|  Italy|   Hardware|     01_23|  130|      Europe|      117|                 13|above average|    1|   1|
|  Italy|   Hardware|     02_23|   70|      Europe|      117|                -47|below average|    2|   2|
|  Italy|   Hardware|     03_23|  150|      Europe|      117|                 33|above average|    3|   3|
|  Japan|   Hardware|     01_23|  180|        Asia|      133|                 47|above average|    1|   1|
|  Japan|   Hardware|     02_23|  100|        Asia|      133|                -33|below average|    2|   2|
|  Japan|   Hardware|     03_23|  120|        Asia|      133|                -13|below average|    3|   3|
+-------+-----------+----------+-----+------------+---------+-------------------+-------------+-----+----+

Spark SQL¶

In [43]:
df.createOrReplaceTempView("data")
In [46]:
spark.sql(""" Select region,
                     country,
                     category,
                     month_year,
                     sales
                from data
          """).show()
+------------+-------+-----------+----------+-----+
|      region|country|   category|month_year|sales|
+------------+-------+-----------+----------+-----+
|      Europe| France|   Hardware|     01_23|  120|
|      Europe| France|   Hardware|     02_23|   90|
|      Europe| France|   Hardware|     03_23|  140|
|      Europe| France|Accessories|     01_23|  240|
|      Europe| France|Accessories|     02_23|  360|
|      Europe| France|Accessories|     03_23|  200|
|      Europe|  Italy|Accessories|     01_23|  260|
|      Europe|  Italy|Accessories|     02_23|  340|
|      Europe|  Italy|Accessories|     03_23|  300|
|Northamerica|    USA|Accessories|     01_23|  280|
|Northamerica|    USA|Accessories|     02_23|  300|
|Northamerica|    USA|Accessories|     03_23|  320|
|      Europe|  Italy|   Hardware|     01_23|  130|
|      Europe|  Italy|   Hardware|     02_23|   70|
|      Europe|  Italy|   Hardware|     03_23|  150|
|        Asia|  Japan|   Hardware|     01_23|  180|
|        Asia|  Japan|   Hardware|     02_23|  100|
|        Asia|  Japan|   Hardware|     03_23|  120|
+------------+-------+-----------+----------+-----+

In [47]:
spark.sql(""" Select region,
                     country,
                     category,
                     month_year,
                     sales
                from data
                where performance == "above average"
          """).show()
+------------+-------+-----------+----------+-----+
|      region|country|   category|month_year|sales|
+------------+-------+-----------+----------+-----+
|      Europe| France|   Hardware|     01_23|  120|
|      Europe| France|   Hardware|     03_23|  140|
|      Europe| France|Accessories|     02_23|  360|
|      Europe|  Italy|Accessories|     02_23|  340|
|Northamerica|    USA|Accessories|     03_23|  320|
|      Europe|  Italy|   Hardware|     01_23|  130|
|      Europe|  Italy|   Hardware|     03_23|  150|
|        Asia|  Japan|   Hardware|     01_23|  180|
+------------+-------+-----------+----------+-----+