-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathd013_orderby_vs_sort.py
74 lines (50 loc) · 2.17 KB
/
d013_orderby_vs_sort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
https://sparkbyexamples.com/pyspark/pyspark-orderby-and-sort-explained/
https://github.com/spark-examples/pyspark-examples/blob/master/pyspark-orderby.py
Three ways to sort a DataFrame:
    sort()
    orderBy()
    raw SQL
DataFrame.sort() sorts in ascending order by default and supports multiple columns.
DataFrame.orderBy() provides the same usage.
To specify ascending/descending order explicitly:
    df.sort(df.department.asc() / .desc())
Raw SQL (ORDER BY) is also supported.
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, asc, desc
# Start (or reuse) the Spark session used by this script.
spark = (
    SparkSession.builder
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Column names, in the same positional order as the values of each row tuple.
columns = ["employee_name", "department", "state", "salary", "age", "bonus"]

# Sample employee rows used by the sorting examples.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

df = spark.createDataFrame(simpleData, columns)

# Print the DataFrame's schema, then its rows without truncating cell values.
df.printSchema()
df.show(truncate=False)
# sort() and orderBy() are called the same way: first with plain column
# names, then with Column objects built by col().
for sort_method in (df.sort, df.orderBy):
    sort_method("department", "state").show(truncate=False)
    sort_method(col("department"), col("state")).show(truncate=False)

# Explicit direction per column. "department" is always ascending; "state"
# is ascending on the first pass and descending on the second.
for state_descending in (False, True):
    if state_descending:
        attr_state = df.state.desc()
        col_state = col("state").desc()
    else:
        attr_state = df.state.asc()
        col_state = col("state").asc()

    # Same ordering expressed three ways: DataFrame attributes with sort(),
    # col() with sort(), and col() with orderBy().
    df.sort(df.department.asc(), attr_state).show(truncate=False)
    df.sort(col("department").asc(), col_state).show(truncate=False)
    df.orderBy(col("department").asc(), col_state).show(truncate=False)
# Sort with the asc()/desc() helper functions. They build sort expressions,
# which only order rows when passed to sort()/orderBy(); select() merely
# projects columns, so the ordering is applied with orderBy() here.
df.select(
    "employee_name", "department", "state", "salary", "age", "bonus"
).orderBy(asc("department"), desc("state")).show(truncate=False)

# Raw SQL: register the DataFrame as the temporary view "EMP" so that it can
# be queried by name, then sort with ORDER BY.
df.createOrReplaceTempView("EMP")
spark.sql(
    "select employee_name,department,state,salary,age,bonus from EMP ORDER BY department asc"
).show(truncate=False)