defined
class
A
scala> case class B(c: List[A], d: Map[String, A], e: Map[Int, String], f: Map[A, String])
defined class B
scala> def a_gen(i: Int) = A(s"str_$i", i)
a_gen: (i: Int)A
scala> def b_gen(i: Int) = B((1 to 10).map(a_gen).toList, (1 to 10).map(j => s"key_$j" -> a_gen(j)).toMap, (1 to 10).map(j => j -> s"value_$j").toMap, (1 to 10).map(j => a_gen(j) -> s"value_$j").toMap)
b_gen: (i: Int)B
scala> val data = (1 to 10).map(b_gen)
scala> val df = spark.createDataFrame(data)
df: org.apache.spark.sql.DataFrame = [c: array<struct<a:string,b:int>>, d: map<string,struct<a:string,b:int>> ... 2 more fields]
scala> df.show
+--------------------+--------------------+--------------------+--------------------+
|                   c|                   d|                   e|                   f|
+--------------------+--------------------+--------------------+--------------------+
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
+--------------------+--------------------+--------------------+--------------------+
scala> df.printSchema
|-- c: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- a: string (nullable = true)
| | |-- b: integer (nullable = false)
|-- d: map (nullable = true)
| |-- key: string
| |-- value: struct (valueContainsNull = true)
| | |-- a: string (nullable = true)
| | |-- b: integer (nullable = false)
|-- e: map (nullable = true)
| |-- key: integer
| |-- value: string (valueContainsNull = true)
|-- f: map (nullable = true)
| |-- key: struct
| |-- value: string (valueContainsNull = true)
| | |-- a: string (nullable = true)
| | |-- b: integer (nullable = false)
数组/列表(array)的索引方式
我们首先来看一下数组/列表(array)的索引方式:
scala> df.select("c.a").show(10, false)
+-----------------------------------------------------------------------+
|a |
+-----------------------------------------------------------------------+
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
+-----------------------------------------------------------------------+
scala> df.select("c.a").printSchema
|-- a: array (nullable = true)
| |-- element: string (containsNull = true)
scala> df.select(expr("explode(c.a)")).show
+------+
| col|
+------+
| str_1|
| str_2|
| str_3|
| str_4|
| str_5|
| str_6|
| str_7|
| str_8|
| str_9|
|str_10|
| str_1|
| str_2|
| str_3|
| str_4|
| str_5|
| str_6|
| str_7|
| str_8|
| str_9|
|str_10|
+------+
only showing top 20 rows
scala> df.select(expr("explode(c.a)")).printSchema
|-- col: string (nullable = true)
scala> df.select(expr("explode(c)")).show
+------------+
| col|
+------------+
| [str_1, 1]|
| [str_2, 2]|
| [str_3, 3]|
| [str_4, 4]|
| [str_5, 5]|
| [str_6, 6]|
| [str_7, 7]|
| [str_8, 8]|
| [str_9, 9]|
|[str_10, 10]|
| [str_1, 1]|
| [str_2, 2]|
| [str_3, 3]|
| [str_4, 4]|
| [str_5, 5]|
| [str_6, 6]|
| [str_7, 7]|
| [str_8, 8]|
| [str_9, 9]|
|[str_10, 10]|
+------------+
only showing top 20 rows
scala> df.select(expr("explode(c)")).printSchema
|-- col: struct (nullable = true)
| |-- a: string (nullable = true)
| |-- b: integer (nullable = false)
scala> df.select(expr("inline(c)")).show
+------+---+
| a| b|
+------+---+
| str_1| 1|
| str_2| 2|
| str_3| 3|
| str_4| 4|
| str_5| 5|
| str_6| 6|
| str_7| 7|
| str_8| 8|
| str_9| 9|
|str_10| 10|
| str_1| 1|
| str_2| 2|
| str_3| 3|
| str_4| 4|
| str_5| 5|
| str_6| 6|
| str_7| 7|
| str_8| 8|
| str_9| 9|
|str_10| 10|
+------+---+
only showing top 20 rows
scala> df.select(expr("inline(c)")).printSchema
|-- a: string (nullable = true)
|-- b: integer (nullable = false)
scala> df.select(expr("posexplode(d)")).printSchema
|-- pos: integer (nullable = false)
|-- key: string (nullable = false)
|-- value: struct (nullable = true)
| |-- a: string (nullable = true)
| |-- b: integer (nullable = false)
scala> df.select(expr("posexplode(e)")).printSchema
|-- pos: integer (nullable = false)
|-- key: integer (nullable = false)
|-- value: string (nullable = true)
scala> df.select(expr("posexplode(f)")).show
+---+------------+--------+
|pos| key| value|
+---+------------+--------+
| 0| [str_8, 8]| value_8|
| 1|[str_10, 10]|value_10|
| 2| [str_3, 3]| value_3|
| 3| [str_1, 1]| value_1|
| 4| [str_6, 6]| value_6|
| 5| [str_5, 5]| value_5|
| 6| [str_7, 7]| value_7|
| 7| [str_2, 2]| value_2|
| 8| [str_4, 4]| value_4|
| 9| [str_9, 9]| value_9|
| 0| [str_8, 8]| value_8|
| 1|[str_10, 10]|value_10|
| 2| [str_3, 3]| value_3|
| 3| [str_1, 1]| value_1|
| 4| [str_6, 6]| value_6|
| 5| [str_5, 5]| value_5|
| 6| [str_7, 7]| value_7|
| 7| [str_2, 2]| value_2|
| 8| [str_4, 4]| value_4|
| 9| [str_9, 9]| value_9|
+---+------------+--------+
scala> df.select(expr("posexplode(f)")).printSchema
|-- pos: integer (nullable = false)
|-- key: struct (nullable = false)
| |-- a: string (nullable = true)
| |-- b: integer (nullable = false)
|-- value: string (nullable = true)
scala> df.select("d.key_1").show
+----------+
| key_1|
+----------+
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
+----------+
scala> df.select("d.key_1").printSchema
|-- key_1: struct (nullable = true)
| |-- a: string (nullable = true)
| |-- b: integer (nullable = false)
scala> df.select("e.1").show
+-------+
| 1|
+-------+
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
+-------+
scala> df.select("e.1").printSchema
|-- 1: string (nullable = true)
在学习了struct和array的取值后,再看map的取值是不是就特别简单了?下面我们来看一个难一点的例子。
最有意思的就是f这个map了,我们用struct作为map的key。
这种情况下,我们可以用rowConstructor表达式(即用括号括起来、以逗号分隔的多个namedExpression)来构造这个struct:
scala> df.select(expr("f[('str_1' AS a, 1 AS b)]")).show
+---------------------------------------------+
|f[named_struct(a, str_1 AS `a`, b, 1 AS `b`)]|
+---------------------------------------------+
| value_1|
| value_1|
| value_1|
| value_1|
| value_1|
| value_1|
| value_1|
| value_1|
| value_1|
| value_1|
+---------------------------------------------+
scala> df.select(expr("f[('str_1' AS a, 1 AS b)]")).printSchema
|-- f[named_struct(a, str_1 AS `a`, b, 1 AS `b`)]: string (nullable = true)
以上这种构造方式当然不是凭空想出来的,依据仍然是我之前在另一篇博客里介绍的查看方式:https://blog.csdn.net/wang_wbq/article/details/79673780
我们可以在SqlBase.g4文件中找到以下语法规则描述:
primaryExpression
: #前面太长不看
| '(' namedExpression (',' namedExpression)+ ')' #rowConstructor
#中间太长不看
| value=primaryExpression '[' index=valueExpression ']' #subscript
#后面太长不看
valueExpression
: primaryExpression
#后面太长不看
namedExpression
: expression (AS? (identifier | identifierList))?
从上面我们可以看出:
1、中括号里需要放置valueExpression
2、valueExpression可以是一个primaryExpression
3、primaryExpression可以是一个'(' namedExpression (',' namedExpression)+ ')'结构
4、namedExpression又是一个expression AS alias的结构
因此,显而易见,我们可以用这种方式来构造结构体去匹配map的key