I'm following delta lake liquid clustering document, and got an error [PARSE_SYNTAX_ERROR] Syntax error at or near 'CLUSTER'.
. As I understood, the following code should work since it is supported by delta lake 3.3.
This is the python library version I'm using.
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, TimestampType, IntegerType, ShortType
REQUIRED_JARS = [
"io.delta:delta-spark_2.12:3.3.0",
"io.delta:delta-storage:3.3.0",
]
spark = SparkSession.builder \
.config("spark.driver.memory", "16g") \
.config("spark.executor.memory", "64g") \
.config("spark.log.level", "ERROR") \
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
.config("spark.jars.packages", ",".join(REQUIRED_JARS)) \
.getOrCreate()
DeltaTable.createOrReplace() \
.tableName("SAMPLE") \
.addColumn("CLUSTER_TYPE", dataType = StringType()) \
.addColumn("TIMESTAMP", dataType = TimestampType()) \
.addColumn("CLUSTER_NAME", dataType = StringType()) \
.clusterBy("DATA_TIMESTAMP", "CLUSTER_TYPE") \
.execute()
spark.sql("ALTER TABLE SAMPLE CLUSTER BY ('CLUSTER_TYPE')");
---------------------------------------------------------------------------
ParseException Traceback (most recent call last)
Cell In[11], line 2
1 spark.sql(f"ALTER TABLE SAMPLE SET TBLPROPERTIES ('delta.enableDeletionVectors' = false);")
----> 2 spark.sql(f"""ALTER TABLE SAMPLE
3 CLUSTER BY ('CLUSTER_TYPE');""")
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/pyspark/sql/session.py:1631, in SparkSession.sql(self, sqlQuery, args, **kwargs)
1627 assert self._jvm is not None
1628 litArgs = self._jvm.PythonUtils.toArray(
1629 [_to_java_column(lit(v)) for v in (args or [])]
1630 )
-> 1631 return DataFrame(self._jsparkSession.sql(sqlQuery, litArgs), self)
1632 finally:
1633 if len(kwargs) > 0:
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317 self.command_header +\
1318 args_command +\
1319 proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/pyspark/errors/exceptions/captured.py:185, in capture_sql_exception.<locals>.deco(*a, **kw)
181 converted = convert_exception(e.java_exception)
182 if not isinstance(converted, UnknownException):
183 # Hide where the exception came from that shows a non-Pythonic
184 # JVM exception message.
--> 185 raise converted from None
186 else:
187 raise
ParseException:
[PARSE_SYNTAX_ERROR] Syntax error at or near 'CLUSTER'.(line 2, pos 0)
== SQL ==
ALTER TABLE SAMPLE
CLUSTER BY ('CLUSTER_TYPE');
^^^
Try deleting the quotes and use full path if possible
spark.sql("ALTER TABLE catalog.db.SAMPLE CLUSTER BY (CLUSTER_TYPE) ");