[HUDI-3511] Add call procedure for MetadataCommand (#6018)
This commit is contained in:
@@ -66,6 +66,13 @@ object HoodieProcedures {
|
||||
mapBuilder.put(ShowBootstrapPartitionsProcedure.NAME, ShowBootstrapPartitionsProcedure.builder)
|
||||
mapBuilder.put(UpgradeTableProcedure.NAME, UpgradeTableProcedure.builder)
|
||||
mapBuilder.put(DowngradeTableProcedure.NAME, DowngradeTableProcedure.builder)
|
||||
mapBuilder.put(ListMetadataFilesProcedure.NAME, ListMetadataFilesProcedure.builder)
|
||||
mapBuilder.put(ListMetadataPartitionsProcedure.NAME, ListMetadataPartitionsProcedure.builder)
|
||||
mapBuilder.put(MetadataCreateProcedure.NAME, MetadataCreateProcedure.builder)
|
||||
mapBuilder.put(MetadataDeleteProcedure.NAME, MetadataDeleteProcedure.builder)
|
||||
mapBuilder.put(MetadataInitProcedure.NAME, MetadataInitProcedure.builder)
|
||||
mapBuilder.put(ShowMetadataStatsProcedure.NAME, ShowMetadataStatsProcedure.builder)
|
||||
mapBuilder.put(ValidateMetadataFilesProcedure.NAME, ValidateMetadataFilesProcedure.builder)
|
||||
mapBuilder.build
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi.command.procedures
|
||||
|
||||
import org.apache.hadoop.fs.{FileStatus, Path}
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig
|
||||
import org.apache.hudi.common.engine.HoodieLocalEngineContext
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.util.HoodieTimer
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.hudi.metadata.HoodieBackedTableMetadata
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||
|
||||
import java.util
|
||||
import java.util.function.Supplier
|
||||
|
||||
class ListMetadataFilesProcedure() extends BaseProcedure with ProcedureBuilder with Logging {
|
||||
private val PARAMETERS = Array[ProcedureParameter](
|
||||
ProcedureParameter.required(0, "table", DataTypes.StringType, None),
|
||||
ProcedureParameter.optional(1, "partition", DataTypes.StringType, None)
|
||||
)
|
||||
|
||||
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||
StructField("file_path", DataTypes.StringType, nullable = true, Metadata.empty)
|
||||
))
|
||||
|
||||
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||
|
||||
def outputType: StructType = OUTPUT_TYPE
|
||||
|
||||
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||
super.checkArgs(PARAMETERS, args)
|
||||
|
||||
val table = getArgValueOrDefault(args, PARAMETERS(0))
|
||||
val partition = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String]
|
||||
|
||||
val basePath = getBasePath(table)
|
||||
val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
|
||||
val config = HoodieMetadataConfig.newBuilder.enable(true).build
|
||||
val metaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(metaClient.getHadoopConf),
|
||||
config, basePath, "/tmp")
|
||||
if (!metaReader.enabled){
|
||||
throw new HoodieException(s"Metadata Table not enabled/initialized.")
|
||||
}
|
||||
|
||||
val timer = new HoodieTimer().startTimer
|
||||
val statuses = metaReader.getAllFilesInPartition(new Path(basePath, partition))
|
||||
logDebug("Took " + timer.endTimer + " ms")
|
||||
|
||||
val rows = new util.ArrayList[Row]
|
||||
statuses.toStream.sortBy(p => p.getPath.getName).foreach((f: FileStatus) => {
|
||||
rows.add(Row(f.getPath.getName))
|
||||
})
|
||||
rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList
|
||||
}
|
||||
|
||||
override def build: Procedure = new ListMetadataFilesProcedure()
|
||||
}
|
||||
|
||||
object ListMetadataFilesProcedure {
|
||||
val NAME = "list_metadata_files"
|
||||
|
||||
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||
override def get() = new ListMetadataFilesProcedure()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi.command.procedures
|
||||
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig
|
||||
import org.apache.hudi.common.util.HoodieTimer
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.hudi.metadata.HoodieBackedTableMetadata
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||
|
||||
import java.util
|
||||
import java.util.Collections
|
||||
import java.util.function.Supplier
|
||||
import scala.collection.JavaConverters.asScalaIteratorConverter
|
||||
|
||||
class ListMetadataPartitionsProcedure() extends BaseProcedure with ProcedureBuilder with Logging {
|
||||
private val PARAMETERS = Array[ProcedureParameter](
|
||||
ProcedureParameter.required(0, "table", DataTypes.StringType, None)
|
||||
)
|
||||
|
||||
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||
StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty)
|
||||
))
|
||||
|
||||
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||
|
||||
def outputType: StructType = OUTPUT_TYPE
|
||||
|
||||
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||
super.checkArgs(PARAMETERS, args)
|
||||
|
||||
val table = getArgValueOrDefault(args, PARAMETERS(0))
|
||||
|
||||
val basePath = getBasePath(table)
|
||||
val config = HoodieMetadataConfig.newBuilder.enable(true).build
|
||||
val metadata = new HoodieBackedTableMetadata(new HoodieSparkEngineContext(jsc),
|
||||
config, basePath, "/tmp")
|
||||
if (!metadata.enabled){
|
||||
throw new HoodieException(s"Metadata Table not enabled/initialized.")
|
||||
}
|
||||
|
||||
val timer = new HoodieTimer().startTimer
|
||||
val partitions = metadata.getAllPartitionPaths
|
||||
Collections.sort(partitions)
|
||||
logDebug("Took " + timer.endTimer + " ms")
|
||||
|
||||
val rows = new util.ArrayList[Row]
|
||||
partitions.stream.iterator().asScala.foreach((p: String) => {
|
||||
rows.add(Row(p))
|
||||
})
|
||||
rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList
|
||||
}
|
||||
|
||||
override def build: Procedure = new ListMetadataPartitionsProcedure()
|
||||
}
|
||||
|
||||
object ListMetadataPartitionsProcedure {
|
||||
val NAME = "list_metadata_partitions"
|
||||
|
||||
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||
override def get() = new ListMetadataPartitionsProcedure()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi.command.procedures
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.SparkAdapterSupport
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.util.HoodieTimer
|
||||
import org.apache.hudi.metadata.{HoodieTableMetadata, SparkHoodieBackedTableMetadataWriter}
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
import java.io.FileNotFoundException
|
||||
import java.util.function.Supplier
|
||||
|
||||
class MetadataCreateProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport {
|
||||
private val PARAMETERS = Array[ProcedureParameter](
|
||||
ProcedureParameter.required(0, "table", DataTypes.StringType, None)
|
||||
)
|
||||
|
||||
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||
StructField("result", DataTypes.StringType, nullable = true, Metadata.empty)
|
||||
))
|
||||
|
||||
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||
|
||||
def outputType: StructType = OUTPUT_TYPE
|
||||
|
||||
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||
super.checkArgs(PARAMETERS, args)
|
||||
|
||||
val tableName = getArgValueOrDefault(args, PARAMETERS(0))
|
||||
|
||||
val basePath = getBasePath(tableName)
|
||||
val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
|
||||
val metadataPath = new Path(HoodieTableMetadata.getMetadataTableBasePath(basePath))
|
||||
|
||||
try {
|
||||
val statuses = metaClient.getFs.listStatus(metadataPath)
|
||||
if (statuses.nonEmpty) {
|
||||
throw new RuntimeException("Metadata directory (" + metadataPath.toString + ") not empty.")
|
||||
}
|
||||
} catch {
|
||||
case e: FileNotFoundException =>
|
||||
// Metadata directory does not exist yet
|
||||
metaClient.getFs.mkdirs(metadataPath)
|
||||
}
|
||||
val timer = new HoodieTimer().startTimer
|
||||
val writeConfig = getWriteConfig(basePath)
|
||||
SparkHoodieBackedTableMetadataWriter.create(metaClient.getHadoopConf, writeConfig, new HoodieSparkEngineContext(jsc))
|
||||
Seq(Row("Created Metadata Table in " + metadataPath + " (duration=" + timer.endTimer / 1000.0 + "secs)"))
|
||||
}
|
||||
|
||||
override def build = new MetadataCreateProcedure()
|
||||
}
|
||||
|
||||
object MetadataCreateProcedure {
|
||||
val NAME = "metadata_create"
|
||||
var metadataBaseDirectory: Option[String] = None
|
||||
|
||||
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||
override def get() = new MetadataCreateProcedure()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi.command.procedures
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.SparkAdapterSupport
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.metadata.HoodieTableMetadata
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
import java.io.FileNotFoundException
|
||||
import java.util.function.Supplier
|
||||
|
||||
class MetadataDeleteProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport {
|
||||
private val PARAMETERS = Array[ProcedureParameter](
|
||||
ProcedureParameter.required(0, "table", DataTypes.StringType, None)
|
||||
)
|
||||
|
||||
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||
StructField("result", DataTypes.StringType, nullable = true, Metadata.empty)
|
||||
))
|
||||
|
||||
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||
|
||||
def outputType: StructType = OUTPUT_TYPE
|
||||
|
||||
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||
super.checkArgs(PARAMETERS, args)
|
||||
|
||||
val tableName = getArgValueOrDefault(args, PARAMETERS(0))
|
||||
val basePath = getBasePath(tableName)
|
||||
val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
|
||||
val metadataPath = new Path(HoodieTableMetadata.getMetadataTableBasePath(basePath))
|
||||
|
||||
try {
|
||||
val statuses = metaClient.getFs.listStatus(metadataPath)
|
||||
if (statuses.nonEmpty) metaClient.getFs.delete(metadataPath, true)
|
||||
} catch {
|
||||
case e: FileNotFoundException =>
|
||||
// Metadata directory does not exist
|
||||
}
|
||||
Seq(Row("Removed Metadata Table from " + metadataPath))
|
||||
}
|
||||
|
||||
override def build = new MetadataDeleteProcedure()
|
||||
}
|
||||
|
||||
object MetadataDeleteProcedure {
|
||||
val NAME = "metadata_delete"
|
||||
var metadataBaseDirectory: Option[String] = None
|
||||
|
||||
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||
override def get() = new MetadataDeleteProcedure()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi.command.procedures
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.SparkAdapterSupport
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.util.HoodieTimer
|
||||
import org.apache.hudi.metadata.{HoodieTableMetadata, SparkHoodieBackedTableMetadataWriter}
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
import java.io.FileNotFoundException
|
||||
import java.util.function.Supplier
|
||||
|
||||
class MetadataInitProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport with Logging {
|
||||
private val PARAMETERS = Array[ProcedureParameter](
|
||||
ProcedureParameter.required(0, "table", DataTypes.StringType, None),
|
||||
ProcedureParameter.optional(1, "readOnly", DataTypes.BooleanType, false)
|
||||
)
|
||||
|
||||
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||
StructField("result", DataTypes.StringType, nullable = true, Metadata.empty)
|
||||
))
|
||||
|
||||
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||
|
||||
def outputType: StructType = OUTPUT_TYPE
|
||||
|
||||
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||
super.checkArgs(PARAMETERS, args)
|
||||
|
||||
val tableName = getArgValueOrDefault(args, PARAMETERS(0))
|
||||
val readOnly = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Boolean]
|
||||
|
||||
val basePath = getBasePath(tableName)
|
||||
val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
|
||||
val metadataPath = new Path(HoodieTableMetadata.getMetadataTableBasePath(basePath))
|
||||
try {
|
||||
metaClient.getFs.listStatus(metadataPath)
|
||||
} catch {
|
||||
case e: FileNotFoundException =>
|
||||
// Metadata directory does not exist yet
|
||||
throw new RuntimeException("Metadata directory (" + metadataPath.toString + ") does not exist.")
|
||||
}
|
||||
|
||||
val timer = new HoodieTimer().startTimer
|
||||
if (!readOnly) {
|
||||
val writeConfig = getWriteConfig(basePath)
|
||||
SparkHoodieBackedTableMetadataWriter.create(metaClient.getHadoopConf, writeConfig, new HoodieSparkEngineContext(jsc))
|
||||
}
|
||||
|
||||
val action = if (readOnly) "Opened" else "Initialized"
|
||||
Seq(Row(action + " Metadata Table in " + metadataPath + " (duration=" + timer.endTimer / 1000.0 + "sec)"))
|
||||
}
|
||||
|
||||
override def build = new MetadataInitProcedure()
|
||||
}
|
||||
|
||||
object MetadataInitProcedure {
|
||||
val NAME = "metadata_init"
|
||||
var metadataBaseDirectory: Option[String] = None
|
||||
|
||||
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||
override def get() = new MetadataInitProcedure()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi.command.procedures
|
||||
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig
|
||||
import org.apache.hudi.common.engine.HoodieLocalEngineContext
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.metadata.HoodieBackedTableMetadata
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||
|
||||
import java.util
|
||||
import java.util.function.Supplier
|
||||
import scala.collection.JavaConversions._
|
||||
|
||||
class ShowMetadataStatsProcedure() extends BaseProcedure with ProcedureBuilder {
|
||||
private val PARAMETERS = Array[ProcedureParameter](
|
||||
ProcedureParameter.required(0, "table", DataTypes.StringType, None)
|
||||
)
|
||||
|
||||
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||
StructField("stat_key", DataTypes.StringType, nullable = true, Metadata.empty),
|
||||
StructField("stat_value", DataTypes.StringType, nullable = true, Metadata.empty)
|
||||
))
|
||||
|
||||
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||
|
||||
def outputType: StructType = OUTPUT_TYPE
|
||||
|
||||
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||
super.checkArgs(PARAMETERS, args)
|
||||
|
||||
val table = getArgValueOrDefault(args, PARAMETERS(0))
|
||||
|
||||
val basePath = getBasePath(table)
|
||||
val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
|
||||
val config = HoodieMetadataConfig.newBuilder.enable(true).build
|
||||
val metadata = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(metaClient.getHadoopConf),
|
||||
config, basePath, "/tmp")
|
||||
val stats = metadata.stats
|
||||
|
||||
val rows = new util.ArrayList[Row]
|
||||
for (entry <- stats.entrySet) {
|
||||
rows.add(Row(entry.getKey, entry.getValue))
|
||||
}
|
||||
rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList
|
||||
}
|
||||
|
||||
override def build: Procedure = new ShowMetadataStatsProcedure()
|
||||
}
|
||||
|
||||
object ShowMetadataStatsProcedure {
|
||||
val NAME = "show_metadata_stats"
|
||||
|
||||
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||
override def get() = new ShowMetadataStatsProcedure()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,147 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi.command.procedures
|
||||
|
||||
import org.apache.hadoop.fs.{FileStatus, Path}
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig
|
||||
import org.apache.hudi.common.engine.HoodieLocalEngineContext
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.util.HoodieTimer
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.hudi.metadata.HoodieBackedTableMetadata
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||
|
||||
import java.util
|
||||
import java.util.Collections
|
||||
import java.util.function.Supplier
|
||||
import scala.collection.JavaConversions._
|
||||
import scala.collection.JavaConverters.asScalaIteratorConverter
|
||||
|
||||
class ValidateMetadataFilesProcedure() extends BaseProcedure with ProcedureBuilder with Logging {
|
||||
private val PARAMETERS = Array[ProcedureParameter](
|
||||
ProcedureParameter.required(0, "table", DataTypes.StringType, None),
|
||||
ProcedureParameter.optional(1, "verbose", DataTypes.BooleanType, false)
|
||||
)
|
||||
|
||||
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||
StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty),
|
||||
StructField("file_name", DataTypes.StringType, nullable = true, Metadata.empty),
|
||||
StructField("is_present_in_fs", DataTypes.BooleanType, nullable = true, Metadata.empty),
|
||||
StructField("is_resent_in_metadata", DataTypes.BooleanType, nullable = true, Metadata.empty),
|
||||
StructField("fs_size", DataTypes.LongType, nullable = true, Metadata.empty),
|
||||
StructField("metadata_size", DataTypes.LongType, nullable = true, Metadata.empty)
|
||||
))
|
||||
|
||||
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||
|
||||
def outputType: StructType = OUTPUT_TYPE
|
||||
|
||||
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||
super.checkArgs(PARAMETERS, args)
|
||||
|
||||
val table = getArgValueOrDefault(args, PARAMETERS(0))
|
||||
val verbose = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Boolean]
|
||||
|
||||
val basePath = getBasePath(table)
|
||||
val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
|
||||
val config = HoodieMetadataConfig.newBuilder.enable(true).build
|
||||
val metadataReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(metaClient.getHadoopConf),
|
||||
config, basePath, "/tmp")
|
||||
|
||||
if (!metadataReader.enabled){
|
||||
throw new HoodieException(s"Metadata Table not enabled/initialized.")
|
||||
}
|
||||
|
||||
val fsConfig = HoodieMetadataConfig.newBuilder.enable(false).build
|
||||
val fsMetaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(metaClient.getHadoopConf),
|
||||
fsConfig, basePath, "/tmp")
|
||||
|
||||
val timer = new HoodieTimer().startTimer
|
||||
val metadataPartitions = metadataReader.getAllPartitionPaths
|
||||
logDebug("Listing partitions Took " + timer.endTimer + " ms")
|
||||
val fsPartitions = fsMetaReader.getAllPartitionPaths
|
||||
Collections.sort(fsPartitions)
|
||||
Collections.sort(metadataPartitions)
|
||||
|
||||
val allPartitions = new util.HashSet[String]
|
||||
allPartitions.addAll(fsPartitions)
|
||||
allPartitions.addAll(metadataPartitions)
|
||||
|
||||
if (!fsPartitions.equals(metadataPartitions)) {
|
||||
logError("FS partition listing is not matching with metadata partition listing!")
|
||||
logError("All FS partitions: " + util.Arrays.toString(fsPartitions.toArray))
|
||||
logError("All Metadata partitions: " + util.Arrays.toString(metadataPartitions.toArray))
|
||||
}
|
||||
|
||||
val rows = new util.ArrayList[Row]
|
||||
for (partition <- allPartitions) {
|
||||
val fileStatusMap = new util.HashMap[String, FileStatus]
|
||||
val metadataFileStatusMap = new util.HashMap[String, FileStatus]
|
||||
val metadataStatuses = metadataReader.getAllFilesInPartition(new Path(basePath, partition))
|
||||
util.Arrays.stream(metadataStatuses).iterator().asScala.foreach((entry: FileStatus) => metadataFileStatusMap.put(entry.getPath.getName, entry))
|
||||
val fsStatuses = fsMetaReader.getAllFilesInPartition(new Path(basePath, partition))
|
||||
util.Arrays.stream(fsStatuses).iterator().asScala.foreach((entry: FileStatus) => fileStatusMap.put(entry.getPath.getName, entry))
|
||||
val allFiles = new util.HashSet[String]
|
||||
allFiles.addAll(fileStatusMap.keySet)
|
||||
allFiles.addAll(metadataFileStatusMap.keySet)
|
||||
for (file <- allFiles) {
|
||||
val fsFileStatus = fileStatusMap.get(file)
|
||||
val metaFileStatus = metadataFileStatusMap.get(file)
|
||||
val doesFsFileExists = fsFileStatus != null
|
||||
val doesMetadataFileExists = metaFileStatus != null
|
||||
val fsFileLength = if (doesFsFileExists) fsFileStatus.getLen else 0
|
||||
val metadataFileLength = if (doesMetadataFileExists) metaFileStatus.getLen else 0
|
||||
if (verbose) { // if verbose print all files
|
||||
rows.add(Row(partition, file, doesFsFileExists, doesMetadataFileExists, fsFileLength, metadataFileLength))
|
||||
} else if ((doesFsFileExists != doesMetadataFileExists) || (fsFileLength != metadataFileLength)) { // if non verbose, print only non matching files
|
||||
rows.add(Row(partition, file, doesFsFileExists, doesMetadataFileExists, fsFileLength, metadataFileLength))
|
||||
}
|
||||
}
|
||||
if (metadataStatuses.length != fsStatuses.length) {
|
||||
logError(" FS and metadata files count not matching for " + partition + ". FS files count " + fsStatuses.length + ", metadata base files count " + metadataStatuses.length)
|
||||
}
|
||||
for (entry <- fileStatusMap.entrySet) {
|
||||
if (!metadataFileStatusMap.containsKey(entry.getKey)) {
|
||||
logError("FS file not found in metadata " + entry.getKey)
|
||||
} else if (entry.getValue.getLen != metadataFileStatusMap.get(entry.getKey).getLen) {
|
||||
logError(" FS file size mismatch " + entry.getKey + ", size equality " + (entry.getValue.getLen == metadataFileStatusMap.get(entry.getKey).getLen) + ". FS size " + entry.getValue.getLen + ", metadata size " + metadataFileStatusMap.get(entry.getKey).getLen)
|
||||
}
|
||||
}
|
||||
for (entry <- metadataFileStatusMap.entrySet) {
|
||||
if (!fileStatusMap.containsKey(entry.getKey)) {
|
||||
logError("Metadata file not found in FS " + entry.getKey)
|
||||
} else if (entry.getValue.getLen != fileStatusMap.get(entry.getKey).getLen) {
|
||||
logError(" Metadata file size mismatch " + entry.getKey + ", size equality " + (entry.getValue.getLen == fileStatusMap.get(entry.getKey).getLen) + ". Metadata size " + entry.getValue.getLen + ", FS size " + metadataFileStatusMap.get(entry.getKey).getLen)
|
||||
}
|
||||
}
|
||||
}
|
||||
rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList
|
||||
}
|
||||
|
||||
override def build: Procedure = new ValidateMetadataFilesProcedure()
|
||||
}
|
||||
|
||||
object ValidateMetadataFilesProcedure {
|
||||
val NAME = "validate_metadata_files"
|
||||
|
||||
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||
override def get() = new ValidateMetadataFilesProcedure()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,262 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.hudi.procedure
|
||||
|
||||
import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase
|
||||
|
||||
class TestMetadataProcedure extends HoodieSparkSqlTestBase {
|
||||
|
||||
test("Test Call metadata_delete Procedure") {
|
||||
withTempDir { tmp =>
|
||||
val tableName = generateTableName
|
||||
// create table
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| ts long
|
||||
|) using hudi
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
| tblproperties (
|
||||
| primaryKey = 'id',
|
||||
| preCombineField = 'ts'
|
||||
| )
|
||||
""".stripMargin)
|
||||
// insert data to table
|
||||
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500")
|
||||
|
||||
// delete the metadata
|
||||
val deleteResult = spark.sql(s"""call metadata_delete(table => '$tableName')""").collect()
|
||||
assertResult(1) {
|
||||
deleteResult.length
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("Test Call metadata_create Procedure") {
|
||||
withTempDir { tmp =>
|
||||
val tableName = generateTableName
|
||||
// create table
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| ts long
|
||||
|) using hudi
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
| tblproperties (
|
||||
| primaryKey = 'id',
|
||||
| preCombineField = 'ts'
|
||||
| )
|
||||
""".stripMargin)
|
||||
// insert data to table
|
||||
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500")
|
||||
|
||||
// The first step is delete the metadata
|
||||
val deleteResult = spark.sql(s"""call metadata_delete(table => '$tableName')""").collect()
|
||||
assertResult(1) {
|
||||
deleteResult.length
|
||||
}
|
||||
|
||||
// The second step is create the metadata
|
||||
val createResult = spark.sql(s"""call metadata_create(table => '$tableName')""").collect()
|
||||
assertResult(1) {
|
||||
createResult.length
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("Test Call metadata_init Procedure") {
|
||||
withTempDir { tmp =>
|
||||
val tableName = generateTableName
|
||||
// create table
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| ts long
|
||||
|) using hudi
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
| tblproperties (
|
||||
| primaryKey = 'id',
|
||||
| preCombineField = 'ts'
|
||||
| )
|
||||
""".stripMargin)
|
||||
// insert data to table
|
||||
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500")
|
||||
|
||||
// read only, no initialize
|
||||
val readResult = spark.sql(s"""call metadata_init(table => '$tableName', readOnly => true)""").collect()
|
||||
assertResult(1) {
|
||||
readResult.length
|
||||
}
|
||||
|
||||
// initialize metadata
|
||||
val initResult = spark.sql(s"""call metadata_init(table => '$tableName')""").collect()
|
||||
assertResult(1) {
|
||||
initResult.length
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("Test Call show_metadata_stats Procedure") {
|
||||
withTempDir { tmp =>
|
||||
val tableName = generateTableName
|
||||
// create table
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| ts long
|
||||
|) using hudi
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
| tblproperties (
|
||||
| primaryKey = 'id',
|
||||
| preCombineField = 'ts',
|
||||
| hoodie.metadata.metrics.enable = 'true'
|
||||
| )
|
||||
""".stripMargin)
|
||||
// insert data to table
|
||||
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500")
|
||||
|
||||
// collect metadata stats for table
|
||||
val metadataStats = spark.sql(s"""call show_metadata_stats(table => '$tableName')""").collect()
|
||||
assertResult(0) {
|
||||
metadataStats.length
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("Test Call list_metadata_partitions Procedure") {
|
||||
withTempDir { tmp =>
|
||||
val tableName = generateTableName
|
||||
// create table
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| ts long
|
||||
|) using hudi
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
| partitioned by (ts)
|
||||
| tblproperties (
|
||||
| primaryKey = 'id',
|
||||
| preCombineField = 'ts'
|
||||
| )
|
||||
""".stripMargin)
|
||||
// insert data to table
|
||||
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500")
|
||||
|
||||
// collect metadata partitions for table
|
||||
val partitions = spark.sql(s"""call list_metadata_partitions(table => '$tableName')""").collect()
|
||||
assertResult(2) {
|
||||
partitions.length
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("Test Call list_metadata_files Procedure") {
|
||||
withTempDir { tmp =>
|
||||
val tableName = generateTableName
|
||||
// create table
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| ts long
|
||||
|) using hudi
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
| partitioned by (ts)
|
||||
| tblproperties (
|
||||
| primaryKey = 'id',
|
||||
| preCombineField = 'ts'
|
||||
| )
|
||||
""".stripMargin)
|
||||
// insert data to table
|
||||
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500")
|
||||
|
||||
// collect metadata partitions for table
|
||||
val partitions = spark.sql(s"""call list_metadata_partitions(table => '$tableName')""").collect()
|
||||
assertResult(2) {
|
||||
partitions.length
|
||||
}
|
||||
|
||||
// collect metadata files for a partition of a table
|
||||
val partition = partitions(0).get(0).toString
|
||||
val filesResult = spark.sql(s"""call list_metadata_files(table => '$tableName', partition => '$partition')""").collect()
|
||||
assertResult(1) {
|
||||
filesResult.length
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("Test Call validate_metadata_files Procedure") {
|
||||
withTempDir { tmp =>
|
||||
val tableName = generateTableName
|
||||
// create table
|
||||
spark.sql(
|
||||
s"""
|
||||
|create table $tableName (
|
||||
| id int,
|
||||
| name string,
|
||||
| price double,
|
||||
| ts long
|
||||
|) using hudi
|
||||
| location '${tmp.getCanonicalPath}/$tableName'
|
||||
| partitioned by (ts)
|
||||
| tblproperties (
|
||||
| primaryKey = 'id',
|
||||
| preCombineField = 'ts'
|
||||
| )
|
||||
""".stripMargin)
|
||||
// insert data to table
|
||||
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
|
||||
spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500")
|
||||
|
||||
// collect validate metadata files result
|
||||
val validateFilesResult = spark.sql(s"""call validate_metadata_files(table => '$tableName')""").collect()
|
||||
assertResult(0) {
|
||||
validateFilesResult.length
|
||||
}
|
||||
|
||||
// collect validate metadata files result with verbose
|
||||
val validateFilesVerboseResult = spark.sql(s"""call validate_metadata_files(table => '$tableName', verbose => true)""").collect()
|
||||
assertResult(2) {
|
||||
validateFilesVerboseResult.length
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user