[HUDI-3357] MVP implementation of BigQuerySyncTool (#5125)
Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
c19f505b5a
commit
20964df770
46
hudi-gcp/src/assembly/src.xml
Normal file
46
hudi-gcp/src/assembly/src.xml
Normal file
@@ -0,0 +1,46 @@
|
||||
<!--
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<!-- Builds a single self-contained jar containing this module's classes
     plus its dependencies, unpacked into the jar root. -->
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3 http://maven.apache.org/xsd/assembly-1.1.3.xsd">
  <id>jar-with-dependencies</id>
  <formats>
    <format>jar</format>
  </formats>

  <includeBaseDirectory>false</includeBaseDirectory>
  <dependencySets>

    <!-- Unpack runtime-scope dependencies, excluding test (junit) and
         unwanted artifacts (findbugs annotations, hbase). -->
    <dependencySet>
      <outputDirectory>/</outputDirectory>
      <unpack>true</unpack>
      <scope>runtime</scope>
      <excludes>
        <exclude>junit:junit</exclude>
        <exclude>com.google.code.findbugs:*</exclude>
        <exclude>org.apache.hbase:*</exclude>
      </excludes>
    </dependencySet>

    <!-- Also bundle provided-scope dependencies, unpacked the same way. -->
    <dependencySet>
      <unpack>true</unpack>
      <scope>provided</scope>
    </dependencySet>
  </dependencySets>
</assembly>
|
||||
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.gcp.bigquery;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Configs needed to sync data into BigQuery.
|
||||
*/
|
||||
public class BigQuerySyncConfig implements Serializable {
|
||||
|
||||
public static String BIGQUERY_SYNC_PROJECT_ID = "hoodie.gcp.bigquery.sync.project_id";
|
||||
public static String BIGQUERY_SYNC_DATASET_NAME = "hoodie.gcp.bigquery.sync.dataset_name";
|
||||
public static String BIGQUERY_SYNC_DATASET_LOCATION = "hoodie.gcp.bigquery.sync.dataset_location";
|
||||
public static String BIGQUERY_SYNC_TABLE_NAME = "hoodie.gcp.bigquery.sync.table_name";
|
||||
public static String BIGQUERY_SYNC_SOURCE_URI = "hoodie.gcp.bigquery.sync.source_uri";
|
||||
public static String BIGQUERY_SYNC_SOURCE_URI_PREFIX = "hoodie.gcp.bigquery.sync.source_uri_prefix";
|
||||
public static String BIGQUERY_SYNC_SYNC_BASE_PATH = "hoodie.gcp.bigquery.sync.base_path";
|
||||
public static String BIGQUERY_SYNC_PARTITION_FIELDS = "hoodie.gcp.bigquery.sync.partition_fields";
|
||||
public static String BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA = "hoodie.gcp.bigquery.sync.use_file_listing_from_metadata";
|
||||
public static String BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = "hoodie.gcp.bigquery.sync.assume_date_partitioning";
|
||||
|
||||
@Parameter(names = {"--project-id"}, description = "name of the target project in BigQuery", required = true)
|
||||
public String projectId;
|
||||
@Parameter(names = {"--dataset-name"}, description = "name of the target dataset in BigQuery", required = true)
|
||||
public String datasetName;
|
||||
@Parameter(names = {"--dataset-location"}, description = "location of the target dataset in BigQuery", required = true)
|
||||
public String datasetLocation;
|
||||
@Parameter(names = {"--table-name"}, description = "name of the target table in BigQuery", required = true)
|
||||
public String tableName;
|
||||
@Parameter(names = {"--source-uri"}, description = "name of the source uri gcs path of the table", required = true)
|
||||
public String sourceUri;
|
||||
@Parameter(names = {"--source-uri-prefix"}, description = "name of the source uri gcs path prefix of the table", required = true)
|
||||
public String sourceUriPrefix;
|
||||
@Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
|
||||
public String basePath;
|
||||
@Parameter(names = {"--partitioned-by"}, description = "Comma-delimited partition fields. Default to non-partitioned.")
|
||||
public List<String> partitionFields = new ArrayList<>();
|
||||
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
|
||||
public Boolean useFileListingFromMetadata = false;
|
||||
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
|
||||
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
|
||||
public Boolean assumeDatePartitioning = false;
|
||||
@Parameter(names = {"--help", "-h"}, help = true)
|
||||
public Boolean help = false;
|
||||
|
||||
public static BigQuerySyncConfig copy(BigQuerySyncConfig cfg) {
|
||||
BigQuerySyncConfig newConfig = new BigQuerySyncConfig();
|
||||
newConfig.projectId = cfg.projectId;
|
||||
newConfig.datasetName = cfg.datasetName;
|
||||
newConfig.datasetLocation = cfg.datasetLocation;
|
||||
newConfig.tableName = cfg.tableName;
|
||||
newConfig.sourceUri = cfg.sourceUri;
|
||||
newConfig.sourceUriPrefix = cfg.sourceUriPrefix;
|
||||
newConfig.basePath = cfg.basePath;
|
||||
newConfig.partitionFields = cfg.partitionFields;
|
||||
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
|
||||
newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;
|
||||
newConfig.help = cfg.help;
|
||||
return newConfig;
|
||||
}
|
||||
|
||||
public TypedProperties toProps() {
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.put(BIGQUERY_SYNC_PROJECT_ID, projectId);
|
||||
properties.put(BIGQUERY_SYNC_DATASET_NAME, datasetName);
|
||||
properties.put(BIGQUERY_SYNC_DATASET_LOCATION, datasetLocation);
|
||||
properties.put(BIGQUERY_SYNC_TABLE_NAME, tableName);
|
||||
properties.put(BIGQUERY_SYNC_SOURCE_URI, sourceUri);
|
||||
properties.put(BIGQUERY_SYNC_SOURCE_URI_PREFIX, sourceUriPrefix);
|
||||
properties.put(BIGQUERY_SYNC_SYNC_BASE_PATH, basePath);
|
||||
properties.put(BIGQUERY_SYNC_PARTITION_FIELDS, String.join(",", partitionFields));
|
||||
properties.put(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, useFileListingFromMetadata);
|
||||
properties.put(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, assumeDatePartitioning);
|
||||
return properties;
|
||||
}
|
||||
|
||||
public static BigQuerySyncConfig fromProps(TypedProperties props) {
|
||||
BigQuerySyncConfig config = new BigQuerySyncConfig();
|
||||
config.projectId = props.getString(BIGQUERY_SYNC_PROJECT_ID);
|
||||
config.datasetName = props.getString(BIGQUERY_SYNC_DATASET_NAME);
|
||||
config.datasetLocation = props.getString(BIGQUERY_SYNC_DATASET_LOCATION);
|
||||
config.tableName = props.getString(BIGQUERY_SYNC_TABLE_NAME);
|
||||
config.sourceUri = props.getString(BIGQUERY_SYNC_SOURCE_URI);
|
||||
config.sourceUriPrefix = props.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX);
|
||||
config.basePath = props.getString(BIGQUERY_SYNC_SYNC_BASE_PATH);
|
||||
config.partitionFields = props.getStringList(BIGQUERY_SYNC_PARTITION_FIELDS, ",", Collections.emptyList());
|
||||
config.useFileListingFromMetadata = props.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, false);
|
||||
config.assumeDatePartitioning = props.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, false);
|
||||
return config;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "BigQuerySyncConfig{projectId='" + projectId
|
||||
+ "', datasetName='" + datasetName
|
||||
+ "', datasetLocation='" + datasetLocation
|
||||
+ "', tableName='" + tableName
|
||||
+ "', sourceUri='" + sourceUri
|
||||
+ "', sourceUriPrefix='" + sourceUriPrefix
|
||||
+ "', basePath='" + basePath + "'"
|
||||
+ ", partitionFields=" + partitionFields
|
||||
+ "', useFileListingFromMetadata='" + useFileListingFromMetadata
|
||||
+ "', assumeDataPartitioning='" + assumeDatePartitioning
|
||||
+ "', help=" + help + "}";
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.gcp.bigquery;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.util.ManifestFileWriter;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
/**
|
||||
* Tool to sync a hoodie table with a big query table. Either use it as an api
|
||||
* BigQuerySyncTool.syncHoodieTable(BigQuerySyncConfig) or as a command line java -cp hoodie-hive.jar BigQuerySyncTool [args]
|
||||
* <p>
|
||||
* This utility will get the schema from the latest commit and will sync big query table schema.
|
||||
*
|
||||
* @Experimental
|
||||
*/
|
||||
public class BigQuerySyncTool extends AbstractSyncTool {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(BigQuerySyncTool.class);
|
||||
|
||||
public final BigQuerySyncConfig cfg;
|
||||
public final String manifestTableName;
|
||||
public final String versionsTableName;
|
||||
public final String snapshotViewName;
|
||||
|
||||
public BigQuerySyncTool(TypedProperties properties, Configuration conf, FileSystem fs) {
|
||||
super(properties, conf, fs);
|
||||
cfg = BigQuerySyncConfig.fromProps(properties);
|
||||
manifestTableName = cfg.tableName + "_manifest";
|
||||
versionsTableName = cfg.tableName + "_versions";
|
||||
snapshotViewName = cfg.tableName;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
try (HoodieBigQuerySyncClient bqSyncClient = new HoodieBigQuerySyncClient(BigQuerySyncConfig.fromProps(props), fs)) {
|
||||
switch (bqSyncClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
syncCoWTable(bqSyncClient);
|
||||
break;
|
||||
case MERGE_ON_READ:
|
||||
default:
|
||||
throw new UnsupportedOperationException(bqSyncClient.getTableType() + " table type is not supported yet.");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieBigQuerySyncException("Got runtime exception when big query syncing " + cfg.tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void syncCoWTable(HoodieBigQuerySyncClient bqSyncClient) {
|
||||
ValidationUtils.checkState(bqSyncClient.getTableType() == HoodieTableType.COPY_ON_WRITE);
|
||||
LOG.info("Sync hoodie table " + snapshotViewName + " at base path " + bqSyncClient.getBasePath());
|
||||
|
||||
if (!bqSyncClient.datasetExists()) {
|
||||
throw new HoodieBigQuerySyncException("Dataset not found: " + cfg);
|
||||
}
|
||||
|
||||
ManifestFileWriter manifestFileWriter = ManifestFileWriter.builder()
|
||||
.setConf(conf)
|
||||
.setBasePath(cfg.basePath)
|
||||
.setUseFileListingFromMetadata(cfg.useFileListingFromMetadata)
|
||||
.setAssumeDatePartitioning(cfg.assumeDatePartitioning)
|
||||
.build();
|
||||
manifestFileWriter.writeManifestFile();
|
||||
|
||||
if (!bqSyncClient.tableExists(manifestTableName)) {
|
||||
bqSyncClient.createManifestTable(manifestTableName, manifestFileWriter.getManifestSourceUri());
|
||||
LOG.info("Manifest table creation complete for " + manifestTableName);
|
||||
}
|
||||
if (!bqSyncClient.tableExists(versionsTableName)) {
|
||||
bqSyncClient.createVersionsTable(versionsTableName, cfg.sourceUri, cfg.sourceUriPrefix, cfg.partitionFields);
|
||||
LOG.info("Versions table creation complete for " + versionsTableName);
|
||||
}
|
||||
if (!bqSyncClient.tableExists(snapshotViewName)) {
|
||||
bqSyncClient.createSnapshotView(snapshotViewName, versionsTableName, manifestTableName);
|
||||
LOG.info("Snapshot view creation complete for " + snapshotViewName);
|
||||
}
|
||||
|
||||
// TODO: Implement automatic schema evolution when you add a new column.
|
||||
LOG.info("Sync table complete for " + snapshotViewName);
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
BigQuerySyncConfig cfg = new BigQuerySyncConfig();
|
||||
JCommander cmd = new JCommander(cfg, null, args);
|
||||
if (cfg.help || args.length == 0) {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
}
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
|
||||
new BigQuerySyncTool(cfg.toProps(), fs.getConf(), fs).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.gcp.bigquery;
|
||||
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
|
||||
|
||||
import com.google.cloud.bigquery.BigQuery;
|
||||
import com.google.cloud.bigquery.BigQueryException;
|
||||
import com.google.cloud.bigquery.BigQueryOptions;
|
||||
import com.google.cloud.bigquery.CsvOptions;
|
||||
import com.google.cloud.bigquery.Dataset;
|
||||
import com.google.cloud.bigquery.DatasetId;
|
||||
import com.google.cloud.bigquery.ExternalTableDefinition;
|
||||
import com.google.cloud.bigquery.Field;
|
||||
import com.google.cloud.bigquery.FormatOptions;
|
||||
import com.google.cloud.bigquery.HivePartitioningOptions;
|
||||
import com.google.cloud.bigquery.Schema;
|
||||
import com.google.cloud.bigquery.StandardSQLTypeName;
|
||||
import com.google.cloud.bigquery.Table;
|
||||
import com.google.cloud.bigquery.TableId;
|
||||
import com.google.cloud.bigquery.TableInfo;
|
||||
import com.google.cloud.bigquery.ViewDefinition;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Sync client that exposes a Hudi table in BigQuery via three artifacts:
 * a manifest external table (CSV list of current data file names), a versions
 * external table over all parquet files, and a snapshot view joining the two.
 */
public class HoodieBigQuerySyncClient extends AbstractSyncHoodieClient {
  private static final Logger LOG = LogManager.getLogger(HoodieBigQuerySyncClient.class);

  private final BigQuerySyncConfig syncConfig;
  // Transient: the service handle is re-created via createBigQueryConnection()
  // rather than serialized.
  private transient BigQuery bigquery;

  public HoodieBigQuerySyncClient(final BigQuerySyncConfig syncConfig, final FileSystem fs) {
    super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata,
        false, fs);
    this.syncConfig = syncConfig;
    this.createBigQueryConnection();
  }

  /** Creates the BigQuery service handle; no-op when already connected. */
  private void createBigQueryConnection() {
    if (bigquery == null) {
      try {
        // Initialize client that will be used to send requests. This client only needs to be created
        // once, and can be reused for multiple requests.
        bigquery = BigQueryOptions.newBuilder().setLocation(syncConfig.datasetLocation).build().getService();
        LOG.info("Successfully established BigQuery connection.");
      } catch (BigQueryException e) {
        throw new HoodieBigQuerySyncException("Cannot create bigQuery connection ", e);
      }
    }
  }

  @Override
  public void createTable(final String tableName, final MessageType storageSchema, final String inputFormatClass,
                          final String outputFormatClass, final String serdeClass,
                          final Map<String, String> serdeProperties, final Map<String, String> tableProperties) {
    // bigQuery create table arguments are different, so do nothing.
    // Tables are created instead via createManifestTable/createVersionsTable/createSnapshotView.
  }

  /**
   * Creates an external CSV-backed table with a single "filename" column
   * listing the data files of the latest snapshot, read from {@code sourceUri}.
   *
   * @throws HoodieBigQuerySyncException if the BigQuery call fails
   */
  public void createManifestTable(String tableName, String sourceUri) {
    try {
      TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName);
      // Strict CSV parsing: one file name per row, no header rows to skip.
      CsvOptions csvOptions = CsvOptions.newBuilder()
          .setFieldDelimiter(",")
          .setAllowJaggedRows(false)
          .setAllowQuotedNewLines(false)
          .setSkipLeadingRows(0)
          .build();
      Schema schema = Schema.of(
          Field.of("filename", StandardSQLTypeName.STRING));

      ExternalTableDefinition customTable =
          ExternalTableDefinition.newBuilder(sourceUri, schema, csvOptions)
              .setAutodetect(false)
              .setIgnoreUnknownValues(false)
              .setMaxBadRecords(0)
              .build();
      bigquery.create(TableInfo.of(tableId, customTable));
      LOG.info("Manifest External table created.");
    } catch (BigQueryException e) {
      throw new HoodieBigQuerySyncException("Manifest External table was not created ", e);
    }
  }

  /**
   * Creates an external parquet-backed table over all file versions at
   * {@code sourceUri}, with schema autodetection. When partition fields are
   * present, hive-style partitioning is configured from {@code sourceUriPrefix}.
   *
   * @throws HoodieBigQuerySyncException if the BigQuery call fails
   */
  public void createVersionsTable(String tableName, String sourceUri, String sourceUriPrefix, List<String> partitionFields) {
    try {
      ExternalTableDefinition customTable;
      TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName);

      if (partitionFields.isEmpty()) {
        // Non-partitioned table: plain external parquet definition.
        customTable =
            ExternalTableDefinition.newBuilder(sourceUri, FormatOptions.parquet())
                .setAutodetect(true)
                .setIgnoreUnknownValues(true)
                .setMaxBadRecords(0)
                .build();
      } else {
        // Configuring partitioning options for partitioned table.
        // "AUTO" mode lets BigQuery infer partition keys/types from the GCS paths.
        HivePartitioningOptions hivePartitioningOptions =
            HivePartitioningOptions.newBuilder()
                .setMode("AUTO")
                .setRequirePartitionFilter(false)
                .setSourceUriPrefix(sourceUriPrefix)
                .build();
        customTable =
            ExternalTableDefinition.newBuilder(sourceUri, FormatOptions.parquet())
                .setAutodetect(true)
                .setHivePartitioningOptions(hivePartitioningOptions)
                .setIgnoreUnknownValues(true)
                .setMaxBadRecords(0)
                .build();
      }

      bigquery.create(TableInfo.of(tableId, customTable));
      LOG.info("External table created using hivepartitioningoptions");
    } catch (BigQueryException e) {
      throw new HoodieBigQuerySyncException("External table was not created ", e);
    }
  }

  /**
   * Creates a view over the versions table filtered to only the files listed
   * in the manifest table, i.e. the latest snapshot of the Hudi table.
   *
   * @throws HoodieBigQuerySyncException if the BigQuery call fails
   */
  public void createSnapshotView(String viewName, String versionsTableName, String manifestTableName) {
    try {
      TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, viewName);
      String query =
          String.format(
              "SELECT * FROM `%s.%s.%s` WHERE _hoodie_file_name IN "
                  + "(SELECT filename FROM `%s.%s.%s`)",
              syncConfig.projectId,
              syncConfig.datasetName,
              versionsTableName,
              syncConfig.projectId,
              syncConfig.datasetName,
              manifestTableName);

      ViewDefinition viewDefinition =
          ViewDefinition.newBuilder(query).setUseLegacySql(false).build();

      bigquery.create(TableInfo.of(tableId, viewDefinition));
      LOG.info("View created successfully");
    } catch (BigQueryException e) {
      throw new HoodieBigQuerySyncException("View was not created ", e);
    }
  }

  @Override
  public Map<String, String> getTableSchema(String tableName) {
    // TODO: Implement automatic schema evolution when you add a new column.
    return Collections.emptyMap();
  }

  @Override
  public void addPartitionsToTable(final String tableName, final List<String> partitionsToAdd) {
    // Not supported: for the external tables created here, partitions are
    // derived from GCS paths via HivePartitioningOptions (see createVersionsTable),
    // so explicit partition registration is not implemented.
    throw new UnsupportedOperationException("No support for addPartitionsToTable yet.");
  }

  @Override
  public void dropPartitionsToTable(final String tableName, final List<String> partitionsToDrop) {
    // Not supported: see addPartitionsToTable — partition handling is driven by
    // HivePartitioningOptions, not explicit registration.
    throw new UnsupportedOperationException("No support for dropPartitionsToTable yet.");
  }

  /** Returns true if the configured project/dataset exists in BigQuery. */
  public boolean datasetExists() {
    Dataset dataset = bigquery.getDataset(DatasetId.of(syncConfig.projectId, syncConfig.datasetName));
    return dataset != null;
  }

  @Override
  public boolean doesTableExist(final String tableName) {
    return tableExists(tableName);
  }

  /** Returns true if {@code tableName} exists in the configured dataset. */
  @Override
  public boolean tableExists(String tableName) {
    TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName);
    // Fetch no fields — we only care whether the table is present.
    Table table = bigquery.getTable(tableId, BigQuery.TableOption.fields());
    return table != null && table.exists();
  }

  @Override
  public Option<String> getLastCommitTimeSynced(final String tableName) {
    // Unsupported: bigQuery doesn't support tblproperties, so there is nowhere
    // to store sync metadata.
    throw new UnsupportedOperationException("Not support getLastCommitTimeSynced yet.");
  }

  @Override
  public void updateLastCommitTimeSynced(final String tableName) {
    // Unsupported: bigQuery doesn't support tblproperties.
    throw new UnsupportedOperationException("No support for updateLastCommitTimeSynced yet.");
  }

  @Override
  public Option<String> getLastReplicatedTime(String tableName) {
    // Unsupported: bigQuery doesn't support tblproperties.
    throw new UnsupportedOperationException("Not support getLastReplicatedTime yet.");
  }

  @Override
  public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
    // Unsupported: bigQuery doesn't support tblproperties.
    throw new UnsupportedOperationException("No support for updateLastReplicatedTimeStamp yet.");
  }

  @Override
  public void deleteLastReplicatedTimeStamp(String tableName) {
    // Unsupported: bigQuery doesn't support tblproperties.
    throw new UnsupportedOperationException("No support for deleteLastReplicatedTimeStamp yet.");
  }

  @Override
  public void updatePartitionsToTable(final String tableName, final List<String> changedPartitions) {
    // Not supported: see addPartitionsToTable — partition handling is driven by
    // HivePartitioningOptions, not explicit registration.
    throw new UnsupportedOperationException("No support for updatePartitionsToTable yet.");
  }

  @Override
  public void close() {
    // bigQuery has no connection close method, so do nothing.
  }
}
|
||||
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.gcp.bigquery;
|
||||
|
||||
/**
 * Unchecked exception raised when syncing a Hudi table into BigQuery fails.
 */
public class HoodieBigQuerySyncException extends RuntimeException {

  /** Creates an exception with neither message nor cause. */
  public HoodieBigQuerySyncException() {
  }

  /** Creates an exception carrying only a detail message. */
  public HoodieBigQuerySyncException(String message) {
    super(message);
  }

  /** Creates an exception with a detail message and an underlying cause. */
  public HoodieBigQuerySyncException(String message, Throwable t) {
    super(message, t);
  }

  /** Wraps an underlying cause without an explicit message. */
  public HoodieBigQuerySyncException(Throwable t) {
    super(t);
  }

  /**
   * Formats a message template with the given arguments; a null template is
   * rendered as the string "null" rather than throwing.
   */
  protected static String format(String message, Object... args) {
    final String template = String.valueOf(message);
    return String.format(template, args);
  }
}
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.gcp.bigquery;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PARTITION_FIELDS;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PROJECT_ID;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI_PREFIX;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME;
|
||||
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class TestBigQuerySyncConfig {
|
||||
|
||||
BigQuerySyncConfig syncConfig;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
syncConfig = new BigQuerySyncConfig();
|
||||
syncConfig.projectId = "fooproject";
|
||||
syncConfig.datasetName = "foodataset";
|
||||
syncConfig.datasetLocation = "US";
|
||||
syncConfig.tableName = "footable";
|
||||
syncConfig.sourceUri = "gs://test-bucket/dwh/table_name/dt=*";
|
||||
syncConfig.sourceUriPrefix = "gs://test-bucket/dwh/table_name/";
|
||||
syncConfig.basePath = "gs://test-bucket/dwh/table_name";
|
||||
syncConfig.partitionFields = Arrays.asList("a", "b");
|
||||
syncConfig.useFileListingFromMetadata = true;
|
||||
syncConfig.assumeDatePartitioning = true;
|
||||
syncConfig.help = true;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCopy() {
|
||||
BigQuerySyncConfig copied = BigQuerySyncConfig.copy(syncConfig);
|
||||
assertEquals(copied.partitionFields, syncConfig.partitionFields);
|
||||
assertEquals(copied.basePath, syncConfig.basePath);
|
||||
assertEquals(copied.projectId, syncConfig.projectId);
|
||||
assertEquals(copied.datasetName, syncConfig.datasetName);
|
||||
assertEquals(copied.datasetLocation, syncConfig.datasetLocation);
|
||||
assertEquals(copied.tableName, syncConfig.tableName);
|
||||
assertEquals(copied.sourceUri, syncConfig.sourceUri);
|
||||
assertEquals(copied.sourceUriPrefix, syncConfig.sourceUriPrefix);
|
||||
assertEquals(copied.useFileListingFromMetadata, syncConfig.useFileListingFromMetadata);
|
||||
assertEquals(copied.assumeDatePartitioning, syncConfig.assumeDatePartitioning);
|
||||
assertEquals(copied.help, syncConfig.help);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testToProps() {
|
||||
TypedProperties props = syncConfig.toProps();
|
||||
assertEquals("fooproject", props.getString(BIGQUERY_SYNC_PROJECT_ID));
|
||||
assertEquals("foodataset", props.getString(BIGQUERY_SYNC_DATASET_NAME));
|
||||
assertEquals("US", props.getString(BIGQUERY_SYNC_DATASET_LOCATION));
|
||||
assertEquals("footable", props.getString(BIGQUERY_SYNC_TABLE_NAME));
|
||||
assertEquals("gs://test-bucket/dwh/table_name/dt=*", props.getString(BIGQUERY_SYNC_SOURCE_URI));
|
||||
assertEquals("gs://test-bucket/dwh/table_name/", props.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX));
|
||||
assertEquals("gs://test-bucket/dwh/table_name", props.getString(BIGQUERY_SYNC_SYNC_BASE_PATH));
|
||||
assertEquals("a,b", props.getString(BIGQUERY_SYNC_PARTITION_FIELDS));
|
||||
assertEquals("true", props.getString(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA));
|
||||
assertEquals("true", props.getString(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fromProps() {
|
||||
TypedProperties props = new TypedProperties();
|
||||
props.put(BIGQUERY_SYNC_PROJECT_ID, "fooproject");
|
||||
props.put(BIGQUERY_SYNC_DATASET_NAME, "foodataset");
|
||||
props.put(BIGQUERY_SYNC_DATASET_LOCATION, "US");
|
||||
props.put(BIGQUERY_SYNC_TABLE_NAME, "footable");
|
||||
props.put(BIGQUERY_SYNC_SOURCE_URI, "gs://test-bucket/dwh/table_name/dt=*");
|
||||
props.put(BIGQUERY_SYNC_SOURCE_URI_PREFIX, "gs://test-bucket/dwh/table_name/");
|
||||
props.put(BIGQUERY_SYNC_SYNC_BASE_PATH, "gs://test-bucket/dwh/table_name");
|
||||
props.put(BIGQUERY_SYNC_PARTITION_FIELDS, "a,b");
|
||||
props.put(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, true);
|
||||
props.put(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, true);
|
||||
BigQuerySyncConfig cfg = BigQuerySyncConfig.fromProps(props);
|
||||
|
||||
assertEquals(syncConfig.projectId, cfg.projectId);
|
||||
assertEquals(syncConfig.datasetName, cfg.datasetName);
|
||||
assertEquals(syncConfig.datasetLocation, cfg.datasetLocation);
|
||||
assertEquals(syncConfig.tableName, cfg.tableName);
|
||||
assertEquals(syncConfig.sourceUri, cfg.sourceUri);
|
||||
assertEquals(syncConfig.sourceUriPrefix, cfg.sourceUriPrefix);
|
||||
assertEquals(syncConfig.basePath, cfg.basePath);
|
||||
assertEquals(syncConfig.partitionFields, cfg.partitionFields);
|
||||
assertEquals(syncConfig.useFileListingFromMetadata, cfg.useFileListingFromMetadata);
|
||||
assertEquals(syncConfig.assumeDatePartitioning, cfg.assumeDatePartitioning);
|
||||
}
|
||||
}
|
||||
29
hudi-gcp/src/test/resources/log4j-surefire-quiet.properties
Normal file
29
hudi-gcp/src/test/resources/log4j-surefire-quiet.properties
Normal file
@@ -0,0 +1,29 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=ERROR, CONSOLE
|
||||
log4j.logger.org.apache.hudi=ERROR
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
29
hudi-gcp/src/test/resources/log4j-surefire.properties
Normal file
29
hudi-gcp/src/test/resources/log4j-surefire.properties
Normal file
@@ -0,0 +1,29 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=WARN, CONSOLE
|
||||
log4j.logger.org.apache.hudi=INFO
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
Reference in New Issue
Block a user