1
0

[HUDI-3357] MVP implementation of BigQuerySyncTool (#5125)

Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
Vinoth Govindarajan
2022-04-02 13:18:06 -07:00
committed by GitHub
parent c19f505b5a
commit 20964df770
16 changed files with 1328 additions and 214 deletions

View File

@@ -0,0 +1,46 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3 http://maven.apache.org/xsd/assembly-1.1.3.xsd">
<id>jar-with-dependencies</id>
<formats>
<format>jar</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<outputDirectory>/</outputDirectory>
<unpack>true</unpack>
<scope>runtime</scope>
<excludes>
<exclude>junit:junit</exclude>
<exclude>com.google.code.findbugs:*</exclude>
<exclude>org.apache.hbase:*</exclude>
</excludes>
</dependencySet>
<dependencySet>
<unpack>true</unpack>
<scope>provided</scope>
</dependencySet>
</dependencySets>
</assembly>

View File

@@ -0,0 +1,131 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.gcp.bigquery;
import org.apache.hudi.common.config.TypedProperties;
import com.beust.jcommander.Parameter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Configs needed to sync data into BigQuery.
 *
 * <p>An instance can be populated from command line arguments (via the JCommander
 * {@code @Parameter} annotations) or from a {@link TypedProperties} bag using
 * {@link #fromProps(TypedProperties)}, and converted back with {@link #toProps()}.
 */
public class BigQuerySyncConfig implements Serializable {

  // Property keys used when this config is carried around as TypedProperties.
  public static final String BIGQUERY_SYNC_PROJECT_ID = "hoodie.gcp.bigquery.sync.project_id";
  public static final String BIGQUERY_SYNC_DATASET_NAME = "hoodie.gcp.bigquery.sync.dataset_name";
  public static final String BIGQUERY_SYNC_DATASET_LOCATION = "hoodie.gcp.bigquery.sync.dataset_location";
  public static final String BIGQUERY_SYNC_TABLE_NAME = "hoodie.gcp.bigquery.sync.table_name";
  public static final String BIGQUERY_SYNC_SOURCE_URI = "hoodie.gcp.bigquery.sync.source_uri";
  public static final String BIGQUERY_SYNC_SOURCE_URI_PREFIX = "hoodie.gcp.bigquery.sync.source_uri_prefix";
  public static final String BIGQUERY_SYNC_SYNC_BASE_PATH = "hoodie.gcp.bigquery.sync.base_path";
  public static final String BIGQUERY_SYNC_PARTITION_FIELDS = "hoodie.gcp.bigquery.sync.partition_fields";
  public static final String BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA = "hoodie.gcp.bigquery.sync.use_file_listing_from_metadata";
  public static final String BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = "hoodie.gcp.bigquery.sync.assume_date_partitioning";

  @Parameter(names = {"--project-id"}, description = "name of the target project in BigQuery", required = true)
  public String projectId;
  @Parameter(names = {"--dataset-name"}, description = "name of the target dataset in BigQuery", required = true)
  public String datasetName;
  @Parameter(names = {"--dataset-location"}, description = "location of the target dataset in BigQuery", required = true)
  public String datasetLocation;
  @Parameter(names = {"--table-name"}, description = "name of the target table in BigQuery", required = true)
  public String tableName;
  @Parameter(names = {"--source-uri"}, description = "name of the source uri gcs path of the table", required = true)
  public String sourceUri;
  @Parameter(names = {"--source-uri-prefix"}, description = "name of the source uri gcs path prefix of the table", required = true)
  public String sourceUriPrefix;
  @Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
  public String basePath;
  @Parameter(names = {"--partitioned-by"}, description = "Comma-delimited partition fields. Default to non-partitioned.")
  public List<String> partitionFields = new ArrayList<>();
  @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
  public Boolean useFileListingFromMetadata = false;
  @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
      + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
  public Boolean assumeDatePartitioning = false;
  @Parameter(names = {"--help", "-h"}, help = true)
  public Boolean help = false;

  /**
   * Returns a shallow copy of the given config.
   *
   * <p>Note: {@code partitionFields} is shared with the source, not deep-copied.
   */
  public static BigQuerySyncConfig copy(BigQuerySyncConfig cfg) {
    BigQuerySyncConfig newConfig = new BigQuerySyncConfig();
    newConfig.projectId = cfg.projectId;
    newConfig.datasetName = cfg.datasetName;
    newConfig.datasetLocation = cfg.datasetLocation;
    newConfig.tableName = cfg.tableName;
    newConfig.sourceUri = cfg.sourceUri;
    newConfig.sourceUriPrefix = cfg.sourceUriPrefix;
    newConfig.basePath = cfg.basePath;
    newConfig.partitionFields = cfg.partitionFields;
    newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
    newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;
    newConfig.help = cfg.help;
    return newConfig;
  }

  /**
   * Serializes this config into a {@link TypedProperties} bag keyed by the
   * {@code BIGQUERY_SYNC_*} constants. Partition fields are joined with commas.
   */
  public TypedProperties toProps() {
    TypedProperties properties = new TypedProperties();
    properties.put(BIGQUERY_SYNC_PROJECT_ID, projectId);
    properties.put(BIGQUERY_SYNC_DATASET_NAME, datasetName);
    properties.put(BIGQUERY_SYNC_DATASET_LOCATION, datasetLocation);
    properties.put(BIGQUERY_SYNC_TABLE_NAME, tableName);
    properties.put(BIGQUERY_SYNC_SOURCE_URI, sourceUri);
    properties.put(BIGQUERY_SYNC_SOURCE_URI_PREFIX, sourceUriPrefix);
    properties.put(BIGQUERY_SYNC_SYNC_BASE_PATH, basePath);
    properties.put(BIGQUERY_SYNC_PARTITION_FIELDS, String.join(",", partitionFields));
    properties.put(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, useFileListingFromMetadata);
    properties.put(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, assumeDatePartitioning);
    return properties;
  }

  /**
   * Deserializes a config from a {@link TypedProperties} bag; the two boolean
   * flags default to {@code false} and partition fields to an empty list when absent.
   */
  public static BigQuerySyncConfig fromProps(TypedProperties props) {
    BigQuerySyncConfig config = new BigQuerySyncConfig();
    config.projectId = props.getString(BIGQUERY_SYNC_PROJECT_ID);
    config.datasetName = props.getString(BIGQUERY_SYNC_DATASET_NAME);
    config.datasetLocation = props.getString(BIGQUERY_SYNC_DATASET_LOCATION);
    config.tableName = props.getString(BIGQUERY_SYNC_TABLE_NAME);
    config.sourceUri = props.getString(BIGQUERY_SYNC_SOURCE_URI);
    config.sourceUriPrefix = props.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX);
    config.basePath = props.getString(BIGQUERY_SYNC_SYNC_BASE_PATH);
    config.partitionFields = props.getStringList(BIGQUERY_SYNC_PARTITION_FIELDS, ",", Collections.emptyList());
    config.useFileListingFromMetadata = props.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, false);
    config.assumeDatePartitioning = props.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, false);
    return config;
  }

  @Override
  public String toString() {
    // Fixed: field name typo (assumeDataPartitioning) and stray quote
    // characters around non-string fields in the original rendering.
    return "BigQuerySyncConfig{projectId='" + projectId
        + "', datasetName='" + datasetName
        + "', datasetLocation='" + datasetLocation
        + "', tableName='" + tableName
        + "', sourceUri='" + sourceUri
        + "', sourceUriPrefix='" + sourceUriPrefix
        + "', basePath='" + basePath
        + "', partitionFields=" + partitionFields
        + ", useFileListingFromMetadata=" + useFileListingFromMetadata
        + ", assumeDatePartitioning=" + assumeDatePartitioning
        + ", help=" + help + "}";
  }
}

View File

@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.gcp.bigquery;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.hudi.sync.common.util.ManifestFileWriter;
import com.beust.jcommander.JCommander;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/**
 * Tool to sync a hoodie table with a big query table. Either use it as an api
 * BigQuerySyncTool.syncHoodieTable(BigQuerySyncConfig) or as a command line
 * java -cp &lt;hudi-gcp bundle jar&gt; BigQuerySyncTool [args]
 * <p>
 * This utility will get the schema from the latest commit and will sync big query table schema.
 *
 * @Experimental
 */
public class BigQuerySyncTool extends AbstractSyncTool {

  private static final Logger LOG = LogManager.getLogger(BigQuerySyncTool.class);

  public final BigQuerySyncConfig cfg;
  // Derived BigQuery object names: "<table>_manifest" and "<table>_versions"
  // back the user-facing snapshot view, which is named after the Hudi table.
  public final String manifestTableName;
  public final String versionsTableName;
  public final String snapshotViewName;

  public BigQuerySyncTool(TypedProperties properties, Configuration conf, FileSystem fs) {
    super(properties, conf, fs);
    cfg = BigQuerySyncConfig.fromProps(properties);
    manifestTableName = cfg.tableName + "_manifest";
    versionsTableName = cfg.tableName + "_versions";
    snapshotViewName = cfg.tableName;
  }

  /**
   * Syncs the Hudi table to BigQuery. Only COPY_ON_WRITE tables are supported;
   * any failure is rethrown wrapped in {@link HoodieBigQuerySyncException}.
   */
  @Override
  public void syncHoodieTable() {
    try (HoodieBigQuerySyncClient bqSyncClient = new HoodieBigQuerySyncClient(BigQuerySyncConfig.fromProps(props), fs)) {
      switch (bqSyncClient.getTableType()) {
        case COPY_ON_WRITE:
          syncCoWTable(bqSyncClient);
          break;
        case MERGE_ON_READ:
        default:
          throw new UnsupportedOperationException(bqSyncClient.getTableType() + " table type is not supported yet.");
      }
    } catch (Exception e) {
      throw new HoodieBigQuerySyncException("Got runtime exception when big query syncing " + cfg.tableName, e);
    }
  }

  /**
   * Writes the manifest file and (idempotently) creates the manifest table,
   * versions table and snapshot view if they do not exist yet.
   */
  private void syncCoWTable(HoodieBigQuerySyncClient bqSyncClient) {
    ValidationUtils.checkState(bqSyncClient.getTableType() == HoodieTableType.COPY_ON_WRITE);
    LOG.info("Sync hoodie table " + snapshotViewName + " at base path " + bqSyncClient.getBasePath());
    if (!bqSyncClient.datasetExists()) {
      throw new HoodieBigQuerySyncException("Dataset not found: " + cfg);
    }
    ManifestFileWriter manifestFileWriter = ManifestFileWriter.builder()
        .setConf(conf)
        .setBasePath(cfg.basePath)
        .setUseFileListingFromMetadata(cfg.useFileListingFromMetadata)
        .setAssumeDatePartitioning(cfg.assumeDatePartitioning)
        .build();
    manifestFileWriter.writeManifestFile();
    if (!bqSyncClient.tableExists(manifestTableName)) {
      bqSyncClient.createManifestTable(manifestTableName, manifestFileWriter.getManifestSourceUri());
      LOG.info("Manifest table creation complete for " + manifestTableName);
    }
    if (!bqSyncClient.tableExists(versionsTableName)) {
      bqSyncClient.createVersionsTable(versionsTableName, cfg.sourceUri, cfg.sourceUriPrefix, cfg.partitionFields);
      LOG.info("Versions table creation complete for " + versionsTableName);
    }
    if (!bqSyncClient.tableExists(snapshotViewName)) {
      bqSyncClient.createSnapshotView(snapshotViewName, versionsTableName, manifestTableName);
      LOG.info("Snapshot view creation complete for " + snapshotViewName);
    }
    // TODO: Implement automatic schema evolution when you add a new column.
    LOG.info("Sync table complete for " + snapshotViewName);
  }

  public static void main(String[] args) {
    BigQuerySyncConfig cfg = new BigQuerySyncConfig();
    JCommander cmd = JCommander.newBuilder().addObject(cfg).build();
    // Print usage BEFORE parsing: parsing empty args would throw on the
    // required parameters and the user would never see the help text.
    if (args.length == 0) {
      cmd.usage();
      System.exit(1);
    }
    cmd.parse(args);
    if (cfg.help) {
      cmd.usage();
      System.exit(1);
    }
    FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
    new BigQuerySyncTool(cfg.toProps(), fs.getConf(), fs).syncHoodieTable();
  }
}

View File

@@ -0,0 +1,243 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.gcp.bigquery;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.CsvOptions;
import com.google.cloud.bigquery.Dataset;
import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.bigquery.ExternalTableDefinition;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FormatOptions;
import com.google.cloud.bigquery.HivePartitioningOptions;
import com.google.cloud.bigquery.Schema;
import com.google.cloud.bigquery.StandardSQLTypeName;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TableInfo;
import com.google.cloud.bigquery.ViewDefinition;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.schema.MessageType;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
 * Sync client that creates and inspects the BigQuery objects backing a Hudi table:
 * a CSV-manifest external table, a parquet "versions" external table, and a
 * snapshot view that filters the versions table through the manifest.
 *
 * <p>Many {@code AbstractSyncHoodieClient} operations (partitions, tblproperties)
 * have no BigQuery equivalent and throw {@link UnsupportedOperationException}.
 */
public class HoodieBigQuerySyncClient extends AbstractSyncHoodieClient {
  private static final Logger LOG = LogManager.getLogger(HoodieBigQuerySyncClient.class);
  private final BigQuerySyncConfig syncConfig;
  // transient: the BigQuery service handle is not serializable; created once in the constructor.
  private transient BigQuery bigquery;
  public HoodieBigQuerySyncClient(final BigQuerySyncConfig syncConfig, final FileSystem fs) {
    super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata,
        false, fs);
    this.syncConfig = syncConfig;
    this.createBigQueryConnection();
  }
  // Lazily builds the BigQuery service client pinned to the configured dataset location.
  private void createBigQueryConnection() {
    if (bigquery == null) {
      try {
        // Initialize client that will be used to send requests. This client only needs to be created
        // once, and can be reused for multiple requests.
        bigquery = BigQueryOptions.newBuilder().setLocation(syncConfig.datasetLocation).build().getService();
        LOG.info("Successfully established BigQuery connection.");
      } catch (BigQueryException e) {
        throw new HoodieBigQuerySyncException("Cannot create bigQuery connection ", e);
      }
    }
  }
  @Override
  public void createTable(final String tableName, final MessageType storageSchema, final String inputFormatClass,
                          final String outputFormatClass, final String serdeClass,
                          final Map<String, String> serdeProperties, final Map<String, String> tableProperties) {
    // BigQuery table creation takes different arguments; tables are created via
    // createManifestTable/createVersionsTable instead, so this is intentionally a no-op.
  }
  /**
   * Creates an external table over the manifest CSV at {@code sourceUri} with a
   * single STRING column named "filename".
   */
  public void createManifestTable(String tableName, String sourceUri) {
    try {
      TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName);
      CsvOptions csvOptions = CsvOptions.newBuilder()
          .setFieldDelimiter(",")
          .setAllowJaggedRows(false)
          .setAllowQuotedNewLines(false)
          .setSkipLeadingRows(0)
          .build();
      Schema schema = Schema.of(
          Field.of("filename", StandardSQLTypeName.STRING));
      ExternalTableDefinition customTable =
          ExternalTableDefinition.newBuilder(sourceUri, schema, csvOptions)
              .setAutodetect(false)
              .setIgnoreUnknownValues(false)
              .setMaxBadRecords(0)
              .build();
      bigquery.create(TableInfo.of(tableId, customTable));
      LOG.info("Manifest External table created.");
    } catch (BigQueryException e) {
      throw new HoodieBigQuerySyncException("Manifest External table was not created ", e);
    }
  }
  /**
   * Creates an external parquet table over all file versions at {@code sourceUri}.
   * When partition fields are given, hive-style partitioning is enabled in AUTO
   * mode with {@code sourceUriPrefix} as the partition root.
   */
  public void createVersionsTable(String tableName, String sourceUri, String sourceUriPrefix, List<String> partitionFields) {
    try {
      ExternalTableDefinition customTable;
      TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName);
      if (partitionFields.isEmpty()) {
        customTable =
            ExternalTableDefinition.newBuilder(sourceUri, FormatOptions.parquet())
                .setAutodetect(true)
                .setIgnoreUnknownValues(true)
                .setMaxBadRecords(0)
                .build();
      } else {
        // Configuring partitioning options for partitioned table.
        HivePartitioningOptions hivePartitioningOptions =
            HivePartitioningOptions.newBuilder()
                .setMode("AUTO")
                .setRequirePartitionFilter(false)
                .setSourceUriPrefix(sourceUriPrefix)
                .build();
        customTable =
            ExternalTableDefinition.newBuilder(sourceUri, FormatOptions.parquet())
                .setAutodetect(true)
                .setHivePartitioningOptions(hivePartitioningOptions)
                .setIgnoreUnknownValues(true)
                .setMaxBadRecords(0)
                .build();
      }
      bigquery.create(TableInfo.of(tableId, customTable));
      LOG.info("External table created using hivepartitioningoptions");
    } catch (BigQueryException e) {
      throw new HoodieBigQuerySyncException("External table was not created ", e);
    }
  }
  /**
   * Creates the user-facing snapshot view: rows of the versions table whose
   * {@code _hoodie_file_name} appears in the manifest (i.e. the latest file slices).
   */
  public void createSnapshotView(String viewName, String versionsTableName, String manifestTableName) {
    try {
      TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, viewName);
      String query =
          String.format(
              "SELECT * FROM `%s.%s.%s` WHERE _hoodie_file_name IN "
                  + "(SELECT filename FROM `%s.%s.%s`)",
              syncConfig.projectId,
              syncConfig.datasetName,
              versionsTableName,
              syncConfig.projectId,
              syncConfig.datasetName,
              manifestTableName);
      ViewDefinition viewDefinition =
          ViewDefinition.newBuilder(query).setUseLegacySql(false).build();
      bigquery.create(TableInfo.of(tableId, viewDefinition));
      LOG.info("View created successfully");
    } catch (BigQueryException e) {
      throw new HoodieBigQuerySyncException("View was not created ", e);
    }
  }
  @Override
  public Map<String, String> getTableSchema(String tableName) {
    // TODO: Implement automatic schema evolution when you add a new column.
    return Collections.emptyMap();
  }
  @Override
  public void addPartitionsToTable(final String tableName, final List<String> partitionsToAdd) {
    // BigQuery discovers new partitions automatically, so explicit partition
    // registration is unnecessary and intentionally not implemented.
    throw new UnsupportedOperationException("No support for addPartitionsToTable yet.");
  }
  @Override
  public void dropPartitionsToTable(final String tableName, final List<String> partitionsToDrop) {
    // BigQuery manages partition removal automatically; not implemented.
    throw new UnsupportedOperationException("No support for dropPartitionsToTable yet.");
  }
  /** Returns true if the configured dataset exists in the configured project. */
  public boolean datasetExists() {
    Dataset dataset = bigquery.getDataset(DatasetId.of(syncConfig.projectId, syncConfig.datasetName));
    return dataset != null;
  }
  @Override
  public boolean doesTableExist(final String tableName) {
    return tableExists(tableName);
  }
  @Override
  public boolean tableExists(String tableName) {
    // Fetch with no fields selected — we only need existence, not metadata.
    TableId tableId = TableId.of(syncConfig.projectId, syncConfig.datasetName, tableName);
    Table table = bigquery.getTable(tableId, BigQuery.TableOption.fields());
    return table != null && table.exists();
  }
  @Override
  public Option<String> getLastCommitTimeSynced(final String tableName) {
    // BigQuery has no tblproperties to store the sync watermark in.
    throw new UnsupportedOperationException("Not support getLastCommitTimeSynced yet.");
  }
  @Override
  public void updateLastCommitTimeSynced(final String tableName) {
    // BigQuery has no tblproperties to store the sync watermark in.
    throw new UnsupportedOperationException("No support for updateLastCommitTimeSynced yet.");
  }
  @Override
  public Option<String> getLastReplicatedTime(String tableName) {
    // BigQuery has no tblproperties to store the replicated timestamp in.
    throw new UnsupportedOperationException("Not support getLastReplicatedTime yet.");
  }
  @Override
  public void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
    // BigQuery has no tblproperties to store the replicated timestamp in.
    throw new UnsupportedOperationException("No support for updateLastReplicatedTimeStamp yet.");
  }
  @Override
  public void deleteLastReplicatedTimeStamp(String tableName) {
    // BigQuery has no tblproperties to store the replicated timestamp in.
    throw new UnsupportedOperationException("No support for deleteLastReplicatedTimeStamp yet.");
  }
  @Override
  public void updatePartitionsToTable(final String tableName, final List<String> changedPartitions) {
    // BigQuery updates the partitions automatically; not implemented.
    throw new UnsupportedOperationException("No support for updatePartitionsToTable yet.");
  }
  @Override
  public void close() {
    // The BigQuery client has no close/shutdown method, so nothing to release.
  }
}

View File

@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.gcp.bigquery;
/**
 * Unchecked exception signaling a failure while syncing a Hudi table to BigQuery.
 */
public class HoodieBigQuerySyncException extends RuntimeException {

  public HoodieBigQuerySyncException() {
  }

  public HoodieBigQuerySyncException(String message) {
    super(message);
  }

  public HoodieBigQuerySyncException(String message, Throwable cause) {
    super(message, cause);
  }

  public HoodieBigQuerySyncException(Throwable cause) {
    super(cause);
  }

  /**
   * Formats {@code message} with {@link String#format}; a {@code null} message
   * is rendered as the literal string "null" rather than throwing.
   */
  protected static String format(String message, Object... args) {
    final String template = String.valueOf(message);
    return String.format(template, args);
  }
}

View File

@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.gcp.bigquery;
import org.apache.hudi.common.config.TypedProperties;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PROJECT_ID;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI_PREFIX;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SYNC_BASE_PATH;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Unit tests for {@link BigQuerySyncConfig}: copy semantics and round-tripping
 * through {@link org.apache.hudi.common.config.TypedProperties}.
 */
public class TestBigQuerySyncConfig {

  BigQuerySyncConfig syncConfig;

  @BeforeEach
  void setUp() {
    syncConfig = new BigQuerySyncConfig();
    syncConfig.projectId = "fooproject";
    syncConfig.datasetName = "foodataset";
    syncConfig.datasetLocation = "US";
    syncConfig.tableName = "footable";
    syncConfig.sourceUri = "gs://test-bucket/dwh/table_name/dt=*";
    syncConfig.sourceUriPrefix = "gs://test-bucket/dwh/table_name/";
    syncConfig.basePath = "gs://test-bucket/dwh/table_name";
    syncConfig.partitionFields = Arrays.asList("a", "b");
    syncConfig.useFileListingFromMetadata = true;
    syncConfig.assumeDatePartitioning = true;
    syncConfig.help = true;
  }

  @Test
  public void testCopy() {
    BigQuerySyncConfig copied = BigQuerySyncConfig.copy(syncConfig);
    // JUnit convention: expected value first, actual value second.
    assertEquals(syncConfig.partitionFields, copied.partitionFields);
    assertEquals(syncConfig.basePath, copied.basePath);
    assertEquals(syncConfig.projectId, copied.projectId);
    assertEquals(syncConfig.datasetName, copied.datasetName);
    assertEquals(syncConfig.datasetLocation, copied.datasetLocation);
    assertEquals(syncConfig.tableName, copied.tableName);
    assertEquals(syncConfig.sourceUri, copied.sourceUri);
    assertEquals(syncConfig.sourceUriPrefix, copied.sourceUriPrefix);
    assertEquals(syncConfig.useFileListingFromMetadata, copied.useFileListingFromMetadata);
    assertEquals(syncConfig.assumeDatePartitioning, copied.assumeDatePartitioning);
    assertEquals(syncConfig.help, copied.help);
  }

  @Test
  public void testToProps() {
    TypedProperties props = syncConfig.toProps();
    assertEquals("fooproject", props.getString(BIGQUERY_SYNC_PROJECT_ID));
    assertEquals("foodataset", props.getString(BIGQUERY_SYNC_DATASET_NAME));
    assertEquals("US", props.getString(BIGQUERY_SYNC_DATASET_LOCATION));
    assertEquals("footable", props.getString(BIGQUERY_SYNC_TABLE_NAME));
    assertEquals("gs://test-bucket/dwh/table_name/dt=*", props.getString(BIGQUERY_SYNC_SOURCE_URI));
    assertEquals("gs://test-bucket/dwh/table_name/", props.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX));
    assertEquals("gs://test-bucket/dwh/table_name", props.getString(BIGQUERY_SYNC_SYNC_BASE_PATH));
    assertEquals("a,b", props.getString(BIGQUERY_SYNC_PARTITION_FIELDS));
    assertEquals("true", props.getString(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA));
    assertEquals("true", props.getString(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
  }

  @Test
  public void testFromProps() {
    TypedProperties props = new TypedProperties();
    props.put(BIGQUERY_SYNC_PROJECT_ID, "fooproject");
    props.put(BIGQUERY_SYNC_DATASET_NAME, "foodataset");
    props.put(BIGQUERY_SYNC_DATASET_LOCATION, "US");
    props.put(BIGQUERY_SYNC_TABLE_NAME, "footable");
    props.put(BIGQUERY_SYNC_SOURCE_URI, "gs://test-bucket/dwh/table_name/dt=*");
    props.put(BIGQUERY_SYNC_SOURCE_URI_PREFIX, "gs://test-bucket/dwh/table_name/");
    props.put(BIGQUERY_SYNC_SYNC_BASE_PATH, "gs://test-bucket/dwh/table_name");
    props.put(BIGQUERY_SYNC_PARTITION_FIELDS, "a,b");
    props.put(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA, true);
    props.put(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING, true);
    BigQuerySyncConfig cfg = BigQuerySyncConfig.fromProps(props);
    assertEquals(syncConfig.projectId, cfg.projectId);
    assertEquals(syncConfig.datasetName, cfg.datasetName);
    assertEquals(syncConfig.datasetLocation, cfg.datasetLocation);
    assertEquals(syncConfig.tableName, cfg.tableName);
    assertEquals(syncConfig.sourceUri, cfg.sourceUri);
    assertEquals(syncConfig.sourceUriPrefix, cfg.sourceUriPrefix);
    assertEquals(syncConfig.basePath, cfg.basePath);
    assertEquals(syncConfig.partitionFields, cfg.partitionFields);
    assertEquals(syncConfig.useFileListingFromMetadata, cfg.useFileListingFromMetadata);
    assertEquals(syncConfig.assumeDatePartitioning, cfg.assumeDatePartitioning);
  }
}

View File

@@ -0,0 +1,29 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=ERROR, CONSOLE
log4j.logger.org.apache.hudi=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL

View File

@@ -0,0 +1,29 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=INFO
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL