1
0

Improving out of box experience for data source

- Fixes #246
 - Bump up default parallelism to 1500, to handle large upserts
 - Add docs on S3 configuration & tuning tips with tested Spark knobs
 - Fix bug to not duplicate hoodie metadata fields when input dataframe is another hoodie dataset
 - Improve speed of ROTablePathFilter by removing directory check
 - Move to spark-avro 4.0 to handle issue with nested fields with same name
 - Keep AvroConversionUtils in sync with spark-avro 4.0
This commit is contained in:
Vinoth Chandar
2018-01-05 14:06:18 -08:00
committed by vinoth chandar
parent a97814462d
commit 85dd265b7b
8 changed files with 112 additions and 19 deletions

View File

@@ -20,7 +20,6 @@ import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.DatasetNotFoundException;
import com.uber.hoodie.exception.HoodieException;
import java.io.Serializable;
@@ -61,6 +60,9 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
private HashSet<String> nonHoodiePathCache;
private transient FileSystem fs;
public HoodieROTablePathFilter() {
hoodiePathCache = new HashMap<>();
nonHoodiePathCache = new HashSet<>();
@@ -79,7 +81,6 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
return null;
}
@Override
public boolean accept(Path path) {
@@ -88,9 +89,8 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
}
Path folder = null;
try {
FileSystem fs = path.getFileSystem(FSUtils.prepareHadoopConf(new Configuration()));
if (fs.isDirectory(path)) {
return true;
if (fs == null) {
fs = path.getFileSystem(new Configuration());
}
// Assumes path is a file