Improving out of box experience for data source
- Fixes #246
- Bump up default parallelism to 1500, to handle large upserts
- Add docs on S3 configuration & tuning tips with tested Spark knobs
- Fix bug to not duplicate hoodie metadata fields when input dataframe is another hoodie dataset
- Improve speed of ROTablePathFilter by removing directory check
- Move to spark-avro 4.0 to handle issue with nested fields with same name
- Keep AvroConversionUtils in sync with spark-avro 4.0
This commit is contained in:
committed by
vinoth chandar
parent
a97814462d
commit
85dd265b7b
@@ -20,7 +20,6 @@ import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.exception.DatasetNotFoundException;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
import java.io.Serializable;
|
||||
@@ -61,6 +60,9 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
|
||||
private HashSet<String> nonHoodiePathCache;
|
||||
|
||||
|
||||
private transient FileSystem fs;
|
||||
|
||||
|
||||
public HoodieROTablePathFilter() {
|
||||
hoodiePathCache = new HashMap<>();
|
||||
nonHoodiePathCache = new HashSet<>();
|
||||
@@ -79,7 +81,6 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean accept(Path path) {
|
||||
|
||||
@@ -88,9 +89,8 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
|
||||
}
|
||||
Path folder = null;
|
||||
try {
|
||||
FileSystem fs = path.getFileSystem(FSUtils.prepareHadoopConf(new Configuration()));
|
||||
if (fs.isDirectory(path)) {
|
||||
return true;
|
||||
if (fs == null) {
|
||||
fs = path.getFileSystem(new Configuration());
|
||||
}
|
||||
|
||||
// Assumes path is a file
|
||||
|
||||
Reference in New Issue
Block a user