- Fix realtime queries by removing COLUMN_ID and COLUMN_NAME cache in inputformat (#814)
- Hive on Spark will NOT work for RT tables after this patch
This commit is contained in:
@@ -69,15 +69,10 @@ public class HoodieRealtimeInputFormat extends HoodieInputFormat implements Conf
|
|||||||
public static final int HOODIE_COMMIT_TIME_COL_POS = 0;
|
public static final int HOODIE_COMMIT_TIME_COL_POS = 0;
|
||||||
public static final int HOODIE_RECORD_KEY_COL_POS = 2;
|
public static final int HOODIE_RECORD_KEY_COL_POS = 2;
|
||||||
public static final int HOODIE_PARTITION_PATH_COL_POS = 3;
|
public static final int HOODIE_PARTITION_PATH_COL_POS = 3;
|
||||||
// Track the read column ids and names to be used throughout the execution and lifetime of this task
|
// Hive on Spark queries do not work with RT tables. Our theory is that due to
|
||||||
// Needed for Hive on Spark. Our theory is that due to
|
|
||||||
// {@link org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher}
|
// {@link org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher}
|
||||||
// not handling empty list correctly, the ParquetRecordReaderWrapper ends up adding the same column ids multiple
|
// not handling empty list correctly, the ParquetRecordReaderWrapper ends up adding the same column ids multiple
|
||||||
// times which ultimately breaks the query.
|
// times which ultimately breaks the query.
|
||||||
// TODO : Find why RO view works fine but RT doesn't, JIRA: https://issues.apache.org/jira/browse/HUDI-151
|
|
||||||
public static String READ_COLUMN_IDS;
|
|
||||||
public static String READ_COLUMN_NAMES;
|
|
||||||
public static boolean isReadColumnsSet = false;
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
|
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
|
||||||
@@ -208,11 +203,6 @@ public class HoodieRealtimeInputFormat extends HoodieInputFormat implements Conf
|
|||||||
HOODIE_COMMIT_TIME_COL_POS);
|
HOODIE_COMMIT_TIME_COL_POS);
|
||||||
configuration = addProjectionField(configuration, HoodieRecord.PARTITION_PATH_METADATA_FIELD,
|
configuration = addProjectionField(configuration, HoodieRecord.PARTITION_PATH_METADATA_FIELD,
|
||||||
HOODIE_PARTITION_PATH_COL_POS);
|
HOODIE_PARTITION_PATH_COL_POS);
|
||||||
if (!isReadColumnsSet) {
|
|
||||||
READ_COLUMN_IDS = configuration.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
|
|
||||||
READ_COLUMN_NAMES = configuration.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
|
|
||||||
isReadColumnsSet = true;
|
|
||||||
}
|
|
||||||
return configuration;
|
return configuration;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -240,10 +230,6 @@ public class HoodieRealtimeInputFormat extends HoodieInputFormat implements Conf
|
|||||||
"HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with "
|
"HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with "
|
||||||
+ split);
|
+ split);
|
||||||
|
|
||||||
// Reset the original column ids and names
|
|
||||||
job.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS);
|
|
||||||
job.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES);
|
|
||||||
|
|
||||||
return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job,
|
return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job,
|
||||||
super.getRecordReader(split, job, reporter));
|
super.getRecordReader(split, job, reporter));
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user