HIVE-28581: Support Partition Pruning stats optimization for Iceberg tables #5498

Open · wants to merge 7 commits into base: master
Changes from 1 commit
1 change: 1 addition & 0 deletions iceberg/checkstyle/checkstyle.xml
@@ -127,6 +127,7 @@
org.apache.iceberg.MetadataTableType.*,
org.apache.iceberg.SortDirection.*,
org.apache.iceberg.TableProperties.*,
org.apache.iceberg.SnapshotSummary.*,
org.apache.iceberg.types.Type.*,
org.apache.iceberg.types.Types.NestedField.*,
org.apache.parquet.schema.OriginalType.*,
iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -36,6 +36,7 @@
import java.util.Optional;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@@ -460,6 +461,7 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr
HiveTableUtil.appendFiles(fromURI, format, icebergTbl, isOverwrite, partitionSpec, conf);
}

@SuppressWarnings("checkstyle:CyclomaticComplexity")
@Override
public Map<String, String> getBasicStatistics(Partish partish) {
org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
@@ -471,6 +473,19 @@ public Map<String, String> getBasicStatistics(Partish partish) {
Map<String, String> summary = table.currentSnapshot().summary();
if (summary != null) {

if (Boolean.parseBoolean(summary.get(SnapshotSummary.PARTITION_SUMMARY_PROP))) {
String key = SnapshotSummary.CHANGED_PARTITION_PREFIX + partish.getPartition().getName();
Map<String, String> map = Maps.newHashMap();

StringTokenizer tokenizer = new StringTokenizer(summary.get(key), ",");
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
String[] keyValue = token.split("=");
map.put(keyValue[0], keyValue[1]);
}
summary = map;
}

if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) {
stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
}
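For context on the new branch above: when Iceberg includes partition summaries in a snapshot (enabled via `write.summary.partition-limit`, flagged by `SnapshotSummary.PARTITION_SUMMARY_PROP`), each changed partition gets its own summary entry under `SnapshotSummary.CHANGED_PARTITION_PREFIX`, whose value is a comma-separated list of `metric=value` pairs. A minimal standalone sketch of the same tokenizer-based decoding; the class name and sample value here are illustrative, not taken from the PR:

```java
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class PartitionSummaryParser {

  // Decodes one per-partition summary value, e.g.
  // "added-data-files=1,added-records=3,added-files-size=600",
  // into a metric -> value map, mirroring the loop in the patch.
  static Map<String, String> parse(String summaryValue) {
    Map<String, String> map = new HashMap<>();
    StringTokenizer tokenizer = new StringTokenizer(summaryValue, ",");
    while (tokenizer.hasMoreTokens()) {
      // split with limit 2 keeps a value intact even if it contains '='
      String[] keyValue = tokenizer.nextToken().split("=", 2);
      if (keyValue.length == 2) {
        map.put(keyValue[0], keyValue[1]);
      }
    }
    return map;
  }

  public static void main(String[] args) {
    System.out.println(parse("added-data-files=1,added-records=3,added-files-size=600"));
  }
}
```

The sketch splits with a limit of 2 as a defensive touch; the PR's loop assumes well-formed `metric=value` tokens, which matches what Iceberg writes for these entries.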
@@ -1979,11 +1994,18 @@ public List<Partition> getPartitions(org.apache.hadoop.hive.ql.metadata.Table ta
.map(partName -> {
Map<String, String> partSpecMap = Maps.newLinkedHashMap();
Warehouse.makeSpecFromName(partSpecMap, new Path(partName), null);
return new DummyPartition(table, partName, partSpecMap);
try {
return new DummyPartition(table, partName, partSpecMap);
} catch (HiveException e) {
throw new RuntimeException("Unable to construct name for dummy partition due to: ", e);
}
}).collect(Collectors.toList());
}

public boolean isPartitioned(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
if (hmsTable.getSd().getLocation() == null) {
return false;
}
Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
return table.spec().isPartitioned();
}
@@ -2000,7 +2022,7 @@ private Partition getPartitionImpl(org.apache.hadoop.hive.ql.metadata.Table tabl
try {
String partName = Warehouse.makePartName(partitionSpec, false);
return new DummyPartition(table, partName, partitionSpec);
} catch (MetaException e) {
} catch (MetaException | HiveException e) {
throw new SemanticException("Unable to construct name for dummy partition due to: ", e);
}
}
@@ -2099,6 +2121,9 @@ public boolean canPerformMetadataDelete(org.apache.hadoop.hive.ql.metadata.Table

@Override
public List<FieldSchema> getPartitionKeys(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
if (hmsTable.getSd().getLocation() == null) {
return null;
}
Table icebergTable = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
return IcebergTableUtil.getPartitionKeys(icebergTable, icebergTable.spec().specId());
}
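These null-location guards (here and in `isPartitioned` above) presumably cover tables whose storage descriptor has no location yet, such as a CTAS target that has not been materialized, where loading the Iceberg table metadata would fail.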
@@ -2129,7 +2154,12 @@ public List<Partition> getPartitionsByExpr(org.apache.hadoop.hive.ql.metadata.Ta
String partName = spec.partitionToPath(partitionData);
Map<String, String> partSpecMap = Maps.newLinkedHashMap();
Warehouse.makeSpecFromName(partSpecMap, new Path(partName), null);
DummyPartition partition = new DummyPartition(hmsTable, partName, partSpecMap);
DummyPartition partition;
try {
partition = new DummyPartition(hmsTable, partName, partSpecMap);
} catch (HiveException e) {
throw new RuntimeException("Unable to construct name for dummy partition due to: ", e);
}
partitions.add(partition);
}
});
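Both `getPartitions` and this `getPartitionsByExpr` path build a `DummyPartition` from an Iceberg partition path via `Warehouse.makeSpecFromName`. A rough standalone sketch of that name-to-spec decomposition (hypothetical class; the real Hive utility also unescapes path-encoded characters):

```java
import java.util.LinkedHashMap;
import java.util.Map;

public class PartNameToSpec {

  // Turns "year=2023/month=10/day=3" into {year=2023, month=10, day=3},
  // preserving component order like the LinkedHashMap in the patch.
  static Map<String, String> specFromName(String partName) {
    Map<String, String> spec = new LinkedHashMap<>();
    for (String component : partName.split("/")) {
      String[] kv = component.split("=", 2);
      if (kv.length == 2) {
        spec.put(kv[0], kv[1]);
      }
    }
    return spec;
  }

  public static void main(String[] args) {
    System.out.println(specFromName("year=2023/month=10/day=3"));
  }
}
```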
@@ -0,0 +1,17 @@
set hive.explain.user=false;
set hive.fetch.task.conversion=none;
set hive.analyze.stmt.collect.partlevel.stats=false;

create external table ice01 (`i` int, `t` timestamp)
partitioned by (year int, month int, day int)
stored by iceberg tblproperties ('format-version'='2', 'write.summary.partition-limit'='10');

insert into ice01 (i, year, month, day) values
(1, 2023, 10, 3),
(2, 2023, 10, 3),
(2, 2023, 10, 3),
(3, 2023, 10, 4),
(4, 2023, 10, 4);

explain
select i from ice01 where year=2023 and month = 10 and day = 3;
@@ -0,0 +1,78 @@
PREHOOK: query: create external table ice01 (`i` int, `t` timestamp)
partitioned by (year int, month int, day int)
stored by iceberg tblproperties ('format-version'='2', 'write.summary.partition-limit'='10')
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice01
POSTHOOK: query: create external table ice01 (`i` int, `t` timestamp)
partitioned by (year int, month int, day int)
stored by iceberg tblproperties ('format-version'='2', 'write.summary.partition-limit'='10')
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 (i, year, month, day) values
(1, 2023, 10, 3),
(2, 2023, 10, 3),
(2, 2023, 10, 3),
(3, 2023, 10, 4),
(4, 2023, 10, 4)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 (i, year, month, day) values
(1, 2023, 10, 3),
(2, 2023, 10, 3),
(2, 2023, 10, 3),
(3, 2023, 10, 4),
(4, 2023, 10, 4)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: explain
select i from ice01 where year=2023 and month = 10 and day = 3
PREHOOK: type: QUERY
PREHOOK: Input: default@ice01
PREHOOK: Input: default@ice01@year=2023/month=10/day=3
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: explain
select i from ice01 where year=2023 and month = 10 and day = 3
POSTHOOK: type: QUERY
POSTHOOK: Input: default@ice01
POSTHOOK: Input: default@ice01@year=2023/month=10/day=3
POSTHOOK: Output: hdfs://### HDFS PATH ###
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1

STAGE PLANS:
Stage: Stage-1
Tez
#### A masked pattern was here ####
Vertices:
Map 1
Map Operator Tree:
TableScan
alias: ice01
filterExpr: ((year = 2023) and (month = 10) and (day = 3)) (type: boolean)
Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: ((year = 2023) and (month = 10) and (day = 3)) (type: boolean)
Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: i (type: int)
outputColumnNames: _col0
Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink
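Note that the plan reports `Num rows: 3` for the scan of `year=2023/month=10/day=3`, matching the three rows inserted into that partition rather than the five rows in the whole table, which indicates the planner picked up the per-partition snapshot summary instead of the table-level totals.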

1 change: 1 addition & 0 deletions iceberg/patched-iceberg-core/pom.xml
@@ -96,6 +96,7 @@
<excludes>
**/HadoopInputFile.class
**/HadoopTableOperations.class
**/SnapshotProducer.class
</excludes>
</artifactItem>
</artifactItems>
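For background: `patched-iceberg-core` repackages `iceberg-core` minus the classes Hive overrides with patched copies, so adding `SnapshotProducer.class` to the excludes suggests the PR ships a patched `SnapshotProducer` to go with the partition-summary handling.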