HIVE-28581: Support Partition Pruning stats optimization for Iceberg tables #5498

Open · wants to merge 7 commits into base: master
Changes from 1 commit
1 change: 1 addition & 0 deletions iceberg/checkstyle/checkstyle.xml
@@ -127,6 +127,7 @@
org.apache.iceberg.MetadataTableType.*,
org.apache.iceberg.SortDirection.*,
org.apache.iceberg.TableProperties.*,
org.apache.iceberg.SnapshotSummary.*,
org.apache.iceberg.types.Type.*,
org.apache.iceberg.types.Types.NestedField.*,
org.apache.parquet.schema.OriginalType.*,
iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -36,6 +36,7 @@
import java.util.Optional;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@@ -460,6 +461,7 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr
HiveTableUtil.appendFiles(fromURI, format, icebergTbl, isOverwrite, partitionSpec, conf);
}

@SuppressWarnings("checkstyle:CyclomaticComplexity")
@Override
public Map<String, String> getBasicStatistics(Partish partish) {
org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
@@ -471,6 +473,19 @@ public Map<String, String> getBasicStatistics(Partish partish) {
Map<String, String> summary = table.currentSnapshot().summary();
if (summary != null) {

if (Boolean.parseBoolean(summary.get(SnapshotSummary.PARTITION_SUMMARY_PROP))) {
String key = SnapshotSummary.CHANGED_PARTITION_PREFIX + partish.getPartition().getName();
Map<String, String> map = Maps.newHashMap();

StringTokenizer tokenizer = new StringTokenizer(summary.get(key), ",");
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
String[] keyValue = token.split("=");
map.put(keyValue[0], keyValue[1]);
}
summary = map;
}

if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) {
stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
}
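For context on the new branch above: when Iceberg includes partition summaries in a snapshot (enabled via `write.summary.partition-limit`, flagged by `SnapshotSummary.PARTITION_SUMMARY_PROP`), each changed partition gets its own summary entry under `SnapshotSummary.CHANGED_PARTITION_PREFIX`, whose value is a comma-separated list of `metric=value` pairs. A minimal standalone sketch of the same tokenizer-based decoding; the class name and sample value here are illustrative, not taken from the PR:

```java
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class PartitionSummaryParser {

  // Decodes one per-partition summary value, e.g.
  // "added-data-files=1,added-records=3,added-files-size=600",
  // into a metric -> value map, mirroring the loop in the patch.
  static Map<String, String> parse(String summaryValue) {
    Map<String, String> map = new HashMap<>();
    StringTokenizer tokenizer = new StringTokenizer(summaryValue, ",");
    while (tokenizer.hasMoreTokens()) {
      // split with limit 2 keeps a value intact even if it contains '='
      String[] keyValue = tokenizer.nextToken().split("=", 2);
      if (keyValue.length == 2) {
        map.put(keyValue[0], keyValue[1]);
      }
    }
    return map;
  }

  public static void main(String[] args) {
    System.out.println(parse("added-data-files=1,added-records=3,added-files-size=600"));
  }
}
```

The sketch splits with a limit of 2 as a defensive touch; the PR's loop assumes well-formed `metric=value` tokens, which matches what Iceberg writes for these entries.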
@@ -1979,11 +1994,18 @@ public List<Partition> getPartitions(org.apache.hadoop.hive.ql.metadata.Table ta
.map(partName -> {
Map<String, String> partSpecMap = Maps.newLinkedHashMap();
Warehouse.makeSpecFromName(partSpecMap, new Path(partName), null);
return new DummyPartition(table, partName, partSpecMap);
try {
return new DummyPartition(table, partName, partSpecMap);
} catch (HiveException e) {
throw new RuntimeException("Unable to construct name for dummy partition due to: ", e);
}
}).collect(Collectors.toList());
}

public boolean isPartitioned(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
if (hmsTable.getSd().getLocation() == null) {
return false;
}
Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
return table.spec().isPartitioned();
}
@@ -2000,7 +2022,7 @@ private Partition getPartitionImpl(org.apache.hadoop.hive.ql.metadata.Table tabl
try {
String partName = Warehouse.makePartName(partitionSpec, false);
return new DummyPartition(table, partName, partitionSpec);
} catch (MetaException e) {
} catch (MetaException | HiveException e) {
throw new SemanticException("Unable to construct name for dummy partition due to: ", e);
}
}
@@ -2099,6 +2121,9 @@ public boolean canPerformMetadataDelete(org.apache.hadoop.hive.ql.metadata.Table

@Override
public List<FieldSchema> getPartitionKeys(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
if (hmsTable.getSd().getLocation() == null) {
return null;
}
Table icebergTable = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
return IcebergTableUtil.getPartitionKeys(icebergTable, icebergTable.spec().specId());
}
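These null-location guards (here and in `isPartitioned` above) presumably cover tables whose storage descriptor has no location yet, such as a CTAS target that has not been materialized, where loading the Iceberg table metadata would fail.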
@@ -2129,7 +2154,12 @@ public List<Partition> getPartitionsByExpr(org.apache.hadoop.hive.ql.metadata.Ta
String partName = spec.partitionToPath(partitionData);
Map<String, String> partSpecMap = Maps.newLinkedHashMap();
Warehouse.makeSpecFromName(partSpecMap, new Path(partName), null);
DummyPartition partition = new DummyPartition(hmsTable, partName, partSpecMap);
DummyPartition partition;
try {
partition = new DummyPartition(hmsTable, partName, partSpecMap);
} catch (HiveException e) {
throw new RuntimeException("Unable to construct name for dummy partition due to: ", e);
}
partitions.add(partition);
}
});
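Both `getPartitions` and this `getPartitionsByExpr` path build a `DummyPartition` from an Iceberg partition path via `Warehouse.makeSpecFromName`. A rough standalone sketch of that name-to-spec decomposition (hypothetical class; the real Hive utility also unescapes path-encoded characters):

```java
import java.util.LinkedHashMap;
import java.util.Map;

public class PartNameToSpec {

  // Turns "year=2023/month=10/day=3" into {year=2023, month=10, day=3},
  // preserving component order like the LinkedHashMap in the patch.
  static Map<String, String> specFromName(String partName) {
    Map<String, String> spec = new LinkedHashMap<>();
    for (String component : partName.split("/")) {
      String[] kv = component.split("=", 2);
      if (kv.length == 2) {
        spec.put(kv[0], kv[1]);
      }
    }
    return spec;
  }

  public static void main(String[] args) {
    System.out.println(specFromName("year=2023/month=10/day=3"));
  }
}
```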
@@ -0,0 +1,17 @@
set hive.explain.user=false;
set hive.fetch.task.conversion=none;
set hive.analyze.stmt.collect.partlevel.stats=false;

create external table ice01 (`i` int, `t` timestamp)
partitioned by (year int, month int, day int)
stored by iceberg tblproperties ('format-version'='2', 'write.summary.partition-limit'='10');

insert into ice01 (i, year, month, day) values
(1, 2023, 10, 3),
(2, 2023, 10, 3),
(2, 2023, 10, 3),
(3, 2023, 10, 4),
(4, 2023, 10, 4);

explain
select i from ice01 where year=2023 and month = 10 and day = 3;
@@ -0,0 +1,78 @@
PREHOOK: query: create external table ice01 (`i` int, `t` timestamp)
partitioned by (year int, month int, day int)
stored by iceberg tblproperties ('format-version'='2', 'write.summary.partition-limit'='10')
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice01
POSTHOOK: query: create external table ice01 (`i` int, `t` timestamp)
partitioned by (year int, month int, day int)
stored by iceberg tblproperties ('format-version'='2', 'write.summary.partition-limit'='10')
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice01
PREHOOK: query: insert into ice01 (i, year, month, day) values
(1, 2023, 10, 3),
(2, 2023, 10, 3),
(2, 2023, 10, 3),
(3, 2023, 10, 4),
(4, 2023, 10, 4)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@ice01
POSTHOOK: query: insert into ice01 (i, year, month, day) values
(1, 2023, 10, 3),
(2, 2023, 10, 3),
(2, 2023, 10, 3),
(3, 2023, 10, 4),
(4, 2023, 10, 4)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@ice01
PREHOOK: query: explain
select i from ice01 where year=2023 and month = 10 and day = 3
PREHOOK: type: QUERY
PREHOOK: Input: default@ice01
PREHOOK: Input: default@ice01@year=2023/month=10/day=3
PREHOOK: Output: hdfs://### HDFS PATH ###
POSTHOOK: query: explain
select i from ice01 where year=2023 and month = 10 and day = 3
POSTHOOK: type: QUERY
POSTHOOK: Input: default@ice01
POSTHOOK: Input: default@ice01@year=2023/month=10/day=3
POSTHOOK: Output: hdfs://### HDFS PATH ###
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1

STAGE PLANS:
Stage: Stage-1
Tez
#### A masked pattern was here ####
Vertices:
Map 1
Map Operator Tree:
TableScan
alias: ice01
filterExpr: ((year = 2023) and (month = 10) and (day = 3)) (type: boolean)
Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: ((year = 2023) and (month = 10) and (day = 3)) (type: boolean)
Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: i (type: int)
outputColumnNames: _col0
Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink
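Note that the plan reports `Num rows: 3` for the scan of `year=2023/month=10/day=3`, matching the three rows inserted into that partition rather than the five rows in the whole table, which indicates the planner picked up the per-partition snapshot summary instead of the table-level totals.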

1 change: 1 addition & 0 deletions iceberg/patched-iceberg-core/pom.xml
@@ -96,6 +96,7 @@
<excludes>
**/HadoopInputFile.class
**/HadoopTableOperations.class
**/SnapshotProducer.class
</excludes>
</artifactItem>
</artifactItems>
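For background: `patched-iceberg-core` repackages `iceberg-core` minus the classes Hive overrides with patched copies, so adding `SnapshotProducer.class` to the excludes suggests the PR ships a patched `SnapshotProducer` to go with the partition-summary handling.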