I have an external partitioned table defined in the Glue catalog, with data stored in S3.
When I run MSCK REPAIR TABLE {table}, I'm able to add partitions to the table and query it in Athena, as expected.
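For reference, a minimal sketch of how the same repair can be run through the Athena API (database/table names taken from the DESCRIBE output further below; the OutputLocation bucket is a placeholder):

import boto3

athena = boto3.client("athena")
athena.start_query_execution(
    QueryString="MSCK REPAIR TABLE test",
    QueryExecutionContext={"Database": "bidb"},
    ResultConfiguration={"OutputLocation": "s3://my-athena-results/"},  # placeholder bucket
)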
But when I add a partition using the boto3 API, I get the following error when trying to query in Athena:
HIVE_UNKNOWN_ERROR: Can not create a Path from an empty string
I'm using the following code to add partitions:
import boto3

glue_conn = boto3.client("glue")
database_name = "bidb"
table_name = "test"

partitions = {
    "Values": ["2020", "07", "01"],
    "StorageDescriptor": {
        "Columns": [
            {"Name": "_hoodie_commit_time", "Type": "string"},
            {"Name": "_hoodie_commit_seqno", "Type": "string"},
            {"Name": "_hoodie_record_key", "Type": "string"},
            {"Name": "_hoodie_partition_path", "Type": "string"},
            {"Name": "_hoodie_file_name", "Type": "string"},
            # ...some more columns
        ],
        "InputFormat": "org.apache.hudi.hadoop.HoodieParquetInputFormat",
        "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
        "Compressed": False,
        "NumberOfBuckets": -1,
        "SerdeInfo": {
            "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
            "Parameters": {"serialization.format": "1"},
        },
    },
}

glue_conn.create_partition(
    DatabaseName=database_name,  # plain strings, not set literals like {database_name}
    TableName=table_name,
    PartitionInput=partitions,
)
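To sanity-check what Glue actually stored, the partition can be read back with get_partition; a minimal sketch (names as above), where the field worth checking first is Location, given the empty-path error:

import boto3

glue_conn = boto3.client("glue")
resp = glue_conn.get_partition(
    DatabaseName="bidb",
    TableName="test",
    PartitionValues=["2020", "07", "01"],
)
# The Athena error complains about an empty path, so print the
# partition's Location exactly as Glue stores it:
print(resp["Partition"]["StorageDescriptor"].get("Location"))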
I also ran DESCRIBE FORMATTED test PARTITION (year='2020', month='07', day='01');
and the output is identical for the two methods except for the CreateTime
field:
# col_name data_type comment
_hoodie_commit_time string
_hoodie_commit_seqno string
_hoodie_record_key string
_hoodie_partition_path string
_hoodie_file_name string
...some more columns
# Partition Information
# col_name data_type comment
year string
month string
day string
# Detailed Partition Information
Partition Value: [2020, 07, 01]
Database: bidb
Table: test
CreateTime: Tue Jan 12 12:08:44 UTC 2021
LastAccessTime: UNKNOWN
Location: s3://location
# Storage Information
SerDe Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
InputFormat: org.apache.hudi.hadoop.HoodieParquetInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
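In case DESCRIBE FORMATTED hides some field, a hypothetical sketch for diffing the raw Glue metadata of a partition added by MSCK against one added through the API (the second set of partition values is made up for illustration):

import boto3

glue = boto3.client("glue")

def storage_descriptor(values):
    # Fetch the raw StorageDescriptor that Glue stores for a partition.
    resp = glue.get_partition(
        DatabaseName="bidb", TableName="test", PartitionValues=values
    )
    return resp["Partition"]["StorageDescriptor"]

msck_sd = storage_descriptor(["2020", "07", "01"])  # added by MSCK REPAIR
api_sd = storage_descriptor(["2020", "07", "02"])   # added via create_partition (hypothetical values)
keys = set(msck_sd) | set(api_sd)
print({k: (msck_sd.get(k), api_sd.get(k)) for k in keys if msck_sd.get(k) != api_sd.get(k)})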
Table definition:
hive> show create table static_portfolio;
OK
CREATE EXTERNAL TABLE `static_portfolio`(
`_hoodie_commit_time` string,
`_hoodie_commit_seqno` string,
`_hoodie_record_key` string,
`_hoodie_partition_path` string,
`_hoodie_file_name` string,
...some more columns)
PARTITIONED BY (
`year` string,
`month` string,
`day` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hudi.hadoop.HoodieParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
's3://location'
TBLPROPERTIES (
'bucketing_version'='2',
'transient_lastDdlTime'='1610446119')