Hello I am getting below error on my EMR zepplin notebook using pyspark to write hudi data. I read multiple stackoverflow post and as explained I tried with libfb303-0.9.3.jar, libthrift-0.9.3.jar
and libfb303-0.9.2.jar, libthrift-0.9.2.jar
independently but still getting below error. I am following this AWS Hudi example and added apache hudi jars to the cluster as mentioned in it.
%pyspark
inputDF = spark.createDataFrame(
[
("100", "2015-01-01", "2015-01-01T13:51:39.340396Z"),
("101", "2015-01-01", "2015-01-01T12:14:58.597216Z"),
("102", "2015-01-01", "2015-01-01T13:51:40.417052Z"),
("103", "2015-01-01", "2015-01-01T13:51:40.519832Z"),
("104", "2015-01-02", "2015-01-01T12:15:00.512679Z"),
("105", "2015-01-02", "2015-01-01T13:51:42.248818Z"),
],
["id", "creation_date", "last_update_time"]
)
External spark dependencies in my session:
%pyspark
from pyspark.sql import SparkSession
import calendar
app_name = "soundbar"
spark = SparkSession.builder.appName(app_name).getOrCreate()
sc = spark.sparkContext
print(spark.sparkContext._jsc.sc().listJars())
ArrayBuffer(spark://ip-172-30-5-107.ec2.internal:38487/jars/libfb303-0.9.3.jar, spark://ip-172-30-5-107.ec2.internal:38487/jars/spark-interpreter-0.8.1.jar, spark://ip-172-30-5-107.ec2.internal:38487/jars/spark-avro_2.11-2.4.7.jar, spark://ip-172-30-5-107.ec2.internal:38487/jars/libthrift-0.9.3.jar, spark://ip-172-30-5-107.ec2.internal:38487/jars/hudi-spark-bundle_2.11-0.6.0.jar, spark://ip-172-30-5-107.ec2.internal:38487/jars/hudi-hive-sync-bundle-0.6.0.jar)
%pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os
import re
inputDF.write.format('hudi').option('hoodie.datasource.write.operation', 'insert').options(**hudiOptions).mode('overwrite').save('s3://vd-dev-smarttvdata-di2-testing-virginia/anup-di2-dev/data/hudi/')
Py4JJavaError: An error occurred while calling o158.save.
: java.lang.NoSuchMethodError: com.facebook.fb303.FacebookService$Client.sendBaseOneway(Ljava/lang/String;Lorg/apache/thrift/TBase;)V
at com.facebook.fb303.FacebookService$Client.send_shutdown(FacebookService.java:436)
at com.facebook.fb303.FacebookService$Client.shutdown(FacebookService.java:430)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.close(HiveMetaStoreClient.java:492)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:152)
at com.sun.proxy.$Proxy86.close(Unknown Source)
at org.apache.hadoop.hive.ql.metadata.Hive.close(Hive.java:292)
at org.apache.hadoop.hive.ql.metadata.Hive.access$000(Hive.java:138)
at org.apache.hadoop.hive.ql.metadata.Hive$1.remove(Hive.java:158)
at org.apache.hadoop.hive.ql.metadata.Hive.closeCurrent(Hive.java:262)
at org.apache.hadoop.hive.ql.metadata.Hive.get(Hive.java:232)
at org.apache.hadoop.hive.ql.metadata.Hive.get(Hive.java:209)
at org.apache.hudi.hive.HoodieHiveClient.<init>(HoodieHiveClient.java:98)
at org.apache.hudi.hive.HiveSyncTool.<init>(HiveSyncTool.java:66)
at org.apache.hudi.HoodieSparkSqlWriter$.org$apache$hudi$HoodieSparkSqlWriter$$syncHive(HoodieSparkSqlWriter.scala:321)
at org.apache.hudi.HoodieSparkSqlWriter$$anonfun$metaSync$2.apply(HoodieSparkSqlWriter.scala:363)
at org.apache.hudi.HoodieSparkSqlWriter$$anonfun$metaSync$2.apply(HoodieSparkSqlWriter.scala:359)
at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
at org.apache.hudi.HoodieSparkSqlWriter$.metaSync(HoodieSparkSqlWriter.scala:359)
at org.apache.hudi.HoodieSparkSqlWriter$.commitAndPerformPostOperations(HoodieSparkSqlWriter.scala:417)
at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:205)
at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:125)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:156)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
(<class 'py4j.protocol.Py4JJavaError'>, Py4JJavaError(u'An error occurred while calling o158.save.
', JavaObject id=o159), <traceback object at 0x7fd5cb633830>)