Anonymous
PicklingError: Could not serialize object: RecursionError — ошибка в коде PySpark в блокноте Jupyter
Сообщение
Anonymous » 24 ноя 2025, 18:59
Я новичок в Spark (точнее, только начал изучать) и столкнулся с ошибкой рекурсии в очень простом коде.
Справочная информация:
Spark версии 3.5.7
Java версии 11.0.29 (Eclipse Adoptium)
Python 3.14
Все они установлены на локальном компьютере (т. е. Spark запускается в «локальном режиме»).
И у меня есть следующий код в моем блокноте Jupyter.
Код: Выделить всё
#cell1
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType
#cell2
spark = SparkSession.builder.appName("Test").getOrCreate()
#cell3. Test1 (runs great, shows top 5 from myRange)
myRange = spark.range(1000).toDF("number")
myRange.show(5)
#cell4. Test2 (throws a recursion error)
myManualSchema = StructType([
StructField("some", StringType(), True),
StructField("col", StringType(), True),
StructField("names", LongType(), False)
])
myRow=Row("hello", None, 1)
df = spark.createDataFrame([myRow], myManualSchema)
По сути, создание простого DataFrame в Spark вызывает ошибку:
Код: Выделить всё
PicklingError Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = spark.createDataFrame([myRow], myManualSchema)
File C:\spark\spark_env\Lib\site-packages\pyspark\sql\session.py:1443, in SparkSession.createDataFrame(self, data, schema, samplingRatio, verifySchema)
1438 if has_pandas and isinstance(data, pd.DataFrame):
1439 # Create a DataFrame from pandas DataFrame.
1440 return super(SparkSession, self).createDataFrame( # type: ignore[call-overload]
1441 data, schema, samplingRatio, verifySchema
1442 )
-> 1443 return self._create_dataframe(
1444 data, schema, samplingRatio, verifySchema # type: ignore[arg-type]
1445 )
File C:\spark\spark_env\Lib\site-packages\pyspark\sql\session.py:1487, in SparkSession._create_dataframe(self, data, schema, samplingRatio, verifySchema)
1485 rdd, struct = self._createFromLocal(map(prepare, data), schema)
1486 assert self._jvm is not None
-> 1487 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
1488 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), struct.json())
1489 df = DataFrame(jdf, self)
File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:4918, in RDD._to_java_object_rdd(self)
4915 rdd = self._pickled()
4916 assert self.ctx._jvm is not None
-> 4918 return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True)
File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5470, in PipelinedRDD._jrdd(self)
5467 else:
5468 profiler = None
-> 5470 wrapped_func = _wrap_function(
5471 self.ctx, self.func, self._prev_jrdd_deserializer, self._jrdd_deserializer, profiler
5472 )
5474 assert self.ctx._jvm is not None
5475 python_rdd = self.ctx._jvm.PythonRDD(
5476 self._prev_jrdd.rdd(), wrapped_func, self.preservesPartitioning, self.is_barrier
5477 )
File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5268, in _wrap_function(sc, func, deserializer, serializer, profiler)
5266 assert serializer, "serializer should not be empty"
5267 command = (func, profiler, deserializer, serializer)
-> 5268 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
5269 assert sc._jvm is not None
5270 return sc._jvm.SimplePythonFunction(
5271 bytearray(pickled_command),
5272 env,
(...) 5277 sc._javaAccumulator,
5278 )
File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5251, in _prepare_for_python_RDD(sc, command)
5248 def _prepare_for_python_RDD(sc: "SparkContext", command: Any) -> Tuple[bytes, Any, Any, Any]:
5249 # the serialized command will be compressed by broadcast
5250 ser = CloudPickleSerializer()
-> 5251 pickled_command = ser.dumps(command)
5252 assert sc._jvm is not None
5253 if len(pickled_command) > sc._jvm.PythonUtils.getBroadcastThreshold(sc._jsc): # Default 1M
5254 # The broadcast will have same life cycle as created PythonRDD
File C:\spark\spark_env\Lib\site-packages\pyspark\serializers.py:469, in CloudPickleSerializer.dumps(self, obj)
467 msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
468 print_exec(sys.stderr)
--> 469 raise pickle.PicklingError(msg)
PicklingError: Could not serialize object: RecursionError: Stack overflow (used 2912 kB)
Будем благодарны за любую помощь или совет.
Подробнее здесь:
https://stackoverflow.com/questions/79828213/picklingerror-could-not-serialize-object-recursionerror-in-pyspark-code-in-jup
1763999946
Anonymous
Я новичок в Spark (точнее, только начал изучать) и столкнулся с ошибкой рекурсии в очень простом коде. [b]Справочная информация:[/b] [list] [*]Spark версии 3.5.7 [*]Java версии 11.0.29 (Eclipse Adoptium) [*]Python 3.14 [/list] Все они установлены на локальном компьютере (т. е. Spark запускается в «локальном режиме»). И у меня есть следующий код в моем блокноте Jupyter. [code] #cell1 from pyspark.sql import SparkSession from pyspark.sql import Row from pyspark.sql.types import StructField, StructType, StringType, LongType #cell2 spark = SparkSession.builder.appName("Test").getOrCreate() #cell3. Test1 (runs great, shows top 5 from myRange) myRange = spark.range(1000).toDF("number") myRange.show(5) #cell4. Test2 (throws a recursion error) myManualSchema = StructType([ StructField("some", StringType(), True), StructField("col", StringType(), True), StructField("names", LongType(), False) ]) myRow=Row("hello", None, 1) df = spark.createDataFrame([myRow], myManualSchema) [/code] По сути, создание простого искрового DataFrame вызывает ошибку: [code]PicklingError Traceback (most recent call last) Cell In[5], line 1 ----> 1 df = spark.createDataFrame([myRow], myManualSchema) File C:\spark\spark_env\Lib\site-packages\pyspark\sql\session.py:1443, in SparkSession.createDataFrame(self, data, schema, samplingRatio, verifySchema) 1438 if has_pandas and isinstance(data, pd.DataFrame): 1439 # Create a DataFrame from pandas DataFrame. 
1440 return super(SparkSession, self).createDataFrame( # type: ignore[call-overload] 1441 data, schema, samplingRatio, verifySchema 1442 ) -> 1443 return self._create_dataframe( 1444 data, schema, samplingRatio, verifySchema # type: ignore[arg-type] 1445 ) File C:\spark\spark_env\Lib\site-packages\pyspark\sql\session.py:1487, in SparkSession._create_dataframe(self, data, schema, samplingRatio, verifySchema) 1485 rdd, struct = self._createFromLocal(map(prepare, data), schema) 1486 assert self._jvm is not None -> 1487 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) 1488 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), struct.json()) 1489 df = DataFrame(jdf, self) File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:4918, in RDD._to_java_object_rdd(self) 4915 rdd = self._pickled() 4916 assert self.ctx._jvm is not None -> 4918 return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True) File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5470, in PipelinedRDD._jrdd(self) 5467 else: 5468 profiler = None -> 5470 wrapped_func = _wrap_function( 5471 self.ctx, self.func, self._prev_jrdd_deserializer, self._jrdd_deserializer, profiler 5472 ) 5474 assert self.ctx._jvm is not None 5475 python_rdd = self.ctx._jvm.PythonRDD( 5476 self._prev_jrdd.rdd(), wrapped_func, self.preservesPartitioning, self.is_barrier 5477 ) File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5268, in _wrap_function(sc, func, deserializer, serializer, profiler) 5266 assert serializer, "serializer should not be empty" 5267 command = (func, profiler, deserializer, serializer) -> 5268 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command) 5269 assert sc._jvm is not None 5270 return sc._jvm.SimplePythonFunction( 5271 bytearray(pickled_command), 5272 env, (...) 
5277 sc._javaAccumulator, 5278 ) File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5251, in _prepare_for_python_RDD(sc, command) 5248 def _prepare_for_python_RDD(sc: "SparkContext", command: Any) -> Tuple[bytes, Any, Any, Any]: 5249 # the serialized command will be compressed by broadcast 5250 ser = CloudPickleSerializer() -> 5251 pickled_command = ser.dumps(command) 5252 assert sc._jvm is not None 5253 if len(pickled_command) > sc._jvm.PythonUtils.getBroadcastThreshold(sc._jsc): # Default 1M 5254 # The broadcast will have same life cycle as created PythonRDD File C:\spark\spark_env\Lib\site-packages\pyspark\serializers.py:469, in CloudPickleSerializer.dumps(self, obj) 467 msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg) 468 print_exec(sys.stderr) --> 469 raise pickle.PicklingError(msg) PicklingError: Could not serialize object: RecursionError: Stack overflow (used 2912 kB) [/code] Будем благодарны за любую помощь или совет. Подробнее здесь: [url]https://stackoverflow.com/questions/79828213/picklingerror-could-not-serialize-object-recursionerror-in-pyspark-code-in-jup[/url]