Code: Select all
import os
os.environ['SPARK_HOME'] = r'C:\Users\zulki\AppData\Local\Programs\Spark'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'notebook'
os.environ['PYSPARK_DRIVER_PYTHON'] = r"C:\Users\zulki\AppData\Local\Programs\Python\Python311\python.exe"
os.environ['PYSPARK_PYTHON'] = r"C:\Users\zulki\AppData\Local\Programs\Python\Python311\python.exe"
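These paths are specific to my machine, so before building the session I also run a quick sanity check that they actually exist (just a small sketch of my own, nothing Spark-specific):

Code: Select all
import os

# Sanity check (my own addition): verify the configured locations exist on this machine.
for var in ('SPARK_HOME', 'PYSPARK_DRIVER_PYTHON', 'PYSPARK_PYTHON'):
    path = os.environ[var]
    print(var, '->', path, 'exists:', os.path.exists(path))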
Code: Select all
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("RDD-Spark-Demo") \
    .getOrCreate()
Code: Select all
data = [('Alice', 25), ('Bob', 30), ('Charlie', 35), ('Alice', 40)]
rdd2 = spark.sparkContext.parallelize(data)
print(rdd2.collect())
Code: Select all
rdd2.saveAsTextFile("output.txt")
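A side note on this call: as far as I understand, saveAsTextFile creates a directory with this name (containing part-* files) rather than a single text file, and it fails if the path already exists. So in the actual notebook cell I delete any previous output first, roughly like this (the isdir/rmtree branch is an assumption on my part, since the output is a directory); the error below happens either way:

Code: Select all
import os
import shutil

output_path = "output.txt"  # Spark creates a directory with this name, not a single file
# saveAsTextFile refuses to overwrite an existing path, so clean up any previous run first.
if os.path.isdir(output_path):
    shutil.rmtree(output_path)
elif os.path.exists(output_path):
    os.remove(output_path)
rdd2.saveAsTextFile(output_path)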
Code: Select all
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Cell In[37], line 6
4 if os.path.exists(output_path):
5 os.remove(output_path) # Delete the file if it exists
----> 6 rdd2.saveAsTextFile(output_path)
File c:\Users\zulki\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py:3425, in RDD.saveAsTextFile(self, path, compressionCodecClass)
3423 keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path, compressionCodec)
3424 else:
-> 3425 keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
File c:\Users\zulki\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317 self.command_header +\
1318 args_command +\
1319 proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File c:\Users\zulki\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\errors\exceptions\captured.py:179, in capture_sql_exception.<locals>.deco(*a, **kw)
177 def deco(*a: Any, **kw: Any) -> Any:
178 try:
--> 179 return f(*a, **kw)
180 except Py4JJavaError as e:
181 converted = convert_exception(e.java_exception)
File c:\Users\zulki\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling o411.saveAsTextFile.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
at org.apache.hadoop.mapred.FileOutputCommitter.setupJob(FileOutputCommitter.java:131)
at org.apache.hadoop.mapred.OutputCommitter.setupJob(OutputCommitter.java:265)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:79)
at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1091)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1089)
at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1062)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1027)
at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1009)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1008)
at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:965)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:963)
at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1623)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1623)
at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1609)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1609)
at org.apache.spark.api.java.JavaRDDLike.saveAsTextFile(JavaRDDLike.scala:564)
at org.apache.spark.api.java.JavaRDDLike.saveAsTextFile$(JavaRDDLike.scala:563)
at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:568)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
at org.apache.spark.util.Utils$.createTempDir(Utils.scala:242)
at org.apache.spark.util.SparkFileUtils.createTempDir(SparkFileUtils.scala:103)
at org.apache.spark.util.SparkFileUtils.createTempDir$(SparkFileUtils.scala:102)
at org.apache.spark.util.Utils$.createTempDir(Utils.scala:94)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:372)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:964)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:194)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:217)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1120)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1129)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
... 25 more
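Judging by the last lines, the root cause is that HADOOP_HOME / hadoop.home.dir are not set. As far as I understand from the wiki link in the error, on Windows Spark also needs a Hadoop folder containing winutils.exe, so I assume something along these lines is required before the SparkSession is created (the path below is just where I unpacked Hadoop; I'm not sure this alone is enough):

Code: Select all
import os

# Assumption: Hadoop was unpacked here (the same folder that holds etc\hadoop\hadoop-env.cmd);
# adjust to the actual install directory.
hadoop_home = r'C:\Users\zulki\AppData\Local\Programs\Hadoop'
os.environ['HADOOP_HOME'] = hadoop_home
# winutils.exe is expected in %HADOOP_HOME%\bin, and that folder must be on PATH.
os.environ['PATH'] = os.path.join(hadoop_home, 'bin') + os.pathsep + os.environ['PATH']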
So I watched a video on YouTube, downloaded Hadoop and set up its XML config files the way the video shows.
After finishing the video I followed it step by step and ran `hdfs namenode -format` to check whether it works, but it doesn't:
Code: Select all
C:\Users\zulki>hdfs namenode -format
The system cannot find the path specified.
Error: JAVA_HOME is incorrectly set.
Please update C:\Users\zulki\AppData\Local\Programs\Hadoop\etc\hadoop\hadoop-env.cmd
'-Dhadoop.security.logger' is not recognized as an internal or external command,
operable program or batch file.
Code: Select all
@rem The java implementation to use. Required.
set JAVA_HOME=C:\Program Files\Java\jdk-17
@rem The jsvc implementation to use. Jsvc is required to run secure datanodes.
@rem set JSVC_HOME=%JSVC_HOME%
The reason I'm learning Spark at all is that I have a Python extraction script. Initially it took 2 hours to run. Because that's so long, I added the concurrent.futures module and cut the run time to 17 minutes. But now I want to run it in AWS Lambda, which has a 15-minute timeout, so I figured that with PySpark it would (hopefully) finish in under 15 minutes. I just want to test that.
More details here: https://stackoverflow.com/questions/786 ... ctly-on-wi