Я могу сделать это с помощью Pyspark, пример кода приведен ниже:
Код: Выделить всё
from pyspark.sql import SparkSession
from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Create the SparkSession — the original snippet referenced `spark` without
# ever creating it, which raises NameError when run as a standalone script.
spark = SparkSession.builder.appName("xgboost-incremental-training").getOrCreate()

# Load the previously trained XGBoost model (a single pipeline stage that was
# saved earlier under .../stages/...).
existing_model_path = "/home/geet/test/xgboostexp/testmodel/stages/2_SparkXGBClassifier_5adc1ebbc456"
xgb_model = SparkXGBClassifierModel.load(existing_model_path)

# Read the new training data (CSV with a header row; all columns arrive as strings).
new_data_path = "/home/geet/test/xgboostexp/output_file.csv"
data = spark.read.option("header", True).csv(new_data_path)

# Cast the feature columns from string to float so VectorAssembler accepts them.
for col_name in ["V1", "V2", "V3"]:
    data = data.withColumn(col_name, col(col_name).cast('float'))

# Encode the string label column into numeric indices expected by the classifier.
label_indexer = StringIndexer(inputCol="class", outputCol="indexedLabel").fit(data)

# Assemble the feature columns into the single vector column XGBoost trains on.
assembler = VectorAssembler(inputCols=["V1", "V2", "V3"], outputCol="features")

xgb_classifier = SparkXGBClassifier(
    label_col="indexedLabel",
    features_col="features",
    num_workers=1,
    xgb_model=xgb_model.get_booster()  # continue training from the previous booster
)

# Fit the full pipeline (indexing -> assembling -> boosted training) on the new data.
pipeline = Pipeline(stages=[label_indexer, assembler, xgb_classifier])
pipeline_model = pipeline.fit(data)

# Persist the updated model. The original code called .overwrite() but never
# .save(...), so nothing was ever written to disk and updated_model_path was unused.
updated_model_path = "/home/geet/test/xgboostexp/testmodel2"
pipeline_model.write().overwrite().save(updated_model_path)
Подробнее здесь: https://stackoverflow.com/questions/790 ... l-pipeline