For training, TensorFlow 2.7.0 (a Python autoencoder) is used to produce .h5 models, which are converted to .onnx model files and then to .engine files for the Jetson platform (Jetson AGX Xavier, CUDA).
Jetson AGX Xavier specifications:
CUDA: 11.4.315
cuDNN: 8.6.0
TensorRT: 8.5.2.2
JetPack: 5.1.3
python3 -c "import tensorflow as tf; print('TensorFlow version:', tf.__version__)"
TensorFlow version: 2.11.0
Python autoencoder training script (example):
Code: Select all
input_img = tf.keras.layers.Input(shape=(2000, lines))
# Encoder
x = tf.keras.layers.Conv1D(12, 128, padding='same')(input_img)
x = tf.keras.layers.MaxPooling1D(4)(x) # Downsample: 2000 -> 500
x = tf.keras.layers.Conv1D(12, 64, padding='same')(x)
x = tf.keras.layers.MaxPooling1D(2)(x) # Downsample: 500 -> 250
x = tf.keras.layers.Conv1D(12, 16, padding='same')(x)
x = tf.keras.layers.MaxPooling1D(2)(x) # Downsample: 250 -> 125
# Bottleneck
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(self.__config['MODEL']['ENCODED_STATE_SIZE'])(x)
# Decoder
x = tf.keras.layers.Dense(125 * 12)(x) # Expand to match last encoder feature size
x = tf.keras.layers.Reshape((125, 12))(x)
x = tf.keras.layers.UpSampling1D(2)(x) # Upsample: 125 -> 250
x = tf.keras.layers.Conv1D(12, 16, padding='same')(x)
x = tf.keras.layers.UpSampling1D(2)(x) # Upsample: 250 -> 500
x = tf.keras.layers.Conv1D(12, 64, padding='same')(x)
x = tf.keras.layers.UpSampling1D(4)(x) # Upsample: 500 -> 2000
x = tf.keras.layers.Conv1D(lines, 128, padding='same')(x) # Correct Final Layer
# Model definition
self.__model = tf.keras.models.Model(input_img, x)
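The snippet only defines the architecture; for completeness, a minimal sketch of how such a model could be compiled, trained, and saved to .h5 (the loss, optimizer, epoch count, and file name here are assumptions, not my exact settings):
Code: Select all
# Minimal training/saving sketch - hypothetical settings and placeholder data
self.__model.compile(optimizer='adam', loss='mse')    # reconstruction loss for the autoencoder
self.__model.fit(train_data, train_data,              # autoencoder: the input is also the target
                 epochs=100, batch_size=32, validation_split=0.1)
self.__model.save('autoencoder.h5')                   # this .h5 file is later converted to ONNX / TensorRT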
Below you can see two comparison charts with the inference output values.

Don't assume the data could be corrupted: I collected enough training data in both cases and verified its validity.
What is confusing is that inference works in Python with TensorFlow 2.7.0, on the GPU, Ubuntu Focal x86_64... I mean that I see different values on the two charts.
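For reference, the Python-side check can be reproduced roughly like this on the same preprocessed input that the C++ code builds (FFT magnitudes divided by 4000, DC bin zeroed); the file name and the x_mag/y_mag/z_mag arrays here are placeholders:
Code: Select all
import numpy as np
import tensorflow as tf

model = tf.keras.models.load_model('autoencoder.h5')      # hypothetical path

# x_mag, y_mag, z_mag: FFT magnitude arrays of length 2000, already divided by 4000
sample = np.stack([x_mag, y_mag, z_mag], axis=-1).astype(np.float32)  # shape (2000, 3)
sample[0, :] = 0.0                                         # zero the DC component, as in the C++ code
predicted = model.predict(sample[np.newaxis, ...])         # model output, to compare with the TensorRT result
print(predicted.shape, predicted.ravel()[:5])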
On the Jetson, I created a Python script to convert the .h5 model file to .onnx and then to .engine format:
Code: Select all
import tf2onnx
import tensorflow as tf
import argparse
import subprocess

def convert_h5_to_onnx(h5_model_path, onnx_model_path):
    print("Converting .h5 model to ONNX...")
    model = tf.keras.models.load_model(h5_model_path)
    model_proto, _ = tf2onnx.convert.from_keras(model, opset=13)
    with open(onnx_model_path, "wb") as f:
        f.write(model_proto.SerializeToString())
    print(f"ONNX model saved at {onnx_model_path}")

def convert_onnx_to_trt(onnx_model_path, engine_model_path, trt_precision_mode):
    print("Converting ONNX model to TensorRT Engine...")
    fp_precision_flag = '--fp16' if trt_precision_mode.upper() == 'FP16' else ''
    trtexec_path = "/usr/src/tensorrt/bin/trtexec"
    command = f"{trtexec_path} --onnx={onnx_model_path} --saveEngine={engine_model_path} {fp_precision_flag}"
    process = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if process.returncode != 0:
        print(f"Error in converting to TensorRT engine:\n{process.stderr.decode('utf-8')}")
    else:
        print(f"TensorRT engine saved at {engine_model_path}")

# Main
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert a .h5 model to ONNX and TensorRT engine format")
    parser.add_argument("--h5_model_path", type=str, required=True, help="Path to the .h5 model file")
    parser.add_argument("--onnx_model_path", type=str, required=True, help="Path to save the converted ONNX model")
    parser.add_argument("--engine_model_path", type=str, required=True, help="Path to save the converted TensorRT engine")
    parser.add_argument("--trt_precision_mode", type=str, choices=['FP32', 'FP16'], default="FP16", help="Precision mode for TensorRT engine (FP32 or FP16)")
    args = parser.parse_args()
    convert_h5_to_onnx(args.h5_model_path, args.onnx_model_path)
    convert_onnx_to_trt(args.onnx_model_path, args.engine_model_path, args.trt_precision_mode)
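An example invocation (the script and file names are placeholders); FP32 can also be selected here to rule out FP16 precision loss:
Code: Select all
python3 convert_model.py --h5_model_path model.h5 --onnx_model_path model.onnx --engine_model_path model.engine --trt_precision_mode FP32
Inference on the Jetson is then done in C++ with the TensorRT API; the RunInference function looks like this: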
Code: Select all
void RunInference(ICudaEngine* engine, IExecutionContext* context, int input_index, int output_index, kiss_fft_cpx* x_fft, kiss_fft_cpx* y_fft, kiss_fft_cpx* z_fft, float* predicted_output, int g_code, const char* clientName) {
    int batchSize = 1;
    int input_size = batchSize * 2000 * 3 * sizeof(float); // [1, 2000, 3]
    int output_size = batchSize * 3 * sizeof(float);       // [1, 3]

    // Prepare normalized input data and set DC component to zero
    float input_data[2000 * 3];
    const int MN = 4000;
    for (int i = 0; i < 2000; i++) {
        input_data[i * 3 + 0] = sqrt(x_fft[i].r * x_fft[i].r + x_fft[i].i * x_fft[i].i) / MN;
        input_data[i * 3 + 1] = sqrt(y_fft[i].r * y_fft[i].r + y_fft[i].i * y_fft[i].i) / MN;
        input_data[i * 3 + 2] = sqrt(z_fft[i].r * z_fft[i].r + z_fft[i].i * z_fft[i].i) / MN;
    }

    // Set DC component to zero
    input_data[0] = 0; // X-axis
    input_data[1] = 0; // Y-axis
    input_data[2] = 0; // Z-axis

    // Allocate GPU buffers for input and output
    void* buffers[2];
    write_log(LOG_DEBUG, "RunInference for '%s' - input_index = %d, output_index = %d", clientName, input_index, output_index);
    if (cudaMalloc(&buffers[input_index], input_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to allocate memory for input buffer", clientName);
        return;
    }
    if (cudaMalloc(&buffers[output_index], output_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to allocate memory for output buffer", clientName);
        cudaFree(buffers[input_index]);
        return;
    }
    if (cudaMemset(buffers[input_index], 0, input_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to memset input buffer to zero", clientName);
        cudaFree(buffers[input_index]);
        cudaFree(buffers[output_index]);
        return;
    }
    if (cudaMemset(buffers[output_index], 0, output_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to memset output buffer to zero", clientName);
        cudaFree(buffers[input_index]);
        cudaFree(buffers[output_index]);
        return;
    }

    // Copy the input data to the GPU
    cudaMemcpy(buffers[input_index], input_data, input_size, cudaMemcpyHostToDevice);

    // Launch inference
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    context->enqueueV2(buffers, stream, nullptr);
    cudaStreamSynchronize(stream);

    // Copy the output data from GPU to CPU
    cudaMemcpy(predicted_output, buffers[output_index], output_size, cudaMemcpyDeviceToHost);

    // Free GPU memory
    cudaFree(buffers[input_index]);
    cudaFree(buffers[output_index]);
    cudaStreamDestroy(stream);
}
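To narrow down where the mismatch appears, an intermediate check of the .onnx file with onnxruntime could look like this (the file path is a placeholder; the tensor names are taken from the model itself):
Code: Select all
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx")            # hypothetical path
input_name = sess.get_inputs()[0].name               # names as produced by tf2onnx
output_name = sess.get_outputs()[0].name

# Same preprocessed sample as used for the Keras and TensorRT runs, shape (1, 2000, 3)
sample = np.zeros((1, 2000, 3), dtype=np.float32)    # placeholder input
onnx_out = sess.run([output_name], {input_name: sample})[0]
print("ONNX output:", onnx_out.shape, onnx_out.ravel()[:5])
The engine itself is deserialized and used from the client handler thread like this: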
Code: Select all
IRuntime* runtime = createInferRuntime(gLogger);
if (!runtime) {
    write_log(LOG_ERROR, "client_handler: Failed to create runtime for client %s", client.ClientName);
    return (void*)-1;
}
std::vector<char> engine_data = loadEngine(client.ModelPath, client.ClientName);
ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size(), nullptr);
if (!engine) {
    write_log(LOG_ERROR, "client_handler: Failed to create engine for thread %s", client.ClientName);
    return (void*)-1;
}
IExecutionContext* context = engine->createExecutionContext();
if (!context) {
    write_log(LOG_ERROR, "client_handler: Failed to create execution context for thread %s", client.ClientName);
    engine->destroy();
    return (void*)-1;
}
int input_index = engine->getBindingIndex(client.ModelInputBindingName);   // get from config file
int output_index = engine->getBindingIndex(client.ModelOutputBindingName); // get from config file

RunInference(engine, context, input_index, output_index, x_fft, y_fft, z_fft, predicted_output, client.G_code, client.ClientName);

// Synchronize the GPU to ensure all operations are completed
cudaDeviceSynchronize();

// Check for CUDA errors after synchronization
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
    write_log(LOG_ERROR, "CUDA error after synchronization in thread '%s': %s", client.ClientName, cudaGetErrorString(err));
} else {
    write_log(LOG_INFO, "GPU synchronized successfully for thread '%s'", client.ClientName);
}

context->destroy();
engine->destroy();
runtime->destroy();
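One extra check (a sketch only, not in my current code) would be to dump the engine's binding names, shapes, and data types with the TensorRT Python API, to confirm that the input really is [1, 2000, 3] float and that the output shape matches what the C++ code copies back (the engine path is a placeholder):
Code: Select all
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
with open("model.engine", "rb") as f, trt.Runtime(logger) as runtime:   # hypothetical path
    engine = runtime.deserialize_cuda_engine(f.read())
    # Print every binding: name, shape, dtype, and whether it is an input or output
    for i in range(engine.num_bindings):
        print(engine.get_binding_name(i),
              engine.get_binding_shape(i),
              engine.get_binding_dtype(i),
              "input" if engine.binding_is_input(i) else "output")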
Do you have any suggestions?
More details here: https://stackoverflow.com/questions/793 ... agx-xavier