std::shared_ptr results_tensor = std::make_shared();
std::shared_ptr model;
while(true)
{
auto start = std::chrono::high_resolution_clock::now();
*results_tensor = nn.model->forward({input});
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration duration = end - start;
{
std::lock_guard lock(cout_mutex);
std::cout forward(x);
return std::make_tuple(policy, value, action_values);
}
};
int main() {
// Initialize the network
Net net;
net.to(torch::kCUDA);
// Create a random input tensor (batch size 128, 4 channels, 20x20 size)
at::Tensor input_tensor = torch::rand({128, 4, 20, 20}, torch::kCUDA);
// Pass the input data through the network and get the output
for(int i = 0; i < 10; i++)
{
std::this_thread::sleep_for(std::chrono::seconds(60)); // pretend some calculations are being performed
// forward pass I
{
auto cpu_start = std::chrono::high_resolution_clock::now();
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0); // Start timing
auto output = net.forward(input_tensor);
cudaEventRecord(stop, 0); // End timing
cudaEventSynchronize(stop); // Wait for GPU to finish all operations
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop); // Calculate time
std::cout
Подробнее здесь: [url]https://stackoverflow.com/questions/79360980/what-caused-the-large-difference-in-execution-time-between-these-two-forward-pas[/url]
[code]for i in range(263):
    states = torch.rand(128, 4, 20, 20, device = device)
    # time.sleep(1)
    start_time = time.time()
    policy, value, action_value = model(states)
    end_time = time.time()
    total_time = end_time - start_time
    print(f"time cost:{total_time:.4f} second")
[/code]
Вывод выглядит так:
[code]time cost:0.0003 second
time cost:0.0003 second
time cost:0.0003 second
...
[/code]
Но когда я раскомментирую time.sleep(1), выполнение замедляется:
[code]time cost:0.0008 second
time cost:0.0009 second
time cost:0.0009 second
[/code]
В моей программе на C++ с libtorch разница еще больше (в ней более 200 строк, поэтому я не размещаю ее здесь):
[code]std::shared_ptr results_tensor = std::make_shared();
std::shared_ptr model;
while(true)
{
auto start = std::chrono::high_resolution_clock::now();
*results_tensor = nn.model->forward({input});
auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; { std::lock_guard lock(cout_mutex); std::cout forward(x);
// Pass the network through the input data and get the output for(int i = 0; i < 10; i++) { std::this_thread::sleep_for(std::chrono::seconds(60)); // pretend some calculations are being performed // forward pass I { auto cpu_start = std::chrono::high_resolution_clock::now();
cudaEventSynchronize(stop); // Wait for GPU to finish all operations float elapsedTime; cudaEventElapsedTime(&elapsedTime, start, stop); // Calculate time