Распараллеливание Numba не повышает производительность при моделировании Монте-Карло? - Цифровое Кемерово

Распараллеливание Numba не повышает производительность при моделировании Монте-Карло? ⇐ Python

Ответить Пред. тема След. тема

1 сообщение • Страница 1 из 1

Anonymous

Распараллеливание Numba не повышает производительность при моделировании Монте-Карло?

Цитата

Сообщение Anonymous » 01 июл 2024, 06:20

Это дополнительный вопрос к вопросу, который я задавал ранее, но я думаю, что мне следует начать все сначала. Я пытаюсь реализовать вычисление числа Пи методом Монте-Карло и использую numba для повышения производительности. Поскольку каждая итерация цикла независима от других, я подумал, что смогу повысить производительность с помощью parallel=True и numba.prange. Я попробовал и понял, что для небольших значений n распараллеливание того не стоит. Я попробовал улучшенную версию, в которой распараллеливание используется только после превышения определенного порога для n, но обнаружил, что большую часть времени она работает хуже, чем мои предыдущие попытки. Теперь у меня есть фрагмент кода с тремя версиями алгоритма: обычная без распараллеливания, параллельная версия с использованием numba.prange и «улучшенная» гибридная версия, использующая распараллеливание после того, как превышен указанный порог для n:
from datetime import timedelta
from time import perf_counter

import numba as nb
import numpy as np
import numpy.typing as npt

# Keyword arguments shared by every ``nb.jit`` call in this script.
jit_opts = {
    "nopython": True,
    "nogil": True,
    "cache": False,
    "error_model": "numpy",
    "fastmath": True,
}

# Module-level NumPy random generator that supplies the sample points.
rng = np.random.default_rng()

@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=True,
)
def count_points_in_circle_parallel(
    points: npt.NDArray[float],
) -> tuple[npt.NDArray[bool], int]:
    """Flag and count the points lying strictly inside the unit circle.

    Compiled with ``parallel=True``; the row loop uses ``nb.prange``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.

    Returns:
        ``(in_circle, in_circle_count)``: a boolean mask with one entry per
        row of ``points`` and the number of rows that satisfied the test.
    """
    in_circle = np.empty(points.shape[0], dtype=np.bool_)
    in_circle_count = 0
    for i in nb.prange(points.shape[0]):
        # Write the flag into slot ``i`` of the mask. Assigning to the bare
        # name ``in_circle`` would rebind the array to a scalar and leave
        # the mask unfilled.
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count

def monte_carlo_pi_parallel(
    n: int,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from ``n`` random points using the parallel counter.

    Returns:
        The ``(n, 2)`` array of sampled points, the inside-circle mask and
        the estimate ``4 * count / n``.
    """
    samples = rng.random((n, 2))
    mask, hits = count_points_in_circle_parallel(samples)
    estimate = 4 * hits / n
    return samples, mask, estimate

@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=False,
)
def count_points_in_circle(points: npt.NDArray[float]) -> tuple[npt.NDArray[bool], int]:
    """Flag and count the points lying strictly inside the unit circle.

    Sequential variant, compiled with ``parallel=False``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.

    Returns:
        ``(in_circle, in_circle_count)``: a boolean mask with one entry per
        row of ``points`` and the number of rows that satisfied the test.
    """
    in_circle = np.empty(points.shape[0], dtype=np.bool_)
    in_circle_count = 0
    for i in range(points.shape[0]):
        # Write the flag into slot ``i`` of the mask. Assigning to the bare
        # name ``in_circle`` would rebind the array to a scalar and leave
        # the mask unfilled.
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count

def monte_carlo_pi(n: int) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from ``n`` random points using the sequential counter.

    Returns:
        The ``(n, 2)`` array of sampled points, the inside-circle mask and
        the estimate ``4 * count / n``.
    """
    samples = rng.random((n, 2))
    mask, hits = count_points_in_circle(samples)
    estimate = 4 * hits / n
    return samples, mask, estimate

def count_points_in_circle_improved(
    points: npt.NDArray[float],
) -> tuple[npt.NDArray[bool], int]:
    """Flag and count the points lying strictly inside the unit circle.

    This single Python body is compiled twice below, once with
    ``parallel=True`` and once with ``parallel=False``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.

    Returns:
        ``(in_circle, in_circle_count)``: a boolean mask with one entry per
        row of ``points`` and the number of rows that satisfied the test.
    """
    in_circle = np.empty(points.shape[0], dtype=np.bool_)
    in_circle_count = 0
    for i in nb.prange(points.shape[0]):
        # Write the flag into slot ``i`` of the mask. Assigning to the bare
        # name ``in_circle`` would rebind the array to a scalar and leave
        # the mask unfilled.
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count


# The parallel wrapper must be built first: the second assignment rebinds
# ``count_points_in_circle_improved`` to the sequential compiled version.
count_points_in_circle_improved_parallel = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=True,
)(count_points_in_circle_improved)
count_points_in_circle_improved = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=False,
)(count_points_in_circle_improved)

def monte_carlo_pi_improved(
    n: int, parallel_threshold: int = 1000
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi, choosing the counter by problem size.

    The parallel counter is used only when ``n`` is strictly greater than
    ``parallel_threshold``; otherwise the sequential one is called.

    Returns:
        The ``(n, 2)`` array of sampled points, the inside-circle mask and
        the estimate ``4 * count / n``.
    """
    samples = rng.random((n, 2))
    if n > parallel_threshold:
        counter = count_points_in_circle_improved_parallel
    else:
        counter = count_points_in_circle_improved
    mask, hits = counter(samples)
    return samples, mask, 4 * hits / n

def _format_elapsed(duration: float) -> str:
    """Return the bracketed timing suffix printed after each estimate."""
    delta = timedelta(seconds=duration)
    # A zero timedelta is falsy; in that case only the raw float is shown.
    if delta:
        return f"[{delta} (Raw time: {duration} s)]"
    return f"[Raw time: {duration} s]"


def _run_benchmark(simulate, n_values, time_results, column: int) -> None:
    """Time ``simulate(n)`` for each ``n`` and print one result row per run.

    Each wall-clock duration is stored in ``time_results[i, column]``.
    """
    for i, n in enumerate(n_values):
        start = perf_counter()
        _, _, pi_approx = simulate(n)
        duration = perf_counter() - start
        time_results[i, column] = duration
        print(
            f"n = {n:,}:".ljust(20),
            f"\N{GREEK SMALL LETTER PI} \N{ALMOST EQUAL TO} {pi_approx}".ljust(20),
            _format_elapsed(duration),
        )


def main() -> None:
    """Benchmark the three estimators and print a per-``n`` speed ranking."""
    n_values = 10 ** np.arange(1, 9)
    n_values = np.concatenate(
        ([10], n_values)
    )  # Duplicate 10 to avoid startup overhead
    time_results = np.empty((len(n_values), 3), dtype=np.float64)

    if jit_opts.get("cache", False):
        print("Using cached JIT compilation")
    else:
        print("Using JIT compilation without caching")
    print()

    # Column order here must match ``result_types`` below.
    benchmarks = (
        ("Using parallel count_points_in_circle", monte_carlo_pi_parallel),
        ("Using non-parallel count_points_in_circle", monte_carlo_pi),
        ("Improved version:", monte_carlo_pi_improved),
    )
    for column, (title, simulate) in enumerate(benchmarks):
        print(title)
        _run_benchmark(simulate, n_values, time_results, column)
        print()

    print("Comparison:")
    result_types = ("parallel", "non-parallel", "improved")
    for n, res in zip(n_values, time_results):
        # argsort orders the three variants from fastest to slowest.
        fastest, middle, slowest = (result_types[k] for k in np.argsort(res))
        print(
            f"n = {n:,}:".ljust(20),
            f"{fastest} \N{LESS-THAN OR EQUAL TO} "
            f"{middle} \N{LESS-THAN OR EQUAL TO} "
            f"{slowest}",
        )

# Run the benchmark only when the file is executed as a script.
if __name__ == "__main__":
    main()

(Знаю-знаю, этот код не очень чистый и содержит повторы, но он предназначен для целей тестирования, и в итоге у меня останется только один из алгоритмов.) Я попробовал запустить его с cache=True и cache=False, чтобы проверить, помогает ли это в чем-то, но результаты оказались очень запутанными. Похоже, что иногда непараллельная версия работает быстрее даже при больших значениях n, а гибридная версия на самом деле ничего не улучшает. Вот пример результатов, которые я получаю:

Эти результаты очень запутанны и непоследовательны. В другом прогоне я понял, что непараллельная версия быстрее, а в другом — что параллельная версия быстрее. Похоже, я делаю что-то не так, но я не могу понять, что происходит. Почему я не вижу последовательного улучшения производительности в параллельной версии, особенно для больших значений n, и почему мой гибридный подход в большинстве случаев не улучшает производительность? Будем признательны за любое понимание того, что здесь происходит.
Изменить:
Следуя ответу @Jerome Richard, я изменил код, чтобы предварительно выделить буферы и повторно использовать их для всех моих тестов. Результаты по-прежнему кажутся мне странными: параллельная версия большую часть времени работает хуже всего, даже для больших n. Я даже включил n = 500 000 000, чтобы еще больше расширить границы (очевидно, мой компьютер не может обработать 1 000 000 000, поэтому пришлось сократить его вдвое), но параллельная версия по-прежнему работает хуже. Почему я не вижу каких-либо существенных улучшений в параллельной или гибридной версии алгоритма?
Измененный код:
from datetime import timedelta
from time import perf_counter

import numba as nb
import numpy as np
import numpy.typing as npt

# Keyword arguments shared by every ``nb.jit`` call in this script.
jit_opts = {
    "nopython": True,
    "nogil": True,
    "cache": False,
    "error_model": "numpy",
    "fastmath": True,
}

# Module-level NumPy random generator that supplies the sample points.
rng = np.random.default_rng()

@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :], nb.bool_[:]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :], nb.bool_[:]),
    ],
    **jit_opts,
    parallel=True,
)
def count_points_in_circle_parallel(
    points: npt.NDArray[float], in_circle: npt.NDArray[bool]
) -> tuple[npt.NDArray[bool], int]:
    """Fill ``in_circle`` with inside-unit-circle flags and count them.

    Compiled with ``parallel=True``; the row loop uses ``nb.prange``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.
        in_circle: Caller-supplied boolean output buffer, written in place
            at indices ``0 .. points.shape[0] - 1``.

    Returns:
        ``(in_circle, in_circle_count)``: the same buffer that was passed in
        and the number of rows that satisfied the test.
    """
    in_circle_count = 0
    for i in nb.prange(points.shape[0]):
        # Write the flag into slot ``i`` of the buffer. Assigning to the
        # bare name ``in_circle`` would rebind the parameter to a scalar and
        # never touch the caller's buffer.
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count

def monte_carlo_pi_parallel(
    n: int,
    out: npt.NDArray[float] | None = None,
    in_circle_out: npt.NDArray[bool] | None = None,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from ``n`` random points using the parallel counter.

    Args:
        n: Number of points to sample.
        out: Optional array forwarded to ``rng.random`` as its ``out``
            argument.
        in_circle_out: Optional boolean buffer for the mask; a fresh array
            of length ``n`` is allocated when it is ``None``.

    Returns:
        The sampled points, the inside-circle mask and ``4 * count / n``.
    """
    samples = rng.random((n, 2), out=out)
    mask_buffer = (
        np.empty(n, dtype=np.bool_) if in_circle_out is None else in_circle_out
    )
    mask, hits = count_points_in_circle_parallel(samples, mask_buffer)
    return samples, mask, 4 * hits / n

@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :], nb.bool_[:]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :], nb.bool_[:]),
    ],
    **jit_opts,
    parallel=False,
)
def count_points_in_circle(
    points: npt.NDArray[float], in_circle: npt.NDArray[bool]
) -> tuple[npt.NDArray[bool], int]:
    """Fill ``in_circle`` with inside-unit-circle flags and count them.

    Sequential variant, compiled with ``parallel=False``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.
        in_circle: Caller-supplied boolean output buffer, written in place
            at indices ``0 .. points.shape[0] - 1``.

    Returns:
        ``(in_circle, in_circle_count)``: the same buffer that was passed in
        and the number of rows that satisfied the test.
    """
    in_circle_count = 0
    for i in range(points.shape[0]):
        # Write the flag into slot ``i`` of the buffer. Assigning to the
        # bare name ``in_circle`` would rebind the parameter to a scalar and
        # never touch the caller's buffer.
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count

def monte_carlo_pi(
    n: int,
    out: npt.NDArray[float] | None = None,
    in_circle_out: npt.NDArray[bool] | None = None,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from ``n`` random points using the sequential counter.

    Args:
        n: Number of points to sample.
        out: Optional array forwarded to ``rng.random`` as its ``out``
            argument.
        in_circle_out: Optional boolean buffer for the mask; a fresh array
            of length ``n`` is allocated when it is ``None``.

    Returns:
        The sampled points, the inside-circle mask and ``4 * count / n``.
    """
    samples = rng.random((n, 2), out=out)
    mask_buffer = (
        np.empty(n, dtype=np.bool_) if in_circle_out is None else in_circle_out
    )
    mask, hits = count_points_in_circle(samples, mask_buffer)
    return samples, mask, 4 * hits / n

def count_points_in_circle_improved(
    points: npt.NDArray[float], in_circle: npt.NDArray[bool]
) -> tuple[npt.NDArray[bool], int]:
    """Fill ``in_circle`` with inside-unit-circle flags and count them.

    This single Python body is compiled twice below, once with
    ``parallel=True`` and once with ``parallel=False``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.
        in_circle: Caller-supplied boolean output buffer, written in place
            at indices ``0 .. points.shape[0] - 1``.

    Returns:
        ``(in_circle, in_circle_count)``: the same buffer that was passed in
        and the number of rows that satisfied the test.
    """
    in_circle_count = 0
    for i in nb.prange(points.shape[0]):
        # Write the flag into slot ``i`` of the buffer. Assigning to the
        # bare name ``in_circle`` would rebind the parameter to a scalar and
        # never touch the caller's buffer.
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count


# The parallel wrapper must be built first: the second assignment rebinds
# ``count_points_in_circle_improved`` to the sequential compiled version.
count_points_in_circle_improved_parallel = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :], nb.bool_[:]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :], nb.bool_[:]),
    ],
    **jit_opts,
    parallel=True,
)(count_points_in_circle_improved)
count_points_in_circle_improved = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :], nb.bool_[:]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :], nb.bool_[:]),
    ],
    **jit_opts,
    parallel=False,
)(count_points_in_circle_improved)

def monte_carlo_pi_improved(
    n: int,
    parallel_threshold: int = 1000,
    out: npt.NDArray[float] | None = None,
    in_circle_out: npt.NDArray[bool] | None = None,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi, choosing the counter by problem size.

    The parallel counter is used only when ``n`` is strictly greater than
    ``parallel_threshold``; otherwise the sequential one is called.

    Args:
        n: Number of points to sample.
        parallel_threshold: Size above which the parallel counter is used.
        out: Optional array forwarded to ``rng.random`` as its ``out``
            argument.
        in_circle_out: Optional boolean buffer for the mask; a fresh array
            of length ``n`` is allocated when it is ``None``.

    Returns:
        The sampled points, the inside-circle mask and ``4 * count / n``.
    """
    samples = rng.random((n, 2), out=out)
    mask_buffer = (
        np.empty(n, dtype=np.bool_) if in_circle_out is None else in_circle_out
    )
    if n > parallel_threshold:
        counter = count_points_in_circle_improved_parallel
    else:
        counter = count_points_in_circle_improved
    mask, hits = counter(samples, mask_buffer)
    return samples, mask, 4 * hits / n

def _format_elapsed(duration: float) -> str:
    """Return the bracketed timing suffix printed after each estimate."""
    delta = timedelta(seconds=duration)
    # A zero timedelta is falsy; in that case only the raw float is shown.
    if delta:
        return f"[{delta} (Raw time: {duration} s)]"
    return f"[Raw time: {duration} s]"


def _run_benchmark(
    simulate,
    n_values,
    time_results,
    column: int,
    buffer=None,
    in_circle_buffer=None,
) -> None:
    """Time ``simulate`` for each ``n`` and print one result row per run.

    When ``buffer`` is given, the first ``n`` entries of ``buffer`` and
    ``in_circle_buffer`` are passed as ``out`` / ``in_circle_out``; otherwise
    ``simulate(n)`` allocates its own arrays. Each wall-clock duration is
    stored in ``time_results[i, column]``.
    """
    for i, n in enumerate(n_values):
        start = perf_counter()
        if buffer is not None:
            _, _, pi_approx = simulate(
                n, out=buffer[:n], in_circle_out=in_circle_buffer[:n]
            )
        else:
            _, _, pi_approx = simulate(n)
        duration = perf_counter() - start
        time_results[i, column] = duration
        print(
            f"n = {n:,}:".ljust(20),
            f"\N{GREEK SMALL LETTER PI} \N{ALMOST EQUAL TO} {pi_approx}".ljust(20),
            _format_elapsed(duration),
        )


def main() -> None:
    """Benchmark the three estimators and print a per-``n`` speed ranking."""
    n_values = 10 ** np.arange(1, 9)
    n_values = np.concatenate(
        ([10], n_values, [500_000_000])
    )  # Duplicate 10 to avoid startup overhead
    use_preallocated_buffer = False
    time_results = np.empty((len(n_values), 3), dtype=np.float64)

    # Reserve the scratch arrays only when they will be used. At the largest
    # n (500,000,000) the points buffer alone is 8e9 bytes, and without the
    # flag each run allocates its own arrays on top of that.
    buffer = in_circle_buffer = None
    if use_preallocated_buffer:
        n_max = n_values.max()
        buffer = np.empty((n_max, 2), dtype=np.float64)
        in_circle_buffer = np.empty(n_max, dtype=np.bool_)

    if jit_opts.get("cache", False):
        print("Using cached JIT compilation")
    else:
        print("Using JIT compilation without caching")
    if use_preallocated_buffer:
        print("Using preallocated buffers")
    else:
        print("Not using preallocated buffers")
    print()

    # Column order here must match ``result_types`` below.
    benchmarks = (
        ("Using parallel count_points_in_circle", monte_carlo_pi_parallel),
        ("Using non-parallel count_points_in_circle", monte_carlo_pi),
        ("Improved version:", monte_carlo_pi_improved),
    )
    for column, (title, simulate) in enumerate(benchmarks):
        print(title)
        _run_benchmark(
            simulate, n_values, time_results, column, buffer, in_circle_buffer
        )
        print()

    print("Comparison:")
    result_types = ("parallel", "non-parallel", "improved")
    for n, res in zip(n_values, time_results):
        # argsort orders the three variants from fastest to slowest.
        fastest, middle, slowest = (result_types[k] for k in np.argsort(res))
        print(
            f"n = {n:,}:".ljust(20),
            f"{fastest} \N{LESS-THAN OR EQUAL TO} "
            f"{middle} \N{LESS-THAN OR EQUAL TO} "
            f"{slowest}",
        )

# Run the benchmark only when the file is executed as a script.
if __name__ == "__main__":
    main()

Результаты предварительного распределения:
Using JIT compilation without caching
Using preallocated buffers

Using parallel count_points_in_circle
n = 10: π ≈ 2.8 [0:00:00.018026 (Raw time: 0.018026399999996556 s)]
n = 10: π ≈ 2.4 [0:00:00.000072 (Raw time: 7.180000000062137e-05 s)]
n = 100: π ≈ 3.12 [0:00:00.000047 (Raw time: 4.7400000028119393e-05 s)]
n = 1,000: π ≈ 3.208 [0:00:00.000075 (Raw time: 7.499999998117346e-05 s)]
n = 10,000: π ≈ 3.1392 [0:00:00.000235 (Raw time: 0.00023540000000821237 s)]
n = 100,000: π ≈ 3.14048 [0:00:00.001509 (Raw time: 0.0015089999999986503 s)]
n = 1,000,000: π ≈ 3.143396 [0:00:00.014025 (Raw time: 0.014025000000003729 s)]
n = 10,000,000: π ≈ 3.14113 [0:00:00.123001 (Raw time: 0.12300090000002228 s)]
n = 100,000,000: π ≈ 3.1412414 [0:00:00.804258 (Raw time: 0.8042575999999713 s)]
n = 500,000,000: π ≈ 3.141718144 [0:00:04.104100 (Raw time: 4.104099899999994 s)]

Using non-parallel count_points_in_circle
n = 10: π ≈ 2.8 [0:00:00.000072 (Raw time: 7.189999996626284e-05 s)]
n = 10: π ≈ 3.2 [0:00:00.000023 (Raw time: 2.3100000021258893e-05 s)]
n = 100: π ≈ 3.24 [0:00:00.000019 (Raw time: 1.86000000326203e-05 s)]
n = 1,000: π ≈ 3.124 [0:00:00.000037 (Raw time: 3.739999999652355e-05 s)]
n = 10,000: π ≈ 3.1264 [0:00:00.000120 (Raw time: 0.00012040000001434237 s)]
n = 100,000: π ≈ 3.14256 [0:00:00.001055 (Raw time: 0.0010548999999855369 s)]
n = 1,000,000: π ≈ 3.141884 [0:00:00.010567 (Raw time: 0.010566699999969842 s)]
n = 10,000,000: π ≈ 3.1413664 [0:00:00.107006 (Raw time: 0.10700550000001385 s)]
n = 100,000,000: π ≈ 3.14188264 [0:00:00.865470 (Raw time: 0.8654702999999699 s)]
n = 500,000,000: π ≈ 3.141582376 [0:00:04.014441 (Raw time: 4.01444140000001 s)]

Improved version:
n = 10: π ≈ 2.8 [0:00:00.000067 (Raw time: 6.719999998949788e-05 s)]
n = 10: π ≈ 2.4 [0:00:00.000016 (Raw time: 1.550000001770968e-05 s)]
n = 100: π ≈ 3.24 [0:00:00.000029 (Raw time: 2.8799999995499093e-05 s)]
n = 1,000: π ≈ 3.192 [0:00:00.000022 (Raw time: 2.1799999956328975e-05 s)]
n = 10,000: π ≈ 3.172 [0:00:00.000185 (Raw time: 0.00018489999996518236 s)]
n = 100,000: π ≈ 3.14124 [0:00:00.001362 (Raw time: 0.0013624999999706233 s)]
n = 1,000,000: π ≈ 3.143404 [0:00:00.013065 (Raw time: 0.013065499999981967 s)]
n = 10,000,000: π ≈ 3.1418088 [0:00:00.112366 (Raw time: 0.11236619999999675 s)]
n = 100,000,000: π ≈ 3.141952 [0:00:00.682029 (Raw time: 0.6820288000000119 s)]
n = 500,000,000: π ≈ 3.141576848 [0:00:03.210755 (Raw time: 3.210754800000018 s)]

Comparison:
n = 10: improved ≤ non-parallel ≤ parallel
n = 10: improved ≤ non-parallel ≤ parallel
n = 100: non-parallel ≤ improved ≤ parallel
n = 1,000: improved ≤ non-parallel ≤ parallel
n = 10,000: non-parallel ≤ improved ≤ parallel
n = 100,000: non-parallel ≤ improved ≤ parallel
n = 1,000,000: non-parallel ≤ improved ≤ parallel
n = 10,000,000: non-parallel ≤ improved ≤ parallel
n = 100,000,000: improved ≤ parallel ≤ non-parallel
n = 500,000,000: improved ≤ non-parallel ≤ parallel

Результаты без предварительного распределения:
Using JIT compilation without caching
Not using preallocated buffers

Using parallel count_points_in_circle
n = 10: π ≈ 3.2 [0:00:00.003375 (Raw time: 0.0033753000000160682 s)]
n = 10: π ≈ 3.2 [0:00:00.000062 (Raw time: 6.170000006022747e-05 s)]
n = 100: π ≈ 3.2 [0:00:00.000059 (Raw time: 5.86999999541149e-05 s)]
n = 1,000: π ≈ 3.112 [0:00:00.000099 (Raw time: 9.939999995367543e-05 s)]
n = 10,000: π ≈ 3.1276 [0:00:00.000183 (Raw time: 0.00018330000000332802 s)]
n = 100,000: π ≈ 3.13956 [0:00:00.001689 (Raw time: 0.0016891000000214262 s)]
n = 1,000,000: π ≈ 3.142456 [0:00:00.015140 (Raw time: 0.015140099999939594 s)]
n = 10,000,000: π ≈ 3.1418444 [0:00:00.128062 (Raw time: 0.1280623000000105 s)]
n = 100,000,000: π ≈ 3.14139292 [0:00:00.831049 (Raw time: 0.8310494999999491 s)]
n = 500,000,000: π ≈ 3.141657016 [0:00:04.522461 (Raw time: 4.522460500000079 s)]

Using non-parallel count_points_in_circle
n = 10: π ≈ 3.2 [0:00:00.323710 (Raw time: 0.3237104999999474 s)]
n = 10: π ≈ 2.8 [0:00:00.000035 (Raw time: 3.4599999935380765e-05 s)]
n = 100: π ≈ 3.24 [0:00:00.000022 (Raw time: 2.1899999978813867e-05 s)]
n = 1,000: π ≈ 3.14 [0:00:00.000044 (Raw time: 4.419999993388046e-05 s)]
n = 10,000: π ≈ 3.1244 [0:00:00.000150 (Raw time: 0.00014989999999670545 s)]
n = 100,000: π ≈ 3.13744 [0:00:00.000897 (Raw time: 0.0008967999999640597 s)]
n = 1,000,000: π ≈ 3.143708 [0:00:00.008511 (Raw time: 0.008510500000056709 s)]
n = 10,000,000: π ≈ 3.1406824 [0:00:00.084274 (Raw time: 0.08427370000003975 s)]
n = 100,000,000: π ≈ 3.14154872 [0:00:00.902473 (Raw time: 0.9024734999999282 s)]
n = 500,000,000: π ≈ 3.141605384 [0:00:04.363011 (Raw time: 4.363010799999984 s)]

Improved version:
n = 10: π ≈ 3.2 [0:00:00.407473 (Raw time: 0.40747319999991305 s)]
n = 10: π ≈ 2.8 [0:00:00.000034 (Raw time: 3.4199999959128036e-05 s)]
n = 100: π ≈ 3.16 [0:00:00.000019 (Raw time: 1.9299999962640868e-05 s)]
n = 1,000: π ≈ 3.184 [0:00:00.000021 (Raw time: 2.0999999946980097e-05 s)]
n = 10,000: π ≈ 3.1388 [0:00:00.000233 (Raw time: 0.0002328000000488828 s)]
n = 100,000: π ≈ 3.13748 [0:00:00.001424 (Raw time: 0.0014244999999846186 s)]
n = 1,000,000: π ≈ 3.140832 [0:00:00.015200 (Raw time: 0.015200499999991735 s)]
n = 10,000,000: π ≈ 3.1420484 [0:00:00.131624 (Raw time: 0.13162439999996423 s)]
n = 100,000,000: π ≈ 3.14133648 [0:00:00.913009 (Raw time: 0.9130087999999432 s)]
n = 500,000,000: π ≈ 3.141633632 [0:00:04.001366 (Raw time: 4.001365899999996 s)]

Comparison:
n = 10: parallel ≤ non-parallel ≤ improved
n = 10: improved ≤ non-parallel ≤ parallel
n = 100: improved ≤ non-parallel ≤ parallel
n = 1,000: improved ≤ non-parallel ≤ parallel
n = 10,000: non-parallel ≤ parallel ≤ improved
n = 100,000: non-parallel ≤ improved ≤ parallel
n = 1,000,000: non-parallel ≤ parallel ≤ improved
n = 10,000,000: non-parallel ≤ parallel ≤ improved
n = 100,000,000: parallel ≤ non-parallel ≤ improved
n = 500,000,000: improved ≤ non-parallel ≤ parallel

Подробнее здесь: https://stackoverflow.com/questions/783 ... simulation

Реклама

1719804054

Anonymous

Это дополнительный вопрос к вопросу, который я задавал ранее, но я думаю, что мне следует начать все сначала. Я пытаюсь реализовать вычисление числа Пи методом Монте-Карло и использую numba для повышения производительности. Поскольку каждая итерация цикла независима от других, я подумал, что смогу повысить производительность с помощью parallel=True и numba.prange. Я попробовал и понял, что для небольших значений n распараллеливание того не стоит. Я попробовал улучшенную версию, в которой распараллеливание используется только после превышения определенного порога для n, но обнаружил, что большую часть времени она работает хуже, чем мои предыдущие попытки. Теперь у меня есть фрагмент кода с тремя версиями алгоритма: обычная без распараллеливания, параллельная версия с использованием numba.prange и «улучшенная» гибридная версия, использующая распараллеливание после того, как превышен указанный порог для n:
from datetime import timedelta
from time import perf_counter

import numba as nb
import numpy as np
import numpy.typing as npt

# Keyword arguments shared by every ``nb.jit`` call in this script.
jit_opts = {
    "nopython": True,
    "nogil": True,
    "cache": False,
    "error_model": "numpy",
    "fastmath": True,
}

# Module-level NumPy random generator that supplies the sample points.
rng = np.random.default_rng()

@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=True,
)
def count_points_in_circle_parallel(
    points: npt.NDArray[float],
) -> tuple[npt.NDArray[bool], int]:
    """Flag and count the points lying strictly inside the unit circle.

    Compiled with ``parallel=True``; the row loop uses ``nb.prange``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.

    Returns:
        A boolean mask with one entry per row of ``points`` and the number
        of rows that satisfied the test.
    """
    n_points = points.shape[0]
    mask = np.empty(n_points, dtype=np.bool_)
    hits = 0
    for row in nb.prange(n_points):
        inside = points[row, 0] ** 2 + points[row, 1] ** 2 < 1
        mask[row] = inside
        hits += inside
    return mask, hits

def monte_carlo_pi_parallel(
    n: int,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from ``n`` random points using the parallel counter.

    Returns:
        The ``(n, 2)`` array of sampled points, the inside-circle mask and
        the estimate ``4 * count / n``.
    """
    samples = rng.random((n, 2))
    mask, hits = count_points_in_circle_parallel(samples)
    estimate = 4 * hits / n
    return samples, mask, estimate

@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=False,
)
def count_points_in_circle(points: npt.NDArray[float]) -> tuple[npt.NDArray[bool], int]:
    """Flag and count the points lying strictly inside the unit circle.

    Sequential variant, compiled with ``parallel=False``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.

    Returns:
        A boolean mask with one entry per row of ``points`` and the number
        of rows that satisfied the test.
    """
    n_points = points.shape[0]
    mask = np.empty(n_points, dtype=np.bool_)
    hits = 0
    for row in range(n_points):
        inside = points[row, 0] ** 2 + points[row, 1] ** 2 < 1
        mask[row] = inside
        hits += inside
    return mask, hits

def monte_carlo_pi(n: int) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from ``n`` random points using the sequential counter.

    Returns:
        The ``(n, 2)`` array of sampled points, the inside-circle mask and
        the estimate ``4 * count / n``.
    """
    samples = rng.random((n, 2))
    mask, hits = count_points_in_circle(samples)
    estimate = 4 * hits / n
    return samples, mask, estimate

def count_points_in_circle_improved(
    points: npt.NDArray[float],
) -> tuple[npt.NDArray[bool], int]:
    """Flag and count the points lying strictly inside the unit circle.

    This single Python body is compiled twice below, once with
    ``parallel=True`` and once with ``parallel=False``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.

    Returns:
        A boolean mask with one entry per row of ``points`` and the number
        of rows that satisfied the test.
    """
    n_points = points.shape[0]
    mask = np.empty(n_points, dtype=np.bool_)
    hits = 0
    for row in nb.prange(n_points):
        inside = points[row, 0] ** 2 + points[row, 1] ** 2 < 1
        mask[row] = inside
        hits += inside
    return mask, hits


# The parallel wrapper must be built first: the second assignment rebinds
# ``count_points_in_circle_improved`` to the sequential compiled version.
count_points_in_circle_improved_parallel = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=True,
)(count_points_in_circle_improved)
count_points_in_circle_improved = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=False,
)(count_points_in_circle_improved)

def monte_carlo_pi_improved(
    n: int, parallel_threshold: int = 1000
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi, choosing the counter by problem size.

    The parallel counter is used only when ``n`` is strictly greater than
    ``parallel_threshold``; otherwise the sequential one is called.

    Returns:
        The ``(n, 2)`` array of sampled points, the inside-circle mask and
        the estimate ``4 * count / n``.
    """
    samples = rng.random((n, 2))
    if n > parallel_threshold:
        counter = count_points_in_circle_improved_parallel
    else:
        counter = count_points_in_circle_improved
    mask, hits = counter(samples)
    return samples, mask, 4 * hits / n

def _format_elapsed(duration: float) -> str:
    """Return the bracketed timing suffix printed after each estimate."""
    delta = timedelta(seconds=duration)
    # A zero timedelta is falsy; in that case only the raw float is shown.
    if delta:
        return f"[{delta} (Raw time: {duration} s)]"
    return f"[Raw time: {duration} s]"


def _run_benchmark(simulate, n_values, time_results, column: int) -> None:
    """Time ``simulate(n)`` for each ``n`` and print one result row per run.

    Each wall-clock duration is stored in ``time_results[i, column]``.
    """
    for i, n in enumerate(n_values):
        start = perf_counter()
        _, _, pi_approx = simulate(n)
        duration = perf_counter() - start
        time_results[i, column] = duration
        print(
            f"n = {n:,}:".ljust(20),
            f"\N{GREEK SMALL LETTER PI} \N{ALMOST EQUAL TO} {pi_approx}".ljust(20),
            _format_elapsed(duration),
        )


def main() -> None:
    """Benchmark the three estimators and print a per-``n`` speed ranking."""
    n_values = 10 ** np.arange(1, 9)
    n_values = np.concatenate(
        ([10], n_values)
    )  # Duplicate 10 to avoid startup overhead
    time_results = np.empty((len(n_values), 3), dtype=np.float64)

    if jit_opts.get("cache", False):
        print("Using cached JIT compilation")
    else:
        print("Using JIT compilation without caching")
    print()

    # Column order here must match ``result_types`` below.
    benchmarks = (
        ("Using parallel count_points_in_circle", monte_carlo_pi_parallel),
        ("Using non-parallel count_points_in_circle", monte_carlo_pi),
        ("Improved version:", monte_carlo_pi_improved),
    )
    for column, (title, simulate) in enumerate(benchmarks):
        print(title)
        _run_benchmark(simulate, n_values, time_results, column)
        print()

    print("Comparison:")
    result_types = ("parallel", "non-parallel", "improved")
    for n, res in zip(n_values, time_results):
        # argsort orders the three variants from fastest to slowest.
        fastest, middle, slowest = (result_types[k] for k in np.argsort(res))
        print(
            f"n = {n:,}:".ljust(20),
            f"{fastest} \N{LESS-THAN OR EQUAL TO} "
            f"{middle} \N{LESS-THAN OR EQUAL TO} "
            f"{slowest}",
        )

# Run the benchmark only when the file is executed as a script.
if __name__ == "__main__":
    main()

(Знаю-знаю, этот код не очень чистый и содержит повторы, но он предназначен для целей тестирования, и в итоге у меня останется только один из алгоритмов.) Я попробовал запустить его с cache=True и cache=False, чтобы проверить, помогает ли это в чем-то, но результаты оказались очень запутанными. Похоже, что иногда непараллельная версия работает быстрее даже при больших значениях n, а гибридная версия на самом деле ничего не улучшает. Вот пример результатов, которые я получаю:
[img]https://i.sstatic.net/FZHDZ.png[/img]
Эти результаты очень запутанны и непоследовательны. В другом прогоне я понял, что непараллельная версия быстрее, а в другом — что параллельная версия быстрее. Похоже, я делаю что-то не так, но я не могу понять, что происходит. Почему я не вижу последовательного улучшения производительности в параллельной версии, особенно для больших значений n, и почему мой гибридный подход в большинстве случаев не улучшает производительность? Будем признательны за любое понимание того, что здесь происходит.
Изменить:
Следуя ответу @Jerome Richard, я изменил код, чтобы предварительно выделить буферы и повторно использовать их для всех моих тестов. Результаты по-прежнему кажутся мне странными: параллельная версия большую часть времени работает хуже всего, даже для больших n.  Я даже включил n = 500 000 000, чтобы еще больше расширить границы (очевидно, мой компьютер не может обработать 1 000 000 000, поэтому пришлось сократить его вдвое), но параллельная версия по-прежнему работает хуже.  Почему я не вижу каких-либо существенных улучшений в параллельной или гибридной версии алгоритма?
Измененный код:
from datetime import timedelta
from time import perf_counter

import numba as nb
import numpy as np
import numpy.typing as npt

# Keyword arguments shared by every ``nb.jit`` call in this script.
jit_opts = {
    "nopython": True,
    "nogil": True,
    "cache": False,
    "error_model": "numpy",
    "fastmath": True,
}

# Module-level NumPy random generator that supplies the sample points.
rng = np.random.default_rng()

@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :], nb.bool_[:]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :], nb.bool_[:]),
    ],
    **jit_opts,
    parallel=True,
)
def count_points_in_circle_parallel(
    points: npt.NDArray[float], in_circle: npt.NDArray[bool]
) -> tuple[npt.NDArray[bool], int]:
    """Fill ``in_circle`` with inside-unit-circle flags and count them.

    Compiled with ``parallel=True``; the row loop uses ``nb.prange``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.
        in_circle: Caller-supplied boolean output buffer, written in place
            at indices ``0 .. points.shape[0] - 1``.

    Returns:
        The same ``in_circle`` buffer and the number of rows that satisfied
        the test.
    """
    n_points = points.shape[0]
    hits = 0
    for row in nb.prange(n_points):
        inside = points[row, 0] ** 2 + points[row, 1] ** 2 < 1
        in_circle[row] = inside
        hits += inside
    return in_circle, hits

def monte_carlo_pi_parallel(
    n: int,
    out: npt.NDArray[float] | None = None,
    in_circle_out: npt.NDArray[bool] | None = None,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from ``n`` random points using the parallel counter.

    Args:
        n: Number of points to sample.
        out: Optional array forwarded to ``rng.random`` as its ``out``
            argument.
        in_circle_out: Optional boolean buffer for the mask; a fresh array
            of length ``n`` is allocated when it is ``None``.

    Returns:
        The sampled points, the inside-circle mask and ``4 * count / n``.
    """
    samples = rng.random((n, 2), out=out)
    mask_buffer = (
        np.empty(n, dtype=np.bool_) if in_circle_out is None else in_circle_out
    )
    mask, hits = count_points_in_circle_parallel(samples, mask_buffer)
    return samples, mask, 4 * hits / n

@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :], nb.bool_[:]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :], nb.bool_[:]),
    ],
    **jit_opts,
    parallel=False,
)
def count_points_in_circle(
    points: npt.NDArray[float], in_circle: npt.NDArray[bool]
) -> tuple[npt.NDArray[bool], int]:
    """Fill ``in_circle`` with inside-unit-circle flags and count them.

    Sequential variant, compiled with ``parallel=False``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.
        in_circle: Caller-supplied boolean output buffer, written in place
            at indices ``0 .. points.shape[0] - 1``.

    Returns:
        The same ``in_circle`` buffer and the number of rows that satisfied
        the test.
    """
    n_points = points.shape[0]
    hits = 0
    for row in range(n_points):
        inside = points[row, 0] ** 2 + points[row, 1] ** 2 < 1
        in_circle[row] = inside
        hits += inside
    return in_circle, hits

def monte_carlo_pi(
    n: int,
    out: npt.NDArray[float] | None = None,
    in_circle_out: npt.NDArray[bool] | None = None,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from ``n`` random points using the sequential counter.

    Args:
        n: Number of points to sample.
        out: Optional array forwarded to ``rng.random`` as its ``out``
            argument.
        in_circle_out: Optional boolean buffer for the mask; a fresh array
            of length ``n`` is allocated when it is ``None``.

    Returns:
        The sampled points, the inside-circle mask and ``4 * count / n``.
    """
    samples = rng.random((n, 2), out=out)
    mask_buffer = (
        np.empty(n, dtype=np.bool_) if in_circle_out is None else in_circle_out
    )
    mask, hits = count_points_in_circle(samples, mask_buffer)
    return samples, mask, 4 * hits / n

def count_points_in_circle_improved(
    points: npt.NDArray[float], in_circle: npt.NDArray[bool]
) -> tuple[npt.NDArray[bool], int]:
    """Fill ``in_circle`` with inside-unit-circle flags and count them.

    This single Python body is compiled twice below, once with
    ``parallel=True`` and once with ``parallel=False``.

    Args:
        points: 2-D float array; columns 0 and 1 of each row are read as
            the point's coordinates.
        in_circle: Caller-supplied boolean output buffer, written in place
            at indices ``0 .. points.shape[0] - 1``.

    Returns:
        The same ``in_circle`` buffer and the number of rows that satisfied
        the test.
    """
    n_points = points.shape[0]
    hits = 0
    for row in nb.prange(n_points):
        inside = points[row, 0] ** 2 + points[row, 1] ** 2 < 1
        in_circle[row] = inside
        hits += inside
    return in_circle, hits


# The parallel wrapper must be built first: the second assignment rebinds
# ``count_points_in_circle_improved`` to the sequential compiled version.
count_points_in_circle_improved_parallel = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :], nb.bool_[:]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :], nb.bool_[:]),
    ],
    **jit_opts,
    parallel=True,
)(count_points_in_circle_improved)
count_points_in_circle_improved = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :], nb.bool_[:]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :], nb.bool_[:]),
    ],
    **jit_opts,
    parallel=False,
)(count_points_in_circle_improved)

def monte_carlo_pi_improved(
    n: int,
    parallel_threshold: int = 1000,
    out: npt.NDArray[float] | None = None,
    in_circle_out: npt.NDArray[bool] | None = None,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi, choosing the counter by problem size.

    The parallel counter is used only when ``n`` is strictly greater than
    ``parallel_threshold``; otherwise the sequential one is called.

    Args:
        n: Number of points to sample.
        parallel_threshold: Size above which the parallel counter is used.
        out: Optional array forwarded to ``rng.random`` as its ``out``
            argument.
        in_circle_out: Optional boolean buffer for the mask; a fresh array
            of length ``n`` is allocated when it is ``None``.

    Returns:
        The sampled points, the inside-circle mask and ``4 * count / n``.
    """
    samples = rng.random((n, 2), out=out)
    mask_buffer = (
        np.empty(n, dtype=np.bool_) if in_circle_out is None else in_circle_out
    )
    if n > parallel_threshold:
        counter = count_points_in_circle_improved_parallel
    else:
        counter = count_points_in_circle_improved
    mask, hits = counter(samples, mask_buffer)
    return samples, mask, 4 * hits / n

def _format_elapsed(duration: float) -> str:
    """Return the bracketed timing suffix printed after each estimate."""
    delta = timedelta(seconds=duration)
    # A timedelta is falsy only when it equals zero, which happens when the
    # duration rounds below its microsecond resolution; in that case only the
    # raw float is worth showing.
    if delta:
        return f"[{delta} (Raw time: {duration} s)]"
    return f"[Raw time: {duration} s]"


def _time_estimator(estimator, n_values, buffer, in_circle_buffer):
    """Time one estimator for every ``n``, printing a line per run.

    Args:
        estimator: Callable accepting ``n`` and the keyword arguments ``out``
            and ``in_circle_out``, returning ``(points, in_circle, pi)``.
        n_values: Sample sizes to benchmark, in order.
        buffer: Preallocated points buffer, or ``None`` to let the estimator
            allocate its own arrays.
        in_circle_buffer: Preallocated flag buffer; only used when ``buffer``
            is not ``None``.

    Returns:
        A float64 array holding the wall-clock duration of each run.
    """
    durations = np.empty(len(n_values), dtype=np.float64)
    for i, n in enumerate(n_values):
        start = perf_counter()
        if buffer is not None:
            _points, _in_circle, pi_approx = estimator(
                n, out=buffer[:n], in_circle_out=in_circle_buffer[:n]
            )
        else:
            _points, _in_circle, pi_approx = estimator(n)
        duration = perf_counter() - start
        durations[i] = duration
        print(
            f"n = {n:,}:".ljust(20),
            f"\N{GREEK SMALL LETTER PI} \N{ALMOST EQUAL TO} {pi_approx}".ljust(20),
            _format_elapsed(duration),
        )
    return durations


def main(*, use_preallocated_buffer: bool = False) -> None:
    """Benchmark the three pi estimators and print a per-``n`` ranking.

    Args:
        use_preallocated_buffer: When true, one points buffer and one flag
            buffer sized for the largest ``n`` are allocated up front and
            sliced for every run. When false (the default) no buffers are
            allocated here and each estimator call allocates its own arrays.
    """
    n_values = 10 ** np.arange(1, 9)
    n_values = np.concatenate(
        ([10], n_values, [500_000_000])
    )  # Duplicate 10 to avoid startup overhead

    # Only reserve the large scratch buffers when they will actually be used;
    # for the largest n they amount to several gigabytes.
    if use_preallocated_buffer:
        n_max = n_values.max()
        buffer = np.empty((n_max, 2), dtype=np.float64)
        in_circle_buffer = np.empty(n_max, dtype=np.bool_)
    else:
        buffer = None
        in_circle_buffer = None

    if jit_opts.get("cache", False):
        print("Using cached JIT compilation")
    else:
        print("Using JIT compilation without caching")
    if use_preallocated_buffer:
        print("Using preallocated buffers")
    else:
        print("Not using preallocated buffers")

    # (short name used in the ranking, section heading, estimator)
    benchmarks = (
        (
            "parallel",
            "Using parallel count_points_in_circle",
            monte_carlo_pi_parallel,
        ),
        (
            "non-parallel",
            "Using non-parallel count_points_in_circle",
            monte_carlo_pi,
        ),
        ("improved", "Improved version:", monte_carlo_pi_improved),
    )
    result_types = tuple(name for name, _heading, _func in benchmarks)
    time_results = np.empty((len(n_values), len(benchmarks)), dtype=np.float64)

    for col, (_name, heading, estimator) in enumerate(benchmarks):
        print()
        print(heading)
        time_results[:, col] = _time_estimator(
            estimator, n_values, buffer, in_circle_buffer
        )

    print()
    print("Comparison:")
    for n, res in zip(n_values, time_results):
        # Fastest first.
        ranking = " \N{LESS-THAN OR EQUAL TO} ".join(
            result_types[idx] for idx in np.argsort(res)
        )
        print(f"n = {n:,}:".ljust(20), ranking)


# Run the benchmark only when the file is executed as a script.
if __name__ == "__main__":
    main()

Результаты с предварительным выделением буферов:
Using JIT compilation without caching
Using preallocated buffers

Using parallel count_points_in_circle
n = 10:              π ≈ 2.8              [0:00:00.018026 (Raw time: 0.018026399999996556 s)]
n = 10:              π ≈ 2.4              [0:00:00.000072 (Raw time: 7.180000000062137e-05 s)]
n = 100:             π ≈ 3.12             [0:00:00.000047 (Raw time: 4.7400000028119393e-05 s)]
n = 1,000:           π ≈ 3.208            [0:00:00.000075 (Raw time: 7.499999998117346e-05 s)]
n = 10,000:          π ≈ 3.1392           [0:00:00.000235 (Raw time: 0.00023540000000821237 s)]
n = 100,000:         π ≈ 3.14048          [0:00:00.001509 (Raw time: 0.0015089999999986503 s)]
n = 1,000,000:       π ≈ 3.143396         [0:00:00.014025 (Raw time: 0.014025000000003729 s)]
n = 10,000,000:      π ≈ 3.14113          [0:00:00.123001 (Raw time: 0.12300090000002228 s)]
n = 100,000,000:     π ≈ 3.1412414        [0:00:00.804258 (Raw time: 0.8042575999999713 s)]
n = 500,000,000:     π ≈ 3.141718144      [0:00:04.104100 (Raw time: 4.104099899999994 s)]

Using non-parallel count_points_in_circle
n = 10:              π ≈ 2.8              [0:00:00.000072 (Raw time: 7.189999996626284e-05 s)]
n = 10:              π ≈ 3.2              [0:00:00.000023 (Raw time: 2.3100000021258893e-05 s)]
n = 100:             π ≈ 3.24             [0:00:00.000019 (Raw time: 1.86000000326203e-05 s)]
n = 1,000:           π ≈ 3.124            [0:00:00.000037 (Raw time: 3.739999999652355e-05 s)]
n = 10,000:          π ≈ 3.1264           [0:00:00.000120 (Raw time: 0.00012040000001434237 s)]
n = 100,000:         π ≈ 3.14256          [0:00:00.001055 (Raw time: 0.0010548999999855369 s)]
n = 1,000,000:       π ≈ 3.141884         [0:00:00.010567 (Raw time: 0.010566699999969842 s)]
n = 10,000,000:      π ≈ 3.1413664        [0:00:00.107006 (Raw time: 0.10700550000001385 s)]
n = 100,000,000:     π ≈ 3.14188264       [0:00:00.865470 (Raw time: 0.8654702999999699 s)]
n = 500,000,000:     π ≈ 3.141582376      [0:00:04.014441 (Raw time: 4.01444140000001 s)]

Improved version:
n = 10:              π ≈ 2.8              [0:00:00.000067 (Raw time: 6.719999998949788e-05 s)]
n = 10:              π ≈ 2.4              [0:00:00.000016 (Raw time: 1.550000001770968e-05 s)]
n = 100:             π ≈ 3.24             [0:00:00.000029 (Raw time: 2.8799999995499093e-05 s)]
n = 1,000:           π ≈ 3.192            [0:00:00.000022 (Raw time: 2.1799999956328975e-05 s)]
n = 10,000:          π ≈ 3.172            [0:00:00.000185 (Raw time: 0.00018489999996518236 s)]
n = 100,000:         π ≈ 3.14124          [0:00:00.001362 (Raw time: 0.0013624999999706233 s)]
n = 1,000,000:       π ≈ 3.143404         [0:00:00.013065 (Raw time: 0.013065499999981967 s)]
n = 10,000,000:      π ≈ 3.1418088        [0:00:00.112366 (Raw time: 0.11236619999999675 s)]
n = 100,000,000:     π ≈ 3.141952         [0:00:00.682029 (Raw time: 0.6820288000000119 s)]
n = 500,000,000:     π ≈ 3.141576848      [0:00:03.210755 (Raw time: 3.210754800000018 s)]

Comparison:
n = 10:              improved ≤ non-parallel ≤ parallel
n = 10:              improved ≤ non-parallel ≤ parallel
n = 100:             non-parallel ≤ improved ≤ parallel
n = 1,000:           improved ≤ non-parallel ≤ parallel
n = 10,000:          non-parallel ≤ improved ≤ parallel
n = 100,000:         non-parallel ≤ improved ≤ parallel
n = 1,000,000:       non-parallel ≤ improved ≤ parallel
n = 10,000,000:      non-parallel ≤ improved ≤ parallel
n = 100,000,000:     improved ≤ parallel ≤ non-parallel
n = 500,000,000:     improved ≤ non-parallel ≤ parallel

Результаты без предварительного выделения буферов:
Using JIT compilation without caching
Not using preallocated buffers

Using parallel count_points_in_circle
n = 10:              π ≈ 3.2              [0:00:00.003375 (Raw time: 0.0033753000000160682 s)]
n = 10:              π ≈ 3.2              [0:00:00.000062 (Raw time: 6.170000006022747e-05 s)]
n = 100:             π ≈ 3.2              [0:00:00.000059 (Raw time: 5.86999999541149e-05 s)]
n = 1,000:           π ≈ 3.112            [0:00:00.000099 (Raw time: 9.939999995367543e-05 s)]
n = 10,000:          π ≈ 3.1276           [0:00:00.000183 (Raw time: 0.00018330000000332802 s)]
n = 100,000:         π ≈ 3.13956          [0:00:00.001689 (Raw time: 0.0016891000000214262 s)]
n = 1,000,000:       π ≈ 3.142456         [0:00:00.015140 (Raw time: 0.015140099999939594 s)]
n = 10,000,000:      π ≈ 3.1418444        [0:00:00.128062 (Raw time: 0.1280623000000105 s)]
n = 100,000,000:     π ≈ 3.14139292       [0:00:00.831049 (Raw time: 0.8310494999999491 s)]
n = 500,000,000:     π ≈ 3.141657016      [0:00:04.522461 (Raw time: 4.522460500000079 s)]

Using non-parallel count_points_in_circle
n = 10:              π ≈ 3.2              [0:00:00.323710 (Raw time: 0.3237104999999474 s)]
n = 10:              π ≈ 2.8              [0:00:00.000035 (Raw time: 3.4599999935380765e-05 s)]
n = 100:             π ≈ 3.24             [0:00:00.000022 (Raw time: 2.1899999978813867e-05 s)]
n = 1,000:           π ≈ 3.14             [0:00:00.000044 (Raw time: 4.419999993388046e-05 s)]
n = 10,000:          π ≈ 3.1244           [0:00:00.000150 (Raw time: 0.00014989999999670545 s)]
n = 100,000:         π ≈ 3.13744          [0:00:00.000897 (Raw time: 0.0008967999999640597 s)]
n = 1,000,000:       π ≈ 3.143708         [0:00:00.008511 (Raw time: 0.008510500000056709 s)]
n = 10,000,000:      π ≈ 3.1406824        [0:00:00.084274 (Raw time: 0.08427370000003975 s)]
n = 100,000,000:     π ≈ 3.14154872       [0:00:00.902473 (Raw time: 0.9024734999999282 s)]
n = 500,000,000:     π ≈ 3.141605384      [0:00:04.363011 (Raw time: 4.363010799999984 s)]

Improved version:
n = 10:              π ≈ 3.2              [0:00:00.407473 (Raw time: 0.40747319999991305 s)]
n = 10:              π ≈ 2.8              [0:00:00.000034 (Raw time: 3.4199999959128036e-05 s)]
n = 100:             π ≈ 3.16             [0:00:00.000019 (Raw time: 1.9299999962640868e-05 s)]
n = 1,000:           π ≈ 3.184            [0:00:00.000021 (Raw time: 2.0999999946980097e-05 s)]
n = 10,000:          π ≈ 3.1388           [0:00:00.000233 (Raw time: 0.0002328000000488828 s)]
n = 100,000:         π ≈ 3.13748          [0:00:00.001424 (Raw time: 0.0014244999999846186 s)]
n = 1,000,000:       π ≈ 3.140832         [0:00:00.015200 (Raw time: 0.015200499999991735 s)]
n = 10,000,000:      π ≈ 3.1420484        [0:00:00.131624 (Raw time: 0.13162439999996423 s)]
n = 100,000,000:     π ≈ 3.14133648       [0:00:00.913009 (Raw time: 0.9130087999999432 s)]
n = 500,000,000:     π ≈ 3.141633632      [0:00:04.001366 (Raw time: 4.001365899999996 s)]

Comparison:
n = 10:              parallel ≤ non-parallel ≤ improved
n = 10:              improved ≤ non-parallel ≤ parallel
n = 100:             improved ≤ non-parallel ≤ parallel
n = 1,000:           improved ≤ non-parallel ≤ parallel
n = 10,000:          non-parallel ≤ parallel ≤ improved
n = 100,000:         non-parallel ≤ improved ≤ parallel
n = 1,000,000:       non-parallel ≤ parallel ≤ improved
n = 10,000,000:      non-parallel ≤ parallel ≤ improved
n = 100,000,000:     parallel ≤ non-parallel ≤ improved
n = 500,000,000:     improved ≤ non-parallel ≤ parallel
 

Подробнее здесь: [url]https://stackoverflow.com/questions/78372643/numba-parallelization-doesnt-help-performance-in-monte-carlo-simulation[/url]

Ответить Пред. тема След. тема

1 сообщение • Страница 1 из 1

Быстрый ответ

Заголовок:

Имя пользователя:

Изменение регистра текста:

Смайлики

Ещё смайлики…

К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми. Можно прикреплять файлы, перетаскивая их в окно сообщения.

Максимально разрешённый размер вложения: 15 МБ.

Имя файла:

Комментарий к файлу:

Имя файла	Комментарий к файлу	Размер	Статус

Похожие темы

Ответы

Просмотры

Последнее сообщение

Как проверить равновесие в моделировании Монте-Карло?

Последнее сообщение Anonymous « 01 мар 2024, 11:53
Добавлено в форуме C#

Anonymous » 01 мар 2024, 11:53 » в форуме C#

Я написал следующий класс, чтобы проверить, находится ли симуляция Монте-Карло в равновесии.

Первая функция использует статистическую проверку ошибок

# Файл: r_end_to_end_squared_scalar.dat # Описание: Данные моделирования Монте-Карло,...

0 Ответы

24 Просмотры

Последнее сообщение Anonymous
01 мар 2024, 11:53
Монте-Карло с OpenMP

Последнее сообщение Гость « 22 сен 2023, 16:42
Добавлено в форуме C++

Гость » 22 сен 2023, 16:42 » в форуме C++

Я изучаю OpenMP, и мне нужно внедрить его в симуляцию Монте-Карло. Однако после того, как я это реализовал, затраченное время все равно не сократилось так сильно, как ожидалось, как показано на рисунке. Мой код OpenMP используется неправильно,...

0 Ответы

31 Просмотры

Последнее сообщение Гость
22 сен 2023, 16:42
Монте-Карло с OpenMP

Последнее сообщение Anonymous « 23 сен 2023, 05:34
Добавлено в форуме C++

Anonymous » 23 сен 2023, 05:34 » в форуме C++

Я изучаю OpenMP, и мне нужно внедрить его в симуляцию Монте-Карло для оценки вероятности появления последовательных королей в перетасованной колоде. Однако после того, как я это реализовал, затраченное время все равно не сократилось так сильно, как...

0 Ответы

27 Просмотры

Последнее сообщение Anonymous
23 сен 2023, 05:34
Ошибка в реализации поиска по дереву Монте-Карло.

Последнее сообщение Anonymous « 05 июл 2024, 13:24
Добавлено в форуме C++

Anonymous » 05 июл 2024, 13:24 » в форуме C++

Я работаю над движком «Крестики-нолики», используя алгоритм поиска по дереву Монте-Карло (MCTS). Однако я столкнулся с ошибкой, из-за которой ИИ иногда не может заблокировать выигрышные ходы противника, что приводит к проигрышам. Кроме того,...

0 Ответы

20 Просмотры

Последнее сообщение Anonymous
05 июл 2024, 13:24
Ошибка в реализации поиска по дереву Монте-Карло.

Последнее сообщение Anonymous « 06 июл 2024, 20:11
Добавлено в форуме C++

Anonymous » 06 июл 2024, 20:11 » в форуме C++

Я работаю над движком «Крестики-нолики», используя алгоритм поиска по дереву Монте-Карло (MCTS). Однако я столкнулся с ошибкой, из-за которой ИИ иногда не может заблокировать выигрышные ходы противника, что приводит к проигрышам. Кроме того,...

0 Ответы

27 Просмотры

Последнее сообщение Anonymous
06 июл 2024, 20:11

Вернуться в «Python»

Programmiererforum