复现问题的方法
下面是核心部分的代码。其中 test_gemm 及其依赖的相关代码,可以在 test/test_core.py 中找到。
def test_gemm() -> None:
    """Check and time ``deep_gemm.gemm_fp8_fp8_bf16_nt`` over a grid of shapes.

    For every ``(m, k, n)`` combination, the inputs come from ``construct``,
    the kernel result is compared to the reference with ``calc_diff`` (must be
    below 0.001), and the kernel is then timed with ``bench_kineto``. One
    performance line is printed per shape.

    ``construct``, ``calc_diff`` and ``bench_kineto`` are defined outside this
    block.
    """
    print('Testing GEMM:')

    m_sizes = (64, 128, 4096)
    kn_shapes = [
        (576, 7168),
        (7168, 2112),
        (1536, 24576),
        (512, 32768),
        (16384, 7168),
        (7168, 4096),
        (2048, 7168),
    ]

    for m in m_sizes:
        for k, n in kn_shapes:
            x_fp8, y_fp8, out, ref_out = construct(m, k, n)

            # Correctness: run once and compare against the reference output.
            deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)
            diff = calc_diff(out, ref_out)
            assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}'

            # Timing: the closure reuses the tensors built for this shape.
            # noinspection PyShadowingNames
            def test_func():
                deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)

            # NOTE(review): t is presumably in seconds, given the 1e6 -> "us"
            # scaling below; confirm against bench_kineto.
            t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True)
            perf_line = (
                f' > Perf (m={m:5}, n={n:5}, k={k:5}): {t * 1e6:4.0f} us | '
                f'throughput: {2 * m * n * k / t / 1e12:4.0f} TFLOPS, '
                f'{(m * k + k * n + m * n * 2) / 1e9 / t:4.0f} GB/s'
            )
            print(perf_line)
    print()
def worker():
    """Child-process entry point: report identity, then run ``test_gemm`` forever.

    Prints the ``CUDA_VISIBLE_DEVICES`` value this process sees together with
    its PID, then calls ``test_gemm`` in an endless loop. The function never
    returns on its own; the process has to be terminated externally.
    """
    child_env = os.environ["CUDA_VISIBLE_DEVICES"]
    banner = f"child env:{child_env}, pid:{os.getpid()}"
    print(banner)

    # Intentionally unbounded: keep running the test until the process is killed.
    while True:
        test_gemm()
if __name__ == '__main__':
    # Launch one `worker` process per device listed in CUDA_VISIBLE_DEVICES,
    # each restricted to a single device, then wait for all of them.

    # Use the 'spawn' start method so each child starts a fresh interpreter.
    multiprocessing.set_start_method('spawn')

    old_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    # Tolerate stray whitespace and empty entries such as "0, 1," in the list.
    var_list = [dev.strip() for dev in old_cuda_visible_devices.split(',') if dev.strip()]
    if not var_list:
        raise SystemExit(
            'CUDA_VISIBLE_DEVICES must be set to a comma-separated device list, '
            'e.g. CUDA_VISIBLE_DEVICES="0,1,2,3"'
        )

    childs = []
    try:
        for local_env in var_list:
            # Each child inherits the parent's environment when it is started,
            # so narrow the variable to one device right before spawning.
            os.environ["CUDA_VISIBLE_DEVICES"] = local_env
            p = multiprocessing.Process(target=worker)
            p.start()
            childs.append(p)
    finally:
        # Do not leave the parent's environment pointing at the last device only.
        os.environ["CUDA_VISIBLE_DEVICES"] = old_cuda_visible_devices

    # `worker` loops forever, so this blocks until the children are terminated.
    for p in childs:
        p.join()
如果按下面的方式运行:
rm -rf /tmp/deep_gemm_like /mnt/yrfs/users/like/dg_cache; DG_JIT_CACHE_DIR=/mnt/yrfs/users/like/dg_cache CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" python3 ~/package//DeepGEMM/temp/n2_mp.py
这样运行时,DG_JIT_CACHE_DIR 指向的目录(/mnt/yrfs/users/like/dg_cache)不在本地硬盘上;在多进程环境下,deep_gemm 调用 os.replace 的时候,无法保证原子操作。
解决方法:把 DG_JIT_CACHE_DIR 指向本地硬盘上的目录,例如:
export DG_JIT_CACHE_DIR=/tmp/deep_gemm_like