复现问题的方法

下面是复现脚本的核心代码；其中test_gemm及其依赖的代码可以在test/test_core.py中找到。

def test_gemm() -> None:
    """Validate and benchmark the FP8 GEMM kernel over a grid of problem sizes.

    For each (m, k, n) combination the kernel result is compared with the
    reference output returned by ``construct``; the kernel is then timed with
    ``bench_kineto`` and one performance line is printed.

    Raises:
        AssertionError: if ``calc_diff`` reports a value of 0.001 or more.
    """
    print('Testing GEMM:')
    kn_shapes = [(576, 7168), (7168, 2112), (1536, 24576), (512, 32768),
                 (16384, 7168), (7168, 4096), (2048, 7168)]
    for m in (64, 128, 4096):
        for k, n in kn_shapes:
            x_fp8, y_fp8, out, ref_out = construct(m, k, n)

            # noinspection PyShadowingNames
            def run_kernel():
                deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)

            # Correctness first: one kernel launch, then compare with the reference.
            run_kernel()
            diff = calc_diff(out, ref_out)
            assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}'

            # NOTE(review): the result is presumably in seconds, since it is
            # scaled by 1e6 and labelled "us" below - confirm in bench_kineto.
            elapsed = bench_kineto(run_kernel, 'fp8_gemm', suppress_kineto_output=True)
            tflops = 2 * m * n * k / elapsed / 1e12
            # Presumably 1 byte per input element and 2 bytes per output element.
            gb_per_s = (m * k + k * n + m * n * 2) / 1e9 / elapsed
            print(f' > Perf (m={m:5}, n={n:5}, k={k:5}): {elapsed * 1e6:4.0f} us | '
                  f'throughput: {tflops:4.0f} TFLOPS, '
                  f'{gb_per_s:4.0f} GB/s')
    print()

def worker():
    """Child-process entry point: report the GPU selection, then run test_gemm forever.

    Prints the value of CUDA_VISIBLE_DEVICES seen by this process together
    with its PID, then calls ``test_gemm`` in an endless loop. Never returns.

    Raises:
        KeyError: if CUDA_VISIBLE_DEVICES is not set in the environment.
    """
    visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    pid = os.getpid()
    print(f"child env:{visible_devices}, pid:{pid}")
    # Loop without end so the processes keep running test_gemm concurrently.
    while True:
        test_gemm()

if __name__ == '__main__':
    # Driver: start one worker process per GPU id listed in CUDA_VISIBLE_DEVICES
    # and wait for all of them.
    multiprocessing.set_start_method('spawn')  # set the start method

    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    # Drop empty entries (e.g. from a trailing comma) so no worker is started
    # with an empty device list.
    device_ids = [entry.strip() for entry in visible_devices.split(',') if entry.strip()]
    if not device_ids:
        raise SystemExit(
            'CUDA_VISIBLE_DEVICES must be set to a comma-separated list of '
            'GPU ids, e.g. "0,1,2,3"'
        )

    children = []
    try:
        for device_id in device_ids:
            # A child takes its environment from the parent at start time, so
            # narrowing the variable just before start() gives each worker a
            # single GPU.
            os.environ["CUDA_VISIBLE_DEVICES"] = device_id
            child = multiprocessing.Process(target=worker)
            child.start()
            children.append(child)
    finally:
        # Put the parent's original device list back once spawning is done.
        os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices

    for child in children:
        child.join()

如果运行

rm -rf /tmp/deep_gemm_like /mnt/yrfs/users/like/dg_cache; DG_JIT_CACHE_DIR=/mnt/yrfs/users/like/dg_cache CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" python3 ~/package//DeepGEMM/temp/n2_mp.py

按上面的方式运行时，DG_JIT_CACHE_DIR指向网络文件系统（NFS）上的目录，所有进程共用这一个缓存目录；在多进程环境下，deep_gemm调用os.replace的时候，无法保证原子操作，问题因此出现。
解决方法：把DG_JIT_CACHE_DIR放到本地硬盘上，例如：

export DG_JIT_CACHE_DIR=/tmp/deep_gemm_like

原因：os.replace在NFS上不是原子操作。

复现问题的方法

Leave a Comment 取消回复