{"id":515,"date":"2025-03-24T16:20:59","date_gmt":"2025-03-24T08:20:59","guid":{"rendered":"https:\/\/189505.xyz\/?p=515"},"modified":"2026-05-26T10:41:08","modified_gmt":"2026-05-26T02:41:08","slug":"llm","status":"publish","type":"post","link":"https:\/\/189505.xyz\/?p=515","title":{"rendered":"llm"},"content":{"rendered":"<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_40 counter-hierarchy ez-toc-counter ez-toc-grey ez-toc-container-direction\">\n<div class=\"ez-toc-title-container\">\n<p class=\"ez-toc-title\">Table of Contents<\/p>\n<span class=\"ez-toc-title-toggle\"><a href=\"#\" class=\"ez-toc-pull-right ez-toc-btn ez-toc-btn-xs ez-toc-btn-default ez-toc-toggle\" area-label=\"ez-toc-toggle-icon-1\"><label for=\"item-6a28d1d5409ee\" aria-label=\"Table of Content\"><span style=\"display: flex;align-items: center;width: 35px;height: 30px;justify-content: center;direction:ltr;\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/label><input  type=\"checkbox\" id=\"item-6a28d1d5409ee\"><\/a><\/span><\/div>\n<nav><ul class='ez-toc-list ez-toc-list-level-1 ' ><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"https:\/\/189505.xyz\/?p=515\/#vllm\" title=\"\nvllm \n\">\nvllm \n<\/a><ul 
class='ez-toc-list-level-2'><li class='ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"https:\/\/189505.xyz\/?p=515\/#offline_mode\" title=\"\noffline mode \n\">\noffline mode \n<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"https:\/\/189505.xyz\/?p=515\/#vllm_serve\" title=\"\nvllm serve \n\">\nvllm serve \n<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-4\" href=\"https:\/\/189505.xyz\/?p=515\/#vllm_bench_serve\" title=\"vllm bench serve\">vllm bench serve<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-5\" href=\"https:\/\/189505.xyz\/?p=515\/#curl\" title=\"curl\">curl<\/a><ul class='ez-toc-list-level-2'><li class='ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-6\" href=\"https:\/\/189505.xyz\/?p=515\/#curl_v1chatcompletions\" title=\"curl v1\/chat\/completions\">curl v1\/chat\/completions<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-7\" href=\"https:\/\/189505.xyz\/?p=515\/#%E6%BA%90%E7%A0%81%E7%BC%96%E8%AF%91vllm\" title=\"\u6e90\u7801\u7f16\u8bd1vllm\">\u6e90\u7801\u7f16\u8bd1vllm<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-8\" href=\"https:\/\/189505.xyz\/?p=515\/#forward%E5%87%BD%E6%95%B0%E9%87%8C%E9%9D%A2%EF%BC%8C%E6%9C%89%E4%B8%80%E4%BA%9B%E4%B8%8D%E5%8F%AFtrace_cuda_graph%E7%9A%84%E5%87%BD%E6%95%B0\" title=\"forward\u51fd\u6570\u91cc\u9762\uff0c\u6709\u4e00\u4e9b\u4e0d\u53eftrace cuda graph\u7684\u51fd\u6570\">forward\u51fd\u6570\u91cc\u9762\uff0c\u6709\u4e00\u4e9b\u4e0d\u53eftrace cuda graph\u7684\u51fd\u6570<\/a><ul class='ez-toc-list-level-2'><li class='ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-9\" href=\"https:\/\/189505.xyz\/?p=515\/#sglang\" title=\"sglang\">sglang<\/a><\/li><li class='ez-toc-page-1 
ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-10\" href=\"https:\/\/189505.xyz\/?p=515\/#vllm-2\" title=\"vllm\">vllm<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-11\" href=\"https:\/\/189505.xyz\/?p=515\/#sglang_%E4%BD%BF%E7%94%A8cuda128_%E6%BA%90%E7%A0%81%E7%BC%96%E8%AF%91\" title=\"sglang \u4f7f\u7528cuda12.8 \u6e90\u7801\u7f16\u8bd1\">sglang \u4f7f\u7528cuda12.8 \u6e90\u7801\u7f16\u8bd1<\/a><\/li><\/ul><\/nav><\/div>\n<h1><span class=\"ez-toc-section\" id=\"vllm\"><\/span>\nvllm<br \/>\n<span class=\"ez-toc-section-end\"><\/span><\/h1>\n<p><a href=\"https:\/\/docs.vllm.ai\/en\/latest\/getting_started\/quickstart.html\">https:\/\/docs.vllm.ai\/en\/latest\/getting_started\/quickstart.html<\/a><\/p>\n<p>\u7531\u4e8ehugging-face\u65e0\u6cd5\u8bbf\u95eequick start\u91cc\u9762\u7684\u4e0b\u9762\u4ee3\u7801\u4f1a\u62a5\u9519<\/p>\n<pre><code>llm = LLM(model=&quot;facebook\/opt-125m&quot;)<\/code><\/pre>\n<p>\u5148\u7528proxychains\u4e0b\u8f7d\uff0c\u4e0b\u8f7d\u4ee3\u7801\u5982\u4e0b(<a href=\"https:\/\/github.com\/vllm-project\/vllm\/discussions\/1405\">https:\/\/github.com\/vllm-project\/vllm\/discussions\/1405<\/a>)\uff1a<\/p>\n<pre><code>from huggingface_hub import snapshot_download\n\n#model_id=&quot;deepseek-ai\/DeepSeek-R1-Distill-Qwen-14B&quot;\nmodel_id=&quot;Qwen\/Qwen2.5-1.5B-Instruct&quot;\nmodel_path = snapshot_download(\n    repo_id=model_id,\n    local_dir=&quot;.\/models\/&quot;+model_id,\n    max_workers=4  # Increase for faster parallel downloads\n)<\/code><\/pre>\n<p>scp\u5230\u670d\u52a1\u5668<\/p>\n<h2><span class=\"ez-toc-section\" id=\"offline_mode\"><\/span>\noffline mode<br \/>\n<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<pre><code>(torch2) A|a141|2025-03-24 16:46:36[like@ vllm]cat n1_fb.py\n\nfrom vllm import LLM, SamplingParams\nprompts = [\n    &quot;Hello, my name is&quot;,\n    &quot;The president of the United States is&quot;,\n    &quot;The capital 
of France is&quot;,\n    &quot;The future of AI is&quot;,\n]\nsampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n#llm = LLM(model=&quot;facebook\/opt-125m&quot;)\n#llm = LLM(model=&quot;\/share_data\/users\/like\/hf-models\/facebook\/opt-125m&quot;)\ngpu_memory_utilization=0.013\nllm = LLM(model=&quot;\/share_data\/users\/like\/hf-models\/facebook\/opt-125m&quot;, gpu_memory_utilization=gpu_memory_utilization)\noutputs = llm.generate(prompts, sampling_params)\n\nfor output in outputs:\n    prompt = output.prompt\n    generated_text = output.outputs[0].text\n    print(f&quot;Prompt: {prompt!r}, Generated text: {generated_text!r}&quot;)\nprint(f&quot;enter to end,,gpu_memory_utilization:{gpu_memory_utilization}&quot;)\nx = input()\nprint(f&quot;x:{x}&quot;)<\/code><\/pre>\n<h2><span class=\"ez-toc-section\" id=\"vllm_serve\"><\/span>\nvllm serve<br \/>\n<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<pre><code>vllm serve \/share_data\/users\/like\/hf-models\/facebook\/opt-125m\/ --gpu-memory-utilization 0.2<\/code><\/pre>\n<h2><span class=\"ez-toc-section\" id=\"vllm_bench_serve\"><\/span>vllm bench serve<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p><a href=\"https:\/\/github.com\/vllm-project\/vllm\/pull\/17625\">https:\/\/github.com\/vllm-project\/vllm\/pull\/17625<\/a><\/p>\n<pre><code># Server side with triton backend (Plz use VLLM_ATTENTION_BACKEND=CUTLASS_MLA_VLLM_V1 for cutlass backend):\nVLLM_LOGGING_LEVEL=DEBUG \\\nVLLM_WORKER_MULTIPROC_METHOD=spawn \\\n  vllm serve deepseek-ai\/DeepSeek-V3 \\\n    --trust-remote-code \\\n    --max-model-len=2048 \\\n    --block-size=128 \\\n    --max-num-seqs=512 \\\n    --gpu_memory_utilization=0.97 \\\n    --data-parallel-size $NUM_GPUS --enable-expert-parallel \\\n    --disable-log-requests\n\n# client side:\npython $VLLM_PATH\/benchmarks\/benchmark_serving.py \\\n  --model deepseek-ai\/DeepSeek-V3 \\\n  --dataset-name random \\\n  --ignore-eos \\\n  --num-prompts 3000 \\\n  --max-concurrency 3000 
\\\n  --random-input-len 1000 \\\n  --random-output-len 1<\/code><\/pre>\n<p>\u7ed3\u679c<\/p>\n<pre><code># With default triton backend:\n============ Serving Benchmark Result ============\nSuccessful requests:                     2989\nBenchmark duration (s):                  1046.01\nTotal input tokens:                      2989000\nTotal generated tokens:                  2989000\nRequest throughput (req\/s):              2.86\nOutput token throughput (tok\/s):         2857.52\nTotal Token throughput (tok\/s):          5715.04\n---------------Time to First Token----------------\nMean TTFT (ms):                          200716.51\nMedian TTFT (ms):                        199463.35\nP99 TTFT (ms):                           395239.25\n-----Time per Output Token (excl. 1st token)------\nMean TPOT (ms):                          826.04\nMedian TPOT (ms):                        826.20\nP99 TPOT (ms):                           1001.39\n---------------Inter-token Latency----------------\nMean ITL (ms):                           826.04\nMedian ITL (ms):                         648.89\nP99 ITL (ms):                            8337.69\n==================================================\n\nWith cutlass_mla backend:\n============ Serving Benchmark Result ============\nSuccessful requests:                     2989\nBenchmark duration (s):                  881.52\nTotal input tokens:                      2989000\nTotal generated tokens:                  2989000\nRequest throughput (req\/s):              3.39\nOutput token throughput (tok\/s):         3390.73\nTotal Token throughput (tok\/s):          6781.46\n---------------Time to First Token----------------\nMean TTFT (ms):                          190244.11\nMedian TTFT (ms):                        189563.96\nP99 TTFT (ms):                           372713.07\n-----Time per Output Token (excl. 
1st token)------\nMean TPOT (ms):                          685.60\nMedian TPOT (ms):                        686.96\nP99 TPOT (ms):                           858.01\n---------------Inter-token Latency----------------\nMean ITL (ms):                           685.60\nMedian ITL (ms):                         518.56\nP99 ITL (ms):                            7738.23\n==================================================\nTo repro the results:<\/code><\/pre>\n<h1><span class=\"ez-toc-section\" id=\"curl\"><\/span>curl<span class=\"ez-toc-section-end\"><\/span><\/h1>\n<h2><span class=\"ez-toc-section\" id=\"curl_v1chatcompletions\"><\/span>curl v1\/chat\/completions<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<p>sglang\u542f\u52a8\u547d\u4ee4\uff1a<\/p>\n<pre><code>CUDA_VISIBLE_DEVICES=0,1 python3 -m sglang.launch_server --model-path \/mnt\/yrfs\/llm_weights\/Meta-Llama-3.1-8B-Instruct\/ --quantization fp8 --port 30000 --host 0.0.0.0 --tp-size 2 &gt; ~\/package\/\/sglang_kernel_src\/temp\/sglang-server.log 2&gt;&amp;1 &amp;<\/code><\/pre>\n<p>\u5ba2\u6237\u7aef\u547d\u4ee4\uff1a<\/p>\n<pre><code>curl http:\/\/localhost:30000\/v1\/chat\/completions  -H &quot;Content-Type: application\/json&quot;  -d &#039;{ &quot;model&quot;: &quot;\/mnt\/yrfs\/llm_weights\/Meta-Llama-3.1-8B-Instruct\/&quot;, &quot;messages&quot;: [ {&quot;role&quot;: &quot;system&quot;, &quot;content&quot;: &quot;You are a helpful assistant.&quot;}, {&quot;role&quot;: &quot;user&quot;, &quot;content&quot;: &quot;what is rust?&quot;} ] }&#039;<\/code><\/pre>\n<p>vllm\u63a8\u8350\u7684\u547d\u4ee4<br \/>\n<a href=\"https:\/\/docs.vllm.ai\/en\/latest\/getting_started\/quickstart\/#openai-compatible-server\">https:\/\/docs.vllm.ai\/en\/latest\/getting_started\/quickstart\/#openai-compatible-server<\/a><\/p>\n<pre><code>curl http:\/\/localhost:30000\/v1\/completions  -H &quot;Content-Type: application\/json&quot;  -d &#039;{ &quot;model&quot;: 
&quot;\/mnt\/yrfs\/llm_weights\/Meta-Llama-3.1-8B-Instruct\/&quot;, &quot;prompt&quot;: &quot;San Francisco is a&quot;, &quot;max_tokens&quot;: 20, &quot;temperature&quot;: 0 }&#039;<\/code><\/pre>\n<p>v1\/chat\/completions\u4e5f\u53ef\u4ee5\u8fd9\u6837\u5199<\/p>\n<pre><code>curl http:\/\/localhost:30000\/v1\/chat\/completions  -H &quot;Content-Type: application\/json&quot;  -d &#039;{ &quot;model&quot;: &quot;\/mnt\/yrfs\/llm_weights\/Meta-Llama-3.1-8B-Instruct\/&quot;, &quot;messages&quot;: [ {&quot;role&quot;: &quot;system&quot;, &quot;content&quot;: &quot;You are a helpful assistant.&quot;}, {&quot;role&quot;: &quot;user&quot;, &quot;content&quot;: &quot;Who won the world series in 2020?&quot;} ] }&#039; \n\n\u7ed3\u679c\uff1a                                                                {&quot;id&quot;:&quot;2c2b992d86f344cfb04467baa08df3a9&quot;,&quot;object&quot;:&quot;chat.completion&quot;,&quot;created&quot;:1764814659,&quot;model&quot;:&quot;\/mnt\/yrfs\/llm_weights\/Meta-Llama-3.1-8B-Instruct\/&quot;,&quot;choices&quot;:[{&quot;index&quot;:0,&quot;message&quot;:{&quot;role&quot;:&quot;assistant&quot;,&quot;content&quot;:&quot;The Los Angeles Dodgers won the World Series in 2020, defeating the Tampa Bay Rays in the series 4 games to 2. 
It was the Dodgers&#039; first World Series title since 1988.&quot;,&quot;reasoning_content&quot;:null,&quot;tool_calls&quot;:null},&quot;logprobs&quot;:null,&quot;finish_reason&quot;:&quot;stop&quot;,&quot;matched_stop&quot;:128009}],&quot;usage&quot;:{&quot;prompt_tokens&quot;:31,&quot;total_tokens&quot;:74,&quot;completion_tokens&quot;:43,&quot;prompt_tokens_details&quot;:null,&quot;reasoning_tokens&quot;:0},&quot;metadata&quot;:{&quot;weight_version&quot;:&quot;default&quot;}}<\/code><\/pre>\n<h1><span class=\"ez-toc-section\" id=\"%E6%BA%90%E7%A0%81%E7%BC%96%E8%AF%91vllm\"><\/span>\u6e90\u7801\u7f16\u8bd1vllm<span class=\"ez-toc-section-end\"><\/span><\/h1>\n<p>\u73af\u5883\u53d8\u91cf<\/p>\n<pre><code>export MAX_JOBS=60\n\nexport NCCL_NVLS_ENABLE=0\nexport CUDA_ROOT_DIR=\/share_data\/users\/like\/opt\/cuda-12.8\/\nexport CUDA_HOME=$CUDA_ROOT_DIR\nexport LD_LIBRARY_PATH=${CUDA_HOME}\/lib64\/:${LD_LIBRARY_PATH}\nexport PATH=${CUDA_HOME}\/bin\/:$PATH\nexport DG_JIT_CACHE_DIR=\/tmp\/deep_gemm_cache_like\nmkdir -p $DG_JIT_CACHE_DIR\nexport DG_JIT_NVCC_COMPILER=$CUDA_HOME\/bin\/nvcc\nexport TRITON_CACHE_DIR=\/tmp\/triton_cache_like\nmkdir -p $TRITON_CACHE_DIR\n# for pip install\nexport NVCC_VERBOSE=1\nexport CUDA_VERBOSE_BUILD=1\nexport CMAKE_VERBOSE_MAKEFILE=ON\n# for scikit-build-core verbose build\nexport SKBUILD_VERBOSE=1\n\nexport VLLM_CMAKE_CUSTOM_BUILD_DIR=build-src-bjh-v0.13.0-like-dev-v2026-01-15<\/code><\/pre>\n<h1><span class=\"ez-toc-section\" id=\"forward%E5%87%BD%E6%95%B0%E9%87%8C%E9%9D%A2%EF%BC%8C%E6%9C%89%E4%B8%80%E4%BA%9B%E4%B8%8D%E5%8F%AFtrace_cuda_graph%E7%9A%84%E5%87%BD%E6%95%B0\"><\/span>forward\u51fd\u6570\u91cc\u9762\uff0c\u6709\u4e00\u4e9b\u4e0d\u53eftrace cuda graph\u7684\u51fd\u6570<span class=\"ez-toc-section-end\"><\/span><\/h1>\n<h2><span class=\"ez-toc-section\" id=\"sglang\"><\/span>sglang<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<pre><code>+      self._debug_moe_apply_count = 0\n+    _in_capture = 
torch.cuda.is_current_stream_capturing()<\/code><\/pre>\n<h2><span class=\"ez-toc-section\" id=\"vllm-2\"><\/span>vllm<span class=\"ez-toc-section-end\"><\/span><\/h2>\n<pre><code> _in_capture = torch.compiler.is_compiling() or torch.cuda.is_current_stream_capturing()\n<\/code><\/pre>\n<h1><span class=\"ez-toc-section\" id=\"sglang_%E4%BD%BF%E7%94%A8cuda128_%E6%BA%90%E7%A0%81%E7%BC%96%E8%AF%91\"><\/span>sglang \u4f7f\u7528cuda12.8 \u6e90\u7801\u7f16\u8bd1<span class=\"ez-toc-section-end\"><\/span><\/h1>\n<p>env-build-pip.sh<\/p>\n<pre><code>(simo_sglang) gpu015|2026-05-26 10:32:57[like@ install-scripts-sgl] cat env-build-pip.sh\nexport NCCL_NVLS_ENABLE=0\nexport MAX_JOBS=60\nexport CUDA_ROOT_DIR=\/share_data\/users\/like\/opt\/cuda-12.8\/\nexport CUDA_HOME=$CUDA_ROOT_DIR\nexport LD_LIBRARY_PATH=${CUDA_HOME}\/lib64\/:${LD_LIBRARY_PATH}\nexport PATH=${CUDA_HOME}\/bin\/:$PATH\nexport DG_JIT_CACHE_DIR=\/tmp\/deep_gemm_cache_like\nmkdir -p $DG_JIT_CACHE_DIR\nexport DG_JIT_NVCC_COMPILER=$CUDA_HOME\/bin\/nvcc\nexport TRITON_CACHE_DIR=\/tmp\/triton_cache_like\nmkdir -p $TRITON_CACHE_DIR\n# for pip install\nexport NVCC_VERBOSE=1\nexport CUDA_VERBOSE_BUILD=1\nexport CMAKE_VERBOSE_MAKEFILE=ON\n# for scikit-build-core verbose build\nexport SKBUILD_VERBOSE=1<\/code><\/pre>\n<p>skl, \u91cd\u70b9\u4f7f\u7528--index-url <a href=\"https:\/\/download.pytorch.org\/whl\/cu128\">https:\/\/download.pytorch.org\/whl\/cu128<\/a> \uff0c\u4e0b\u8f7d\u7684torch\u662ffrom cu128<\/p>\n<pre><code>(simo_sglang) gpu015|2026-05-26 10:33:25[like@ install-scripts-sgl] cat install-skl-src.sh\nset -x\nsource install-scripts-sgl\/env-build-pip.sh\n#pip install --config-settings=build.verbose=true -vvv -e &quot;sgl-kernel&quot; --no-build-isolation &gt; temp\/pip-sgl-kernel.log.main-local-dep.txt 2&gt;&amp;1 &amp;\npip install --config-settings=build.verbose=true -vvv -e &quot;sgl-kernel&quot; --no-build-isolation --index-url https:\/\/download.pytorch.org\/whl\/cu128  --extra-index-url 
https:\/\/pypi.tuna.tsinghua.edu.cn\/simple &gt; temp\/pip-sgl-kernel.log.main-local-dep.txt 2&gt;&amp;1 &amp;\n###\npip show  torch\nName: torch\nVersion: 2.11.0+cu128\nSummary: Tensors and Dynamic neural networks in Python with strong GPU acceleration\nHome-page: https:\/\/pytorch.org\nAuthor:\nAuthor-email: PyTorch Team &lt;packages@pytorch.org&gt;\nLicense: BSD-3-Clause\nLocation: \/data\/like\/miniconda3\/envs\/simo_sglang\/lib\/python3.12\/site-packages\nRequires: cuda-bindings, cuda-toolkit, filelock, fsspec, jinja2, networkx, nvidia-cudnn-cu12, nvidia-cusparselt-cu12, nvidia-nccl-cu12, nvidia-nvshmem-cu12, setuptools, sympy, triton, typing-extensions\nRequired-by: accelerate, compressed-tensors, flash-attn-4, flashinfer-python, outlines, peft, quack-kernels, sglang, simo, tilelang, timm, tokenspeed-mla, torch_c_dlpack_ext, torchvision, xgrammar<\/code><\/pre>\n<p>python code<\/p>\n<pre><code>cat install-sglang.sh\nset -x\nLOG=temp\/pip-sglang-log.main-local-dep.txt\nsource install-scripts-sgl\/env-build-pip.sh\npip install --config-settings=build.verbose=true -vvv -e &quot;python&quot; --no-build-isolation  --index-url https:\/\/download.pytorch.org\/whl\/cu128  --extra-index-url https:\/\/pypi.tuna.tsinghua.edu.cn\/simple &gt; $LOG 2&gt;&amp;1 &amp;<\/code><\/pre>\n<p>\u8fd8\u6709deep_gemm\uff0csglang pyproject.toml, \u5b83\u4f9d\u8d56\u7684\u662fsgl-deep-gemm\uff0csglang\u4e0a\u4f20\u7684whl\u662fcuda 12.9 \u7f16\u8bd1\u7684\u3002\u6211\u4eec\u8981\u628a\u5b83\u5378\u8f7d\uff0c\u518d\u4f7f\u7528install.sh \u5b89\u88c5<\/p>\n","protected":false},"excerpt":{"rendered":"<p>vllm https:\/\/docs.vllm.ai\/en\/latest\/getting_started\/qui &#8230; <a title=\"llm\" class=\"read-more\" href=\"https:\/\/189505.xyz\/?p=515\" aria-label=\"More on llm\">Read 
more<\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[],"_links":{"self":[{"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/posts\/515"}],"collection":[{"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/189505.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=515"}],"version-history":[{"count":15,"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/posts\/515\/revisions"}],"predecessor-version":[{"id":690,"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/posts\/515\/revisions\/690"}],"wp:attachment":[{"href":"https:\/\/189505.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=515"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/189505.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=515"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/189505.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=515"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}