跑通Visual-RFT报错解决记录
创建环境
(base) ➜ ~ cd work
(base) ➜ work ls
1 2 envs
(base) ➜ work mkdir 3
(base) ➜ work cd 3
(base) ➜ 3 git clone https://github.com/Liuziyu77/Visual-RFT.git
Cloning into 'Visual-RFT'...
remote: Enumerating objects: 696, done.
remote: Counting objects: 100% (265/265), done.
remote: Compressing objects: 100% (106/106), done.
remote: Total 696 (delta 233), reused 159 (delta 159), pack-reused 431 (from 1)
Receiving objects: 100% (696/696), 20.40 MiB | 16.00 MiB/s, done.
Resolving deltas: 100% (350/350), done.
(base) ➜ 3 ls
Visual-RFT
(base) ➜ 3 cd Visual-RFT
(base) ➜ Visual-RFT pwd
/home/featurize/work/3/Visual-RFT
(base) ➜ Visual-RFT conda create --prefix /cloud/3/Visual-RFT/envs python=3.10
# To activate this environment, use
#
# $ conda activate /cloud/3/Visual-RFT/envs
#
# To deactivate an active environment, use
#
# $ conda deactivate
(base) ➜ Visual-RFT conda activate /cloud/3/Visual-RFT/envs
1. bash setup.sh
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT bash setup.sh
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl (16 kB)
INFO: pip is looking at multiple versions of transformers to determine which version is compatible with other requirements. This could take a while.
DEPRECATION: git+https://github.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math] contains an egg fragment with a non-PEP 508 name. pip 25.3 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/13157
ERROR: Cannot install None, open-r1 and open-r1[dev]==0.1.0.dev0 because these package versions have conflicting dependencies.The conflict is caused by:open-r1 0.1.0.dev0 depends on huggingface-hub<1.0 and >=0.19.2open-r1[dev] 0.1.0.dev0 depends on huggingface-hub<1.0 and >=0.19.2lighteval 0.6.0.dev0 depends on huggingface_hub>=0.23.0transformers 5.0.0.dev0 depends on huggingface-hub<2.0 and >=1.0.0To fix this you could try to:
1. loosen the range of package versions you've specified
2. remove package versions to allow pip to attempt to solve the dependency conflictERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting wandb==0.18.3Downloading https://pypi.tuna.tsinghua.edu.cn/packages/86/a6/11eaa16c96469b4d6fc0fb3271e70d5bbe2c3a93c15fc677de9a1aa4374a/wandb-0.18.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 136.5 MB/s 0:00:00
Collecting click!=8.0.0,>=7.1 (from wandb==0.18.3)Downloading https://pypi.tuna.tsinghua.edu.cn/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl (107 kB)
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting flash-attnDownloading https://pypi.tuna.tsinghua.edu.cn/packages/3b/b2/8d76c41ad7974ee264754709c22963447f7f8134613fd9ce80984ed0dab7/flash_attn-2.8.3.tar.gz (8.4 MB)━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 116.6 MB/s 0:00:00Preparing metadata (setup.py) ... errorerror: subprocess-exited-with-error× python setup.py egg_info did not run successfully.│ exit code: 1╰─> [19 lines of output]/tmp/pip-install-zjjsurp1/flash-attn_1fad669fb2c04511b2ec399012c08882/setup.py:106: UserWarning: flash_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.warnings.warn(Traceback (most recent call last):File "<string>", line 2, in <module>File "<pip-setuptools-caller>", line 35, in <module>File "/tmp/pip-install-zjjsurp1/flash-attn_1fad669fb2c04511b2ec399012c08882/setup.py", line 227, in <module>CUDAExtension(File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1347, in CUDAExtensionlibrary_dirs += library_paths(device_type="cuda")File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1559, in library_pathsif (not os.path.exists(_join_cuda_home(lib_dir)) andFile "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 2986, in _join_cuda_homeraise OSError('CUDA_HOME environment variable is not set. 'OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.torch.__version__ = 2.9.0+cu128[end of output]note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed× Encountered error while generating package metadata.
╰─> See above for output.note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting vllm==0.7.2Downloading https://pypi.tuna.tsinghua.edu.cn/packages/e7/c0/5b7f019aa798dedfb44c30971e9becf3c6a2db7dde311570178fa66c49c8/vllm-0.7.2-cp38-abi3-manylinux1_x86_64.whl (264.3 MB)━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 264.3/264.3 MB 162.0 MB/s 0:00:01
Requirement already satisfied: psutil in /home/featurize/work/3/Visual-RFT/envs/lib/python3.10/site-packages (from vllm==0.7.2) (7.1.3)
2. bash setup.sh
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT bash setup.sh
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/34/c5/dfa824defc1919289a00cbb722aee92cee85ce1e11f7349779235dbe4810/deepspeed-0.15.4.tar.gz (1.4 MB)━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 70.5 MB/s 0:00:00Preparing metadata (setup.py) ... errorerror: subprocess-exited-with-error× python setup.py egg_info did not run successfully.│ exit code: 1╰─> [21 lines of output][2025-11-06 02:28:33,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)df: /home/featurize/.triton/autotune: No such file or directory[2025-11-06 02:28:36,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)Traceback (most recent call last):File "<string>", line 2, in <module>File "<pip-setuptools-caller>", line 35, in <module>File "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/setup.py", line 40, in <module>from op_builder import get_default_compute_capabilities, OpBuilderFile "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/op_builder/__init__.py", line 18, in <module>import deepspeed.ops.op_builder # noqa: F401 # type: ignoreFile "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/deepspeed/__init__.py", line 25, in <module>from . 
import opsFile "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/deepspeed/ops/__init__.py", line 15, in <module>from ..git_version_info import compatible_ops as __compatible_ops__File "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/deepspeed/git_version_info.py", line 29, in <module>op_compatible = builder.is_compatible()File "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/op_builder/fp_quantizer.py", line 35, in is_compatiblesys_cuda_major, _ = installed_cuda_version()File "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/op_builder/builder.py", line 51, in installed_cuda_versionraise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)")op_builder.builder.MissingCUDAException: CUDA_HOME does not exist, unable to compile CUDA op(s)[end of output]note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed× Encountered error while generating package metadata.
╰─> See above for output.note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
3. bash setup.sh
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting flash-attnDownloading https://pypi.tuna.tsinghua.edu.cn/packages/3b/b2/8d76c41ad7974ee264754709c22963447f7f8134613fd9ce80984ed0dab7/flash_attn-2.8.3.tar.gz (8.4 MB)━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 123.6 MB/s 0:00:00Preparing metadata (setup.py) ... errorerror: subprocess-exited-with-error× python setup.py egg_info did not run successfully.│ exit code: 1╰─> [19 lines of output]/tmp/pip-install-71xox1np/flash-attn_0f615b83eff84908a86cac9373a8c34f/setup.py:106: UserWarning: flash_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.warnings.warn(Traceback (most recent call last):File "<string>", line 2, in <module>File "<pip-setuptools-caller>", line 35, in <module>File "/tmp/pip-install-71xox1np/flash-attn_0f615b83eff84908a86cac9373a8c34f/setup.py", line 227, in <module>CUDAExtension(File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1078, in CUDAExtensionlibrary_dirs += library_paths(cuda=True)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1209, in library_pathsif (not os.path.exists(_join_cuda_home(lib_dir)) andFile "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 2416, in _join_cuda_homeraise OSError('CUDA_HOME environment variable is not set. 'OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.torch.__version__ = 2.5.1+cu124[end of output]note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed× Encountered error while generating package metadata.
╰─> See above for output.note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
4. bash setup.sh
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT bash setup.sh
Preparing metadata (setup.py) ... errorerror: subprocess-exited-with-error× python setup.py egg_info did not run successfully.│ exit code: 1╰─> [21 lines of output][2025-11-06 02:28:33,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)df: /home/featurize/.triton/autotune: No such file or directory[2025-11-06 02:28:36,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)Traceback (most recent call last):File "<string>", line 2, in <module>File "<pip-setuptools-caller>", line 35, in <module>File "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/setup.py", line 40, in <module>from op_builder import get_default_compute_capabilities, OpBuilderFile "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/op_builder/__init__.py", line 18, in <module>import deepspeed.ops.op_builder # noqa: F401 # type: ignoreFile "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/deepspeed/__init__.py", line 25, in <module>from . 
import opsFile "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/deepspeed/ops/__init__.py", line 15, in <module>from ..git_version_info import compatible_ops as __compatible_ops__File "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/deepspeed/git_version_info.py", line 29, in <module>op_compatible = builder.is_compatible()File "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/op_builder/fp_quantizer.py", line 35, in is_compatiblesys_cuda_major, _ = installed_cuda_version()File "/tmp/pip-install-e7n_ets4/deepspeed_43725ae95b8d49759b811c87cd9d3f71/op_builder/builder.py", line 51, in installed_cuda_versionraise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)")op_builder.builder.MissingCUDAException: CUDA_HOME does not exist, unable to compile CUDA op(s)[end of output]note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed× Encountered error while generating package metadata.
╰─> See above for output.note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
5. bash setup.sh
Preparing metadata (setup.py) ... errorerror: subprocess-exited-with-error× python setup.py egg_info did not run successfully.│ exit code: 1╰─> [20 lines of output][2025-11-06 02:49:04,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)[2025-11-06 02:49:06,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)Traceback (most recent call last):File "<string>", line 2, in <module>File "<pip-setuptools-caller>", line 35, in <module>File "/tmp/pip-install-gd2_t87x/deepspeed_dbe6290346184f95aab25e273f474cee/setup.py", line 40, in <module>from op_builder import get_default_compute_capabilities, OpBuilderFile "/tmp/pip-install-gd2_t87x/deepspeed_dbe6290346184f95aab25e273f474cee/op_builder/__init__.py", line 18, in <module>import deepspeed.ops.op_builder # noqa: F401 # type: ignoreFile "/tmp/pip-install-gd2_t87x/deepspeed_dbe6290346184f95aab25e273f474cee/deepspeed/__init__.py", line 25, in <module>from . import opsFile "/tmp/pip-install-gd2_t87x/deepspeed_dbe6290346184f95aab25e273f474cee/deepspeed/ops/__init__.py", line 15, in <module>from ..git_version_info import compatible_ops as __compatible_ops__File "/tmp/pip-install-gd2_t87x/deepspeed_dbe6290346184f95aab25e273f474cee/deepspeed/git_version_info.py", line 29, in <module>op_compatible = builder.is_compatible()File "/tmp/pip-install-gd2_t87x/deepspeed_dbe6290346184f95aab25e273f474cee/op_builder/fp_quantizer.py", line 35, in is_compatiblesys_cuda_major, _ = installed_cuda_version()File "/tmp/pip-install-gd2_t87x/deepspeed_dbe6290346184f95aab25e273f474cee/op_builder/builder.py", line 51, in installed_cuda_versionraise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)")op_builder.builder.MissingCUDAException: CUDA_HOME does not exist, unable to compile CUDA op(s)[end of output]note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed× Encountered error while generating package metadata.
╰─> See above for output.note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting flash-attnDownloading https://pypi.tuna.tsinghua.edu.cn/packages/3b/b2/8d76c41ad7974ee264754709c22963447f7f8134613fd9ce80984ed0dab7/flash_attn-2.8.3.tar.gz (8.4 MB)━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 111.5 MB/s 0:00:00Preparing metadata (setup.py) ... errorerror: subprocess-exited-with-error× python setup.py egg_info did not run successfully.│ exit code: 1╰─> [19 lines of output]/tmp/pip-install-vnghtaia/flash-attn_70e465216127476ab25c4887c5e9f6b7/setup.py:106: UserWarning: flash_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.warnings.warn(Traceback (most recent call last):File "<string>", line 2, in <module>File "<pip-setuptools-caller>", line 35, in <module>File "/tmp/pip-install-vnghtaia/flash-attn_70e465216127476ab25c4887c5e9f6b7/setup.py", line 227, in <module>CUDAExtension(File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1078, in CUDAExtensionlibrary_dirs += library_paths(cuda=True)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1209, in library_pathsif (not os.path.exists(_join_cuda_home(lib_dir)) andFile "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 2416, in _join_cuda_homeraise OSError('CUDA_HOME environment variable is not set. 'OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.torch.__version__ = 2.5.1+cu124[end of output]note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed× Encountered error while generating package metadata.
╰─> See above for output.note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
6. bash setup.sh
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/34/c5/dfa824defc1919289a00cbb722aee92cee85ce1e11f7349779235dbe4810/deepspeed-0.15.4.tar.gz (1.4 MB)━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 70.5 MB/s 0:00:00Preparing metadata (setup.py) ... errorerror: subprocess-exited-with-error× python setup.py egg_info did not run successfully.│ exit code: 1╰─> [20 lines of output][2025-11-06 02:51:26,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)[2025-11-06 02:51:28,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)Traceback (most recent call last):File "<string>", line 2, in <module>File "<pip-setuptools-caller>", line 35, in <module>File "/tmp/pip-install-t8kn910a/deepspeed_b54f1abae8ed4f2fa6449f0a3b27eac5/setup.py", line 40, in <module>from op_builder import get_default_compute_capabilities, OpBuilderFile "/tmp/pip-install-t8kn910a/deepspeed_b54f1abae8ed4f2fa6449f0a3b27eac5/op_builder/__init__.py", line 18, in <module>import deepspeed.ops.op_builder # noqa: F401 # type: ignoreFile "/tmp/pip-install-t8kn910a/deepspeed_b54f1abae8ed4f2fa6449f0a3b27eac5/deepspeed/__init__.py", line 25, in <module>from . 
import opsFile "/tmp/pip-install-t8kn910a/deepspeed_b54f1abae8ed4f2fa6449f0a3b27eac5/deepspeed/ops/__init__.py", line 15, in <module>from ..git_version_info import compatible_ops as __compatible_ops__File "/tmp/pip-install-t8kn910a/deepspeed_b54f1abae8ed4f2fa6449f0a3b27eac5/deepspeed/git_version_info.py", line 29, in <module>op_compatible = builder.is_compatible()File "/tmp/pip-install-t8kn910a/deepspeed_b54f1abae8ed4f2fa6449f0a3b27eac5/op_builder/fp_quantizer.py", line 35, in is_compatiblesys_cuda_major, _ = installed_cuda_version()File "/tmp/pip-install-t8kn910a/deepspeed_b54f1abae8ed4f2fa6449f0a3b27eac5/op_builder/builder.py", line 51, in installed_cuda_versionraise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)")op_builder.builder.MissingCUDAException: CUDA_HOME does not exist, unable to compile CUDA op(s)[end of output]note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed× Encountered error while generating package metadata.
╰─> See above for output.note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: wandb==0.18.3 in /home/featurize/work/3/Visual-RFT/envs/lib/python3.10/site-packages (0.18.3)
Collecting flash-attnDownloading https://pypi.tuna.tsinghua.edu.cn/packages/3b/b2/8d76c41ad7974ee264754709c22963447f7f8134613fd9ce80984ed0dab7/flash_attn-2.8.3.tar.gz (8.4 MB)━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 118.4 MB/s 0:00:00Preparing metadata (setup.py) ... errorerror: subprocess-exited-with-error× python setup.py egg_info did not run successfully.│ exit code: 1╰─> [19 lines of output]/tmp/pip-install-5gpi4y2e/flash-attn_859bd58c90b34bf382e1af6b3a9d8e8a/setup.py:106: UserWarning: flash_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.warnings.warn(Traceback (most recent call last):File "<string>", line 2, in <module>File "<pip-setuptools-caller>", line 35, in <module>File "/tmp/pip-install-5gpi4y2e/flash-attn_859bd58c90b34bf382e1af6b3a9d8e8a/setup.py", line 227, in <module>CUDAExtension(File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1078, in CUDAExtensionlibrary_dirs += library_paths(cuda=True)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1209, in library_pathsif (not os.path.exists(_join_cuda_home(lib_dir)) andFile "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 2416, in _join_cuda_homeraise OSError('CUDA_HOME environment variable is not set. 'OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.torch.__version__ = 2.5.1+cu124[end of output]note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed× Encountered error while generating package metadata.
╰─> See above for output.note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
>>> import torch
>>> print("PyTorch 版本:", torch.__version__)
PyTorch 版本: 2.5.1+cu124
>>> print("CUDA 运行时版本:", torch.version.cuda)
CUDA 运行时版本: 12.4
>>> print("GPU 是否可用:", torch.cuda.is_available())
GPU 是否可用: True
>>> print("当前 GPU 名称:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "无 GPU")
当前 GPU 名称: NVIDIA GeForce RTX 3060
>>>
项目 检测结果
PyTorch 版本 2.5.1 + cu124
CUDA 运行时版本 12.4
GPU 可用性 ✅ True
GPU 型号 NVIDIA GeForce RTX 3060
这说明:你的 PyTorch 已经编译并链接到了 CUDA 12.4;驱动与 GPU 工作正常;目前唯一缺失的是:CUDA Toolkit(含 nvcc 编译器)未安装,所以无法编译 flash-attn 或 deepspeed 这类需要 C++/CUDA 扩展的包。
🚀 接下来正确安装 CUDA Toolkit 12.4
1️⃣ 执行安装命令(推荐 conda 方式):
conda install -c nvidia/label/cuda-12.4.1 cuda-toolkit
若下载较慢,可用:
conda install -c nvidia cuda-toolkit=12.4
2️⃣ 设置环境变量:
export CUDA_HOME=$(dirname $(dirname $(which nvcc)))
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/cloud/3/Visual-RFT/envs/lib:$LD_LIBRARY_PATH
3️⃣ 验证安装:
nvcc -V
输出应类似:Cuda compilation tools, release 12.4, V12.4.131
4️⃣ 重新编译依赖:
pip install flash-attn==2.8.3 --no-build-isolation
pip install deepspeed==0.15.4 --no-build-isolation
7. bash setup.sh
INFO: pip is looking at multiple versions of transformers to determine which version is compatible with other requirements. This could take a while.
DEPRECATION: git+https://github.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math] contains an egg fragment with a non-PEP 508 name. pip 25.3 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/13157
ERROR: Cannot install None, open-r1 and open-r1[dev]==0.1.0.dev0 because these package versions have conflicting dependencies.The conflict is caused by:open-r1 0.1.0.dev0 depends on huggingface-hub<1.0 and >=0.19.2open-r1[dev] 0.1.0.dev0 depends on huggingface-hub<1.0 and >=0.19.2lighteval 0.6.0.dev0 depends on huggingface_hub>=0.23.0transformers 5.0.0.dev0 depends on huggingface-hub<2.0 and >=1.0.0To fix this you could try to:
1. loosen the range of package versions you've specified
2. remove package versions to allow pip to attempt to solve the dependency conflictERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: wandb==0.18.3 in /home/featurize/work/3/Visual-RFT/envs/lib/python3.10/site-packages (0.18.3)
pip install huggingface-hub==0.23.0
pip install git+https://github.com/huggingface/transformers.git@336dc69d63d56f232a183a3e7f52790429b871ef
pip install lighteval
8. bash setup.sh
9.
✅ 1️⃣ 模型路径写法错误
你现在的:
export CKPT_PATH=datasets/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
这里的 datasets/home/... 路径是不对的。
应改成:
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
也就是去掉前面的 datasets/,并确保路径存在(你可以用 ls /home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/ 验证)。
✅ 2️⃣ DeepSpeed 配置与 Trainer 参数不匹配
你之前遇到过:
ValueError: ds gradient_accumulation_steps=1 vs hf gradient_accumulation_steps=2
说明 zero3.json 内部与命令行参数不一致。解决方案:
在 local_scripts/zero3.json 中找到:
"gradient_accumulation_steps": 1,
"train_batch_size": 2
改成:
"gradient_accumulation_steps": "auto",
"train_batch_size": "auto"
DeepSpeed 会自动与命令行保持一致(这也是官方推荐做法)。
✅ 3️⃣ 适配单卡环境的建议优化
你可以继续保留这些参数,但建议做如下微调:
--report_to none               # 不上传 wandb,除非你已经登录了 wandb
--gradient_checkpointing true # 开启梯度检查点以节省显存
--num_generations 4            # 保持 4 更稳(减少显存)
✅ 最终推荐命令(单 GPU 稳定版)
export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"
export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6k

torchrun --nproc_per_node=1 \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --master_port=12345 \
  src/virft/src/open_r1/grpo.py \
  --output_dir ${SAVE_PATH} \
  --model_name_or_path ${CKPT_PATH} \
  --dataset_name ${DATA_PATH} \
  --deepspeed local_scripts/zero3.json \
  --max_prompt_length 1024 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 2 \
  --logging_steps 1 \
  --bf16 \
  --report_to none \
  --gradient_checkpointing true \
  --attn_implementation flash_attention_2 \
  --max_pixels 401408 \
  --num_train_epochs 1 \
  --run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \
  --save_steps 100 \
  --save_only_model true \
  --num_generations 4
Traceback (most recent call last):File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 24, in <module>from math_verify import parse, verify
ModuleNotFoundError: No module named 'math_verify'
pip install math_verify
Traceback (most recent call last):File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 26, in <module>from open_r1.trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainer
ModuleNotFoundError: No module named 'open_r1'
pip install open_r1
```python
export PYTHONPATH=$PYTHONPATH:/home/featurize/work/3/Visual-RFT/src/virft/src
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT pip install open_r1
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
ERROR: Could not find a version that satisfies the requirement open_r1 (from versions: none)
ERROR: No matching distribution found for open_r1
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT
```python
/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py
import os
import re
from datetime import datetime
from dataclasses import dataclass, field
from typing import Optional
from datasets import load_dataset, load_from_disk
from transformers import Qwen2VLForConditionalGeneration
from math_verify import parse, verify
# from open_r1.trainer import Qwen2VLGRPOTrainer
#将open_r1.trainer改为src.open_r1.trainer
from src.open_r1.trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainer
from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
import json
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"
export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6ktorchrun --nproc_per_node=1 \--nnodes=1 \--node_rank=0 \--master_addr=127.0.0.1 \--master_port=12345 \src/virft/src/open_r1/grpo.py \--output_dir ${SAVE_PATH} \--model_name_or_path ${CKPT_PATH} \--dataset_name ${DATA_PATH} \--deepspeed local_scripts/zero3.json \--max_prompt_length 1024 \--per_device_train_batch_size 1 \--gradient_accumulation_steps 2 \--logging_steps 1 \--bf16 \--report_to none \--gradient_checkpointing true \--attn_implementation flash_attention_2 \--max_pixels 401408 \--num_train_epochs 1 \--run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \--save_steps 100 \--save_only_model true \--num_generations 4
Traceback (most recent call last):File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 26, in <module>from open_r1.trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerFile "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/trainer/__init__.py", line 1, in <module>from .grpo_trainer import Qwen2VLGRPOTrainerFile "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/trainer/grpo_trainer.py", line 44, in <module>from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template
ModuleNotFoundError: No module named 'trl'
E1106 05:08:05.675000 14957 envs/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 0 (pid: 14983) of binary: /cloud/3/Visual-RFT/envs/bin/python3.10
Traceback (most recent call last):File "/cloud/3/Visual-RFT/envs/bin/torchrun", line 7, in <module>sys.exit(main())File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapperreturn f(*args, **kwargs)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in mainrun(args)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in runelastic_launch(File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__return launch_agent(self._config, self._entrypoint, list(args))File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agentraise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
src/virft/src/open_r1/grpo.py FAILED
------------------------------------------------------------
Failures:<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:time : 2025-11-06_05:08:05host : localhostrank : 0 (local_rank: 0)exitcode : 1 (pid: 14983)error_file: <N/A>traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT pip install trl==0.9.6
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting trl==0.9.6Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a5/c3/6565c2c376a829f99da20d39c2912405195ec1fa6aae068dc45c46793e72/trl-0.9.6-py3-none-any.whl (245 kB)
Requirement already satisfied: torch>=1.4.0 in ./envs/lib/python3.10/site-packages (from trl==0.9.6) (2.5.1)
Requirement already satisfied: transformers>=4.31.0 in ./envs/lib/python3.10/site-packages (from trl==0.9.6) (4.49.0.dev0)
Requirement already satisfied: numpy<2.0.0,>=1.18.2 in ./envs/lib/python3.10/site-packages (from trl==0.9.6) (1.26.4)
Requirement already satisfied: accelerate in ./envs/lib/python3.10/site-packages (from trl==0.9.6) (1.11.0)
Requirement already satisfied: datasets in ./envs/lib/python3.10/site-packages (from trl==0.9.6) (4.4.0)
Collecting tyro>=0.5.11 (from trl==0.9.6)Downloading https://pypi.tuna.tsinghua.edu.cn/packages/bb/bc/943b35b1a139a602a730e56592a3639bf796a87c91eebcd40cc51043ee5b/tyro-0.9.35-py3-none-any.whl (132 kB)
Requirement already satisfied: filelock in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (3.20.0)
Requirement already satisfied: typing-extensions>=4.8.0 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (4.15.0)
Requirement already satisfied: networkx in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (3.4.2)
Requirement already satisfied: jinja2 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (3.1.6)
Requirement already satisfied: fsspec in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (2025.9.0)
Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (12.4.127)
Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (12.4.127)
Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (12.4.127)
Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (9.1.0.70)
Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (12.4.5.8)
Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (11.2.1.3)
Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (10.3.5.147)
Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (11.6.1.9)
Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (12.3.1.170)
Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (2.21.5)
Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (12.4.127)
Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (12.4.127)
Requirement already satisfied: triton==3.1.0 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (3.1.0)
Requirement already satisfied: sympy==1.13.1 in ./envs/lib/python3.10/site-packages (from torch>=1.4.0->trl==0.9.6) (1.13.1)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in ./envs/lib/python3.10/site-packages (from sympy==1.13.1->torch>=1.4.0->trl==0.9.6) (1.3.0)
Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in ./envs/lib/python3.10/site-packages (from transformers>=4.31.0->trl==0.9.6) (0.36.0)
Requirement already satisfied: packaging>=20.0 in ./envs/lib/python3.10/site-packages (from transformers>=4.31.0->trl==0.9.6) (25.0)
Requirement already satisfied: pyyaml>=5.1 in ./envs/lib/python3.10/site-packages (from transformers>=4.31.0->trl==0.9.6) (6.0.3)
Requirement already satisfied: regex!=2019.12.17 in ./envs/lib/python3.10/site-packages (from transformers>=4.31.0->trl==0.9.6) (2025.11.3)
Requirement already satisfied: requests in ./envs/lib/python3.10/site-packages (from transformers>=4.31.0->trl==0.9.6) (2.32.5)
Requirement already satisfied: tokenizers<0.22,>=0.21 in ./envs/lib/python3.10/site-packages (from transformers>=4.31.0->trl==0.9.6) (0.21.4)
Requirement already satisfied: safetensors>=0.4.1 in ./envs/lib/python3.10/site-packages (from transformers>=4.31.0->trl==0.9.6) (0.6.2)
Requirement already satisfied: tqdm>=4.27 in ./envs/lib/python3.10/site-packages (from transformers>=4.31.0->trl==0.9.6) (4.67.1)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in ./envs/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers>=4.31.0->trl==0.9.6) (1.2.0)
Requirement already satisfied: docstring-parser>=0.15 in ./envs/lib/python3.10/site-packages (from tyro>=0.5.11->trl==0.9.6) (0.17.0)
Requirement already satisfied: rich>=11.1.0 in ./envs/lib/python3.10/site-packages (from tyro>=0.5.11->trl==0.9.6) (14.2.0)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl==0.9.6)Downloading https://pypi.tuna.tsinghua.edu.cn/packages/74/03/3271b7bb470fbab4adf5bd30b0d32143909d96f3608d815b447357f47f2b/shtab-1.7.2-py3-none-any.whl (14 kB)
Collecting typeguard>=4.0.0 (from tyro>=0.5.11->trl==0.9.6)Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1b/a9/e3aee762739c1d7528da1c3e06d518503f8b6c439c35549b53735ba52ead/typeguard-4.4.4-py3-none-any.whl (34 kB)
Requirement already satisfied: markdown-it-py>=2.2.0 in ./envs/lib/python3.10/site-packages (from rich>=11.1.0->tyro>=0.5.11->trl==0.9.6) (4.0.0)
Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./envs/lib/python3.10/site-packages (from rich>=11.1.0->tyro>=0.5.11->trl==0.9.6) (2.19.2)
Requirement already satisfied: mdurl~=0.1 in ./envs/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=11.1.0->tyro>=0.5.11->trl==0.9.6) (0.1.2)
Requirement already satisfied: psutil in ./envs/lib/python3.10/site-packages (from accelerate->trl==0.9.6) (7.1.3)
Requirement already satisfied: pyarrow>=21.0.0 in ./envs/lib/python3.10/site-packages (from datasets->trl==0.9.6) (22.0.0)
Requirement already satisfied: dill<0.4.1,>=0.3.0 in ./envs/lib/python3.10/site-packages (from datasets->trl==0.9.6) (0.4.0)
Requirement already satisfied: pandas in ./envs/lib/python3.10/site-packages (from datasets->trl==0.9.6) (2.3.3)
Requirement already satisfied: httpx<1.0.0 in ./envs/lib/python3.10/site-packages (from datasets->trl==0.9.6) (0.28.1)
Requirement already satisfied: xxhash in ./envs/lib/python3.10/site-packages (from datasets->trl==0.9.6) (3.6.0)
Requirement already satisfied: multiprocess<0.70.19 in ./envs/lib/python3.10/site-packages (from datasets->trl==0.9.6) (0.70.18)
Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in ./envs/lib/python3.10/site-packages (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (3.13.2)
Requirement already satisfied: anyio in ./envs/lib/python3.10/site-packages (from httpx<1.0.0->datasets->trl==0.9.6) (4.11.0)
Requirement already satisfied: certifi in ./envs/lib/python3.10/site-packages (from httpx<1.0.0->datasets->trl==0.9.6) (2025.10.5)
Requirement already satisfied: httpcore==1.* in ./envs/lib/python3.10/site-packages (from httpx<1.0.0->datasets->trl==0.9.6) (1.0.9)
Requirement already satisfied: idna in ./envs/lib/python3.10/site-packages (from httpx<1.0.0->datasets->trl==0.9.6) (3.11)
Requirement already satisfied: h11>=0.16 in ./envs/lib/python3.10/site-packages (from httpcore==1.*->httpx<1.0.0->datasets->trl==0.9.6) (0.16.0)
Requirement already satisfied: aiohappyeyeballs>=2.5.0 in ./envs/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (2.6.1)
Requirement already satisfied: aiosignal>=1.4.0 in ./envs/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (1.4.0)
Requirement already satisfied: async-timeout<6.0,>=4.0 in ./envs/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (5.0.1)
Requirement already satisfied: attrs>=17.3.0 in ./envs/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (25.4.0)
Requirement already satisfied: frozenlist>=1.1.1 in ./envs/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (1.8.0)
Requirement already satisfied: multidict<7.0,>=4.5 in ./envs/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (6.7.0)
Requirement already satisfied: propcache>=0.2.0 in ./envs/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (0.4.1)
Requirement already satisfied: yarl<2.0,>=1.17.0 in ./envs/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets->trl==0.9.6) (1.22.0)
Requirement already satisfied: charset_normalizer<4,>=2 in ./envs/lib/python3.10/site-packages (from requests->transformers>=4.31.0->trl==0.9.6) (3.4.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in ./envs/lib/python3.10/site-packages (from requests->transformers>=4.31.0->trl==0.9.6) (2.5.0)
Requirement already satisfied: exceptiongroup>=1.0.2 in ./envs/lib/python3.10/site-packages (from anyio->httpx<1.0.0->datasets->trl==0.9.6) (1.3.0)
Requirement already satisfied: sniffio>=1.1 in ./envs/lib/python3.10/site-packages (from anyio->httpx<1.0.0->datasets->trl==0.9.6) (1.3.1)
Requirement already satisfied: MarkupSafe>=2.0 in ./envs/lib/python3.10/site-packages (from jinja2->torch>=1.4.0->trl==0.9.6) (3.0.3)
Requirement already satisfied: python-dateutil>=2.8.2 in ./envs/lib/python3.10/site-packages (from pandas->datasets->trl==0.9.6) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in ./envs/lib/python3.10/site-packages (from pandas->datasets->trl==0.9.6) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in ./envs/lib/python3.10/site-packages (from pandas->datasets->trl==0.9.6) (2025.2)
Requirement already satisfied: six>=1.5 in ./envs/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets->trl==0.9.6) (1.17.0)
Installing collected packages: typeguard, shtab, tyro, trl
Successfully installed shtab-1.7.2 trl-0.9.6 typeguard-4.4.4 tyro-0.9.35
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6ktorchrun --nproc_per_node=1 \--nnodes=1 \--node_rank=0 \--master_addr=127.0.0.1 \--master_port=12345 \src/virft/src/open_r1/grpo.py \--output_dir ${SAVE_PATH} \--model_name_or_path ${CKPT_PATH} \--dataset_name ${DATA_PATH} \--deepspeed local_scripts/zero3.json \--max_prompt_length 1024 \--per_device_train_batch_size 1 \--gradient_accumulation_steps 2 \--logging_steps 1 \--bf16 \--report_to none \--gradient_checkpointing true \--attn_implementation flash_attention_2 \--max_pixels 401408 \--num_train_epochs 1 \--run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \--save_steps 100 \--save_only_model true \--num_generations 4
Traceback (most recent call last):File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 26, in <module>from open_r1.trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerFile "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/trainer/__init__.py", line 1, in <module>from .grpo_trainer import Qwen2VLGRPOTrainerFile "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/trainer/grpo_trainer.py", line 44, in <module>from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template
ModuleNotFoundError: No module named 'trl.data_utils'
E1106 05:12:32.712000 15927 envs/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 0 (pid: 15946) of binary: /cloud/3/Visual-RFT/envs/bin/python3.10
Traceback (most recent call last):File "/cloud/3/Visual-RFT/envs/bin/torchrun", line 7, in <module>sys.exit(main())File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapperreturn f(*args, **kwargs)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in mainrun(args)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in runelastic_launch(File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__return launch_agent(self._config, self._entrypoint, list(args))File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agentraise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
src/virft/src/open_r1/grpo.py FAILED
------------------------------------------------------------
Failures:<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:time : 2025-11-06_05:12:32host : localhostrank : 0 (local_rank: 0)exitcode : 1 (pid: 15946)error_file: <N/A>traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6ktorchrun --nproc_per_node=1 \--nnodes=1 \--node_rank=0 \--master_addr=127.0.0.1 \--master_port=12345 \src/virft/src/open_r1/grpo.py \--output_dir ${SAVE_PATH} \--model_name_or_path ${CKPT_PATH} \--dataset_name ${DATA_PATH} \--deepspeed local_scripts/zero3.json \--max_prompt_length 1024 \--per_device_train_batch_size 1 \--gradient_accumulation_steps 2 \--logging_steps 1 \--bf16 \--report_to none \--gradient_checkpointing true \--attn_implementation flash_attention_2 \--max_pixels 401408 \--num_train_epochs 1 \--run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \--save_steps 100 \--save_only_model true \--num_generations 4
/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/trl/import_utils.py:91: UserWarning: TRL currently only supports vLLM version `0.10.2`. You have version 0.7.2 installed. We recommend to install this version to avoid compatibility issues.warnings.warn(
INFO 11-06 05:14:47 __init__.py:190] Automatically detected platform cuda.
/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/trl/import_utils.py:91: UserWarning: TRL currently only supports vLLM version `0.10.2`. You have version 0.7.2 installed. We recommend to install this version to avoid compatibility issues.warnings.warn(
usage: grpo.py [-h] [--dataset_name DATASET_NAME] [--dataset_config DATASET_CONFIG] [--dataset_train_split DATASET_TRAIN_SPLIT][--dataset_test_split DATASET_TEST_SPLIT] [--dataset_streaming [DATASET_STREAMING]][--gradient_checkpointing_use_reentrant [GRADIENT_CHECKPOINTING_USE_REENTRANT]] [--ignore_bias_buffers [IGNORE_BIAS_BUFFERS]][--reward_funcs REWARD_FUNCS [REWARD_FUNCS ...]] [--max_pixels MAX_PIXELS] [--min_pixels MIN_PIXELS] [--output_dir OUTPUT_DIR][--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] [--do_predict [DO_PREDICT]][--eval_strategy {no,steps,epoch}] [--prediction_loss_only [PREDICTION_LOSS_ONLY]][--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE][--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE][--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS][--eval_delay EVAL_DELAY] [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS] [--learning_rate LEARNING_RATE][--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON][--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS] [--max_steps MAX_STEPS][--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,cosine_warmup_with_min_lr,warmup_stable_decay}][--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS][--log_level {detail,debug,info,warning,error,critical,passive}][--log_level_replica {detail,debug,info,warning,error,critical,passive}] [--log_on_each_node [LOG_ON_EACH_NODE]][--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}][--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter 
[LOGGING_NAN_INF_FILTER]][--no_logging_nan_inf_filter] [--save_strategy {no,steps,epoch,best}] [--save_steps SAVE_STEPS][--save_total_limit SAVE_TOTAL_LIMIT] [--save_safetensors [SAVE_SAFETENSORS]] [--no_save_safetensors][--save_on_each_node [SAVE_ON_EACH_NODE]] [--save_only_model [SAVE_ONLY_MODEL]][--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]] [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]][--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] [--bf16 BF16][--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL] [--half_precision_backend {auto,apex,cpu_amp}][--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] [--local_rank LOCAL_RANK][--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl,mccl}] [--tpu_num_cores TPU_NUM_CORES] [--tpu_metrics_debug [TPU_METRICS_DEBUG]][--debug DEBUG [DEBUG ...]] [--dataloader_drop_last [DATALOADER_DROP_LAST]] [--eval_steps EVAL_STEPS][--dataloader_num_workers DATALOADER_NUM_WORKERS] [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR][--past_index PAST_INDEX] [--run_name RUN_NAME] [--disable_tqdm DISABLE_TQDM] [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]][--label_names LABEL_NAMES [LABEL_NAMES ...]] [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]][--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER] [--ignore_data_skip [IGNORE_DATA_SKIP]][--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG][--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--accelerator_config ACCELERATOR_CONFIG][--parallelism_config PARALLELISM_CONFIG] [--deepspeed DEEPSPEED] [--label_smoothing_factor LABEL_SMOOTHING_FACTOR][--optim 
{adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,adamw_torch_4bit,adamw_torch_8bit,ademamix,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,ademamix_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_ademamix_32bit,paged_ademamix_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo,grokadamw,schedule_free_radam,schedule_free_adamw,schedule_free_sgd,apollo_adamw,apollo_adamw_layerwise,stable_adamw}][--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]][--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO] [--project PROJECT] [--trackio_space_id TRACKIO_SPACE_ID][--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB][--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] [--no_dataloader_pin_memory][--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] [--skip_memory_metrics [SKIP_MEMORY_METRICS]][--no_skip_memory_metrics] [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] [--push_to_hub [PUSH_TO_HUB]][--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID][--hub_strategy {end,every_save,checkpoint,all_checkpoints}] [--hub_token HUB_TOKEN] [--hub_private_repo HUB_PRIVATE_REPO][--hub_always_push [HUB_ALWAYS_PUSH]] [--hub_revision HUB_REVISION] [--gradient_checkpointing [GRADIENT_CHECKPOINTING]][--no_gradient_checkpointing] [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS][--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] [--include_for_metrics INCLUDE_FOR_METRICS [INCLUDE_FOR_METRICS ...]][--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]] [--no_eval_do_concat_batches] [--fp16_backend {auto,apex,cpu_amp}][--push_to_hub_model_id 
PUSH_TO_HUB_MODEL_ID] [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION][--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS] [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]][--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT][--torch_compile [TORCH_COMPILE]] [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE][--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]][--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] [--optim_target_modules OPTIM_TARGET_MODULES][--batch_eval_metrics [BATCH_EVAL_METRICS]] [--eval_on_start [EVAL_ON_START]] [--use_liger_kernel [USE_LIGER_KERNEL]][--liger_kernel_config LIGER_KERNEL_CONFIG] [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]][--average_tokens_across_devices [AVERAGE_TOKENS_ACROSS_DEVICES]] [--no_average_tokens_across_devices][--model_init_kwargs MODEL_INIT_KWARGS] [--disable_dropout [DISABLE_DROPOUT]] [--cast_lm_head_to_fp32 [CAST_LM_HEAD_TO_FP32]][--max_prompt_length MAX_PROMPT_LENGTH] [--num_generations NUM_GENERATIONS] [--max_completion_length MAX_COMPLETION_LENGTH][--ds3_gather_for_generation [DS3_GATHER_FOR_GENERATION]] [--no_ds3_gather_for_generation] [--shuffle_dataset [SHUFFLE_DATASET]][--no_shuffle_dataset] [--generation_batch_size GENERATION_BATCH_SIZE] [--steps_per_generation STEPS_PER_GENERATION][--temperature TEMPERATURE] [--top_p TOP_P] [--top_k TOP_K] [--min_p MIN_P] [--generation_kwargs GENERATION_KWARGS][--chat_template_kwargs CHAT_TEMPLATE_KWARGS] [--repetition_penalty REPETITION_PENALTY][--use_transformers_paged [USE_TRANSFORMERS_PAGED]] [--cache_implementation CACHE_IMPLEMENTATION] [--use_vllm [USE_VLLM]][--vllm_mode VLLM_MODE] [--vllm_model_impl VLLM_MODEL_IMPL] [--vllm_enable_sleep_mode [VLLM_ENABLE_SLEEP_MODE]][--vllm_guided_decoding_regex VLLM_GUIDED_DECODING_REGEX] [--vllm_server_base_url 
VLLM_SERVER_BASE_URL][--vllm_server_host VLLM_SERVER_HOST] [--vllm_server_port VLLM_SERVER_PORT] [--vllm_server_timeout VLLM_SERVER_TIMEOUT][--vllm_gpu_memory_utilization VLLM_GPU_MEMORY_UTILIZATION] [--vllm_tensor_parallel_size VLLM_TENSOR_PARALLEL_SIZE] [--beta BETA][--num_iterations NUM_ITERATIONS] [--epsilon EPSILON] [--delta DELTA] [--epsilon_high EPSILON_HIGH][--importance_sampling_level IMPORTANCE_SAMPLING_LEVEL] [--reward_weights REWARD_WEIGHTS [REWARD_WEIGHTS ...]][--scale_rewards SCALE_REWARDS] [--loss_type LOSS_TYPE] [--mask_truncated_completions [MASK_TRUNCATED_COMPLETIONS]][--sync_ref_model [SYNC_REF_MODEL]] [--ref_model_mixup_alpha REF_MODEL_MIXUP_ALPHA] [--ref_model_sync_steps REF_MODEL_SYNC_STEPS][--top_entropy_quantile TOP_ENTROPY_QUANTILE] [--use_liger_loss [USE_LIGER_LOSS]][--vllm_importance_sampling_correction [VLLM_IMPORTANCE_SAMPLING_CORRECTION]] [--no_vllm_importance_sampling_correction][--vllm_importance_sampling_cap VLLM_IMPORTANCE_SAMPLING_CAP] [--log_completions [LOG_COMPLETIONS]][--num_completions_to_print NUM_COMPLETIONS_TO_PRINT] [--wandb_log_unique_prompts [WANDB_LOG_UNIQUE_PROMPTS]][--model_name_or_path MODEL_NAME_OR_PATH] [--model_revision MODEL_REVISION] [--dtype {auto,bfloat16,float16,float32}][--trust_remote_code [TRUST_REMOTE_CODE]] [--attn_implementation ATTN_IMPLEMENTATION] [--use_peft [USE_PEFT]] [--lora_r LORA_R][--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] [--lora_target_modules LORA_TARGET_MODULES [LORA_TARGET_MODULES ...]][--lora_target_parameters LORA_TARGET_PARAMETERS [LORA_TARGET_PARAMETERS ...]][--lora_modules_to_save LORA_MODULES_TO_SAVE [LORA_MODULES_TO_SAVE ...]] [--lora_task_type LORA_TASK_TYPE][--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]] [--load_in_8bit [LOAD_IN_8BIT]] [--load_in_4bit [LOAD_IN_4BIT]][--bnb_4bit_quant_type {fp4,nf4}] [--use_bnb_nested_quant [USE_BNB_NESTED_QUANT]] [--torch_dtype {auto,bfloat16,float16,float32}]
grpo.py: error: argument --bf16: expected one argument
E1106 05:14:51.906000 16452 envs/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 2) local_rank: 0 (pid: 16465) of binary: /cloud/3/Visual-RFT/envs/bin/python3.10
Traceback (most recent call last):File "/cloud/3/Visual-RFT/envs/bin/torchrun", line 7, in <module>sys.exit(main())File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapperreturn f(*args, **kwargs)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in mainrun(args)File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in runelastic_launch(File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__return launch_agent(self._config, self._entrypoint, list(args))File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agentraise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
src/virft/src/open_r1/grpo.py FAILED
------------------------------------------------------------
Failures:<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:time : 2025-11-06_05:14:51host : localhostrank : 0 (local_rank: 0)exitcode : 2 (pid: 16465)error_file: <N/A>traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
(/cloud/3/Visual-RFT/envs) ➜ Visual-RFT
报错核心:
grpo.py: error: argument --bf16: expected one argument

这说明 --bf16 参数在这个版本的 transformers / Trainer 中需要指定布尔值（例如 --bf16 true），而不是一个空参数的 flag。

✅ 修正命令：把以下一行
--bf16 \
改为：
--bf16 true \

✅ 修正后完整命令（适配单 GPU 环境）：
export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"
export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6ktorchrun --nproc_per_node=1 \--nnodes=1 \--node_rank=0 \--master_addr=127.0.0.1 \--master_port=12345 \src/virft/src/open_r1/grpo.py \--output_dir ${SAVE_PATH} \--model_name_or_path ${CKPT_PATH} \--dataset_name ${DATA_PATH} \--deepspeed local_scripts/zero3.json \--max_prompt_length 1024 \--per_device_train_batch_size 1 \--gradient_accumulation_steps 2 \--logging_steps 1 \--bf16 true \--report_to none \--gradient_checkpointing true \--attn_implementation flash_attention_2 \--max_pixels 401408 \--num_train_epochs 1 \--run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \--save_steps 100 \--save_only_model true \--num_generations 4
[2025-11-06 05:19:02,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-11-06 05:19:05,153] [INFO] [comm.py:652:init_distributed] cdb=None
[2025-11-06 05:19:05,154] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[rank0]: Traceback (most recent call last):
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 148, in __init__
[rank0]: config = json.loads(config_file_or_dict)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/json/__init__.py", line 346, in loads
[rank0]: return _default_decoder.decode(s)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/json/decoder.py", line 337, in decode
[rank0]: obj, end = self.raw_decode(s, idx=_w(s, 0).end())
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/json/decoder.py", line 355, in raw_decode
[rank0]: raise JSONDecodeError("Expecting value", s, err.value) from None
[rank0]: json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)[rank0]: During handling of the above exception, another exception occurred:[rank0]: Traceback (most recent call last):
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 151, in __init__
[rank0]: config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8")
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/base64.py", line 133, in urlsafe_b64decode
[rank0]: return b64decode(s)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/base64.py", line 87, in b64decode
[rank0]: return binascii.a2b_base64(s)
[rank0]: binascii.Error: Incorrect padding[rank0]: During handling of the above exception, another exception occurred:[rank0]: Traceback (most recent call last):
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 450, in <module>
[rank0]: script_args, training_args, model_args = parser.parse_args_and_config()
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/trl/scripts/utils.py", line 353, in parse_args_and_config
[rank0]: output = self.parse_args_into_dataclasses(args=args, return_remaining_strings=return_remaining_strings)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/hf_argparser.py", line 345, in parse_args_into_dataclasses
[rank0]: obj = dtype(**inputs)
[rank0]: File "<string>", line 185, in __init__
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/trl/trainer/grpo_config.py", line 672, in __post_init__
[rank0]: super().__post_init__()
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/training_args.py", line 2074, in __post_init__
[rank0]: self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/integrations/deepspeed.py", line 89, in __init__
[rank0]: super().__init__(config_file_or_dict)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/integrations/deepspeed.py", line 79, in __init__
[rank0]: super().__init__(config_file_or_dict)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 154, in __init__
[rank0]: raise ValueError(
[rank0]: ValueError: Expected a string path to an existing deepspeed config, or a dictionary, or a base64 encoded string. Received: local_scripts/zero3.json
将 --deepspeed 的参数改为从仓库根目录出发的正确相对路径(配置文件实际位于 /home/featurize/work/3/Visual-RFT/src/virft/local_scripts/zero3.json),即:--deepspeed src/virft/local_scripts/zero3.json
# Second attempt: deepspeed config path corrected to src/virft/local_scripts/.
# The original paste had fused lines (LOG_PATH/DATA_PATH merged, SAVE_PATH
# merged with "torchrun"); split so the script is actually runnable.
export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"
export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6k

# NOTE(review): with per_device_train_batch_size=1 and one GPU, the effective
# generation batch is smaller than num_generations=4, producing the
# "generation_batch_size (2) must be divisible by num_generations (4)" error
# shown below; the next command adds --generation_batch_size explicitly.
torchrun --nproc_per_node=1 \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --master_port=12345 \
  src/virft/src/open_r1/grpo.py \
  --output_dir "${SAVE_PATH}" \
  --model_name_or_path "${CKPT_PATH}" \
  --dataset_name "${DATA_PATH}" \
  --deepspeed src/virft/local_scripts/zero3.json \
  --max_prompt_length 1024 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 2 \
  --logging_steps 1 \
  --bf16 true \
  --report_to none \
  --gradient_checkpointing true \
  --attn_implementation flash_attention_2 \
  --max_pixels 401408 \
  --num_train_epochs 1 \
  --run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \
  --save_steps 100 \
  --save_only_model true \
  --num_generations 4
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 450, in <module>
[rank0]: script_args, training_args, model_args = parser.parse_args_and_config()
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/trl/scripts/utils.py", line 353, in parse_args_and_config
[rank0]: output = self.parse_args_into_dataclasses(args=args, return_remaining_strings=return_remaining_strings)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/hf_argparser.py", line 345, in parse_args_into_dataclasses
[rank0]: obj = dtype(**inputs)
[rank0]: File "<string>", line 185, in __init__
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/trl/trainer/grpo_config.py", line 709, in __post_init__
[rank0]: raise ValueError(
[rank0]: ValueError: generation_batch_size (2) must be divisible by num_generations (4).
```bash
# Third attempt: adds --generation_batch_size 4 so it is divisible by
# --num_generations 4. The original paste had fused lines (exports merged,
# SAVE_PATH merged with "torchrun"); split so the script is runnable.
export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"
export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6k

torchrun --nproc_per_node=1 \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --master_port=12345 \
  src/virft/src/open_r1/grpo.py \
  --output_dir "${SAVE_PATH}" \
  --model_name_or_path "${CKPT_PATH}" \
  --dataset_name "${DATA_PATH}" \
  --deepspeed src/virft/local_scripts/zero3.json \
  --max_prompt_length 1024 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 2 \
  --logging_steps 1 \
  --bf16 true \
  --report_to none \
  --gradient_checkpointing true \
  --attn_implementation flash_attention_2 \
  --max_pixels 401408 \
  --num_train_epochs 1 \
  --run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \
  --save_steps 100 \
  --save_only_model true \
  --generation_batch_size 4 \
  --num_generations 4
--generation_batch_size 必须能被 --num_generations 整除,例如 --generation_batch_size 8、--num_generations 4:8/4=2,8%4=0
--generation_batch_size 4
--num_generations 4
```text
[2025-11-06 05:56:48,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-11-06 05:56:50,382] [INFO] [comm.py:652:init_distributed] cdb=None
[2025-11-06 05:56:50,382] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 451, in <module>
[rank0]: main(script_args, training_args, model_args)
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 385, in main
[rank0]: dataset = DatasetDict.load_from_disk(script_args.dataset_name)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/datasets/dataset_dict.py", line 1411, in load_from_disk
[rank0]: raise FileNotFoundError(
[rank0]: FileNotFoundError: No such file: '/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65/dataset_dict.json'. Expected to load a `DatasetDict` object, but provided path is not a `DatasetDict`.
/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py
def main(script_args, training_args, model_args):# Get reward functionsscript_args.reward_funcs = ['accuracy_iou','accuracy_confidence','format']reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]# Load the dataset from huggingface# dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)# Load the dataset from local diskfrom datasets import load_dataset# dataset = load_dataset("json", data_files=f"{script_args.dataset_name}/data/*.json")dataset = load_dataset(script_args.dataset_name)# from datasets import DatasetDict# dataset = DatasetDict.load_from_disk(script_args.dataset_name)
# Fourth attempt (after switching grpo.py to load_dataset for the local data).
# NOTE(review): CKPT_PATH here still points at /home/featurize/280401ca-... —
# that directory does not exist, so from_pretrained falls back to treating the
# string as a Hub repo id and raises the HFValidationError shown below. The
# fused export/torchrun lines from the original paste are split here so the
# script is runnable.
export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"
export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6k

torchrun --nproc_per_node=1 \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --master_port=12345 \
  src/virft/src/open_r1/grpo.py \
  --output_dir "${SAVE_PATH}" \
  --model_name_or_path "${CKPT_PATH}" \
  --dataset_name "${DATA_PATH}" \
  --deepspeed src/virft/local_scripts/zero3.json \
  --max_prompt_length 1024 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 2 \
  --logging_steps 1 \
  --bf16 true \
  --report_to none \
  --gradient_checkpointing true \
  --attn_implementation flash_attention_2 \
  --max_pixels 401408 \
  --num_train_epochs 1 \
  --run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \
  --save_steps 100 \
  --save_only_model true \
  --generation_batch_size 4 \
  --num_generations 4
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 454, in <module>
[rank0]: main(script_args, training_args, model_args)
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 430, in main
[rank0]: trainer = trainer_cls(
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/trainer/grpo_trainer.py", line 191, in __init__
[rank0]: model = Qwen2VLForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/modeling_utils.py", line 277, in _wrapper
[rank0]: return func(*args, **kwargs)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4734, in from_pretrained
[rank0]: resolved_config_file = cached_file(
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 322, in cached_file
[rank0]: file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 531, in cached_files
[rank0]: resolved_files = [
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 532, in <listcomp>
[rank0]: _get_cache_file_to_return(path_or_repo_id, filename, cache_dir, revision, repo_type)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 143, in _get_cache_file_to_return
[rank0]: resolved_file = try_to_load_from_cache(
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn
[rank0]: validate_repo_id(arg_value)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id
[rank0]: raise HFValidationError(
[rank0]: huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct'. Use `repo_type` argument if needed.
using: <class 'open_r1.trainer.grpo_trainer.Qwen2VLGRPOTrainer'>
[rank0]: Traceback (most recent call last):
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 479, in cached_files
[rank0]: hf_hub_download(
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn
[rank0]: validate_repo_id(arg_value)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id
[rank0]: raise HFValidationError(
[rank0]: huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct'. Use `repo_type` argument if needed.[rank0]: During handling of the above exception, another exception occurred:[rank0]: Traceback (most recent call last):
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 454, in <module>
[rank0]: main(script_args, training_args, model_args)
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/grpo.py", line 430, in main
[rank0]: trainer = trainer_cls(
[rank0]: File "/home/featurize/work/3/Visual-RFT/src/virft/src/open_r1/trainer/grpo_trainer.py", line 191, in __init__
[rank0]: model = Qwen2VLForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/modeling_utils.py", line 277, in _wrapper
[rank0]: return func(*args, **kwargs)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4734, in from_pretrained
[rank0]: resolved_config_file = cached_file(
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 322, in cached_file
[rank0]: file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 531, in cached_files
[rank0]: resolved_files = [
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 532, in <listcomp>
[rank0]: _get_cache_file_to_return(path_or_repo_id, filename, cache_dir, revision, repo_type)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/transformers/utils/hub.py", line 143, in _get_cache_file_to_return
[rank0]: resolved_file = try_to_load_from_cache(
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn
[rank0]: validate_repo_id(arg_value)
[rank0]: File "/cloud/3/Visual-RFT/envs/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id
[rank0]: raise HFValidationError(
[rank0]: huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/featurize/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct'. Use `repo_type` argument if needed.
检查路径
# Corrected paths: the checkpoint actually lives under /home/featurize/datasets/
# (the earlier command omitted the "datasets" segment, which triggered the
# HFValidationError above because the non-existent path was treated as a Hub
# repo id).
export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/datasets/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
(base) ➜ ~ conda activate /cloud/3/Visual-RFT/envs
(/cloud/3/Visual-RFT/envs) ➜ ~ cd /home/featurize/work/3/Visual-RFT
# Final working command, launched from the repo root with CKPT_PATH corrected
# to the /home/featurize/datasets/... location. The original transcript carried
# a duplicated zsh prompt and fused export/torchrun lines (paste artifacts);
# this is the clean, runnable form.
export DEBUG_MODE="true"
export LOG_PATH="./debug_log_2b_GRPO_coco_base65cate_6k.txt"
export DATA_PATH=/home/featurize/datasets/17d32227-d07f-4b13-bfd0-f9ba53a1304f/ViRFT_COCO_base65
export CKPT_PATH=/home/featurize/datasets/280401ca-8560-4955-83ad-dd6566cc12af/QwenQwen2-VL-2B-Instruct
export SAVE_PATH=./share_models/Qwen2-VL-2B-Instruct_GRPO_coco_base65cate_6k

torchrun --nproc_per_node=1 \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --master_port=12345 \
  src/virft/src/open_r1/grpo.py \
  --output_dir "${SAVE_PATH}" \
  --model_name_or_path "${CKPT_PATH}" \
  --dataset_name "${DATA_PATH}" \
  --deepspeed src/virft/local_scripts/zero3.json \
  --max_prompt_length 1024 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 2 \
  --logging_steps 1 \
  --bf16 true \
  --report_to none \
  --gradient_checkpointing true \
  --attn_implementation flash_attention_2 \
  --max_pixels 401408 \
  --num_train_epochs 1 \
  --run_name Qwen2-VL-2B_GRPO_coco_base65cate_6k \
  --save_steps 100 \
  --save_only_model true \
  --generation_batch_size 4 \
  --num_generations 4
