# Launch GRPO training with CUDA_VISIBLE_DEVICES set to 0.
#
# CUDA_VISIBLE_DEVICES is exported so that the python process (and any
# command run after it in this shell) inherits the value.
# NOTE(review): per NVIDIA's documentation this limits CUDA to the listed
# device index; confirm that index 0 is the intended GPU on the target host.
#
# The export must be its own command: `export` treats every following word
# as a variable name, so writing the python command on the same line only
# produced a "not a valid identifier" error and never started training.
export CUDA_VISIBLE_DEVICES=0
python train_grpo.py