bench_cluster

TODO: git submodule for specific nanotron branch

pip install -e .
pip install -r requirements.txt
cd nanotron # Checkout bench_cluster branch
pip install -e .
pip install flash_attn==2.5.0
cd ..

Workflow

results/
    - network_bench/
        - network_bench_8_gpus.slurm
        - log_8_gpus.out
        - ...
        - network_bench_512_gpus.slurm
    - llama-1B/
        - 8_GPUS/
            - 8_GPUS_summary_results.csv
            - dp-1_tp-8_pp-1_mbz-1/
                - profiler/*.json
                - bench.slurm
                - config.yaml
                - log_metrics.csv
                - log.out
                - profiler.csv
                - status.txt
            ...
            - dp-8_tp-1_pp-1_mbz-256/
        ...
        - 512_GPUS/
    ...
    - llama-7B/

Usage

# Create above workflow with all possible combinations of hyper-parameters 
python main.py create_configs --out_dir "results" --model llama-1B --gpus 8

# Create configs without profiler on Swiss cluster
python main.py create_configs --out_dir "results" --model llama-1B --gpus 4 --exp_name 4_GPUS_no_profiler --no_profiler  --cluster swiss-ai

# Create above workflow with all possible combinations and name it 8_GPUS_FOLDER + disable profiler
python main.py create_configs --out_dir "results" --model llama-1B --gpus 8 --exp_name 8_GPUS_FOLDER --no_profiler

# Create above workflow with only combinations of DP 
python main.py create_configs --out_dir "results" --model llama-1B --gpus 8 --tp_max=1  --pp_max=1

# Create configs with global batch size ranging from 0M to 4M tokens, also including a config at every 1M-token step
python main.py create_configs --out_dir "results" --model llama-1B --gpus 8 --gbs_range "[0M, 4M, 1M]"

# Launch all the jobs in `results/` folder 
python main.py submit_jobs --inp_dir results/  --qos high --hf_token <YOUR_HF_TOKEN> 

# Jobs can also be batched into 4 dependency arrays
python main.py submit_jobs --inp_dir results/ --qos high --hf_token <YOUR_HF_TOKEN> --nb_slurm_array 4

# Check status of runs (INIT/PENDING/RUNNING/FAIL/OOM/COMPLETED)
./check_status.sh results/

# Cancel jobs that were not properly cancelled by Slurm (to avoid wasting resources)
sbatch healthcheck_jobs.slurm

# Automatically rerun the jobs with status FAIL
python main.py submit_jobs --inp_dir results/  --qos high --hf_token <YOUR_HF_TOKEN> --only_fails

# Bench intra/inter-connect of gpus
python main.py network_bench --out_dir results/ --qos=high --gpus=8

# Extract logs, network and profiler info into CSV (NOTE: this is automatically done when using `submit_jobs`)
python main.py report --inp_dir results/ [--is_logs | --is_network | --is_profiler]

# Create a global summary CSV file based on all existing per-run CSV files
python main.py report --inp_dir results/  --global_summary

Name		Name	Last commit message	Last commit date
Latest commit History 99 Commits
bench_cluster		bench_cluster
Dockerfile.bench_cluster		Dockerfile.bench_cluster
README.md		README.md
check_status.sh		check_status.sh
generate_swiss.sh		generate_swiss.sh
healthcheck_jobs.slurm		healthcheck_jobs.slurm
main.py		main.py
open_logs_with_status.sh		open_logs_with_status.sh
overlap.sh		overlap.sh
requirements.txt		requirements.txt
scancel_jobs.sh		scancel_jobs.sh
setup.py		setup.py

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

bench_cluster

Workflow

Usage

About

Releases

Packages

Languages

huggingface/bench_cluster

Folders and files

Latest commit

History

Repository files navigation

bench_cluster

Workflow

Usage

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages