Add downstream task evaluation script for llama3_native_te recipe

savitha-eng · savitha-eng · commit d37f8c99cffc · 2026-03-18T01:01:38.000Z
Adds eval_downstream.py that runs lm-eval benchmarks (arc_challenge,
arc_easy, boolq, copa, hellaswag, piqa, winogrande) on trained Lingua
1B checkpoints. Supports both consolidated final_model directories and
distributed FSDP2 step checkpoints.

Made-with: Cursor
diff --git a/bionemo-recipes/recipes/llama3_native_te/eval_downstream.py b/bionemo-recipes/recipes/llama3_native_te/eval_downstream.py
@@ -0,0 +1,366 @@
+#!/usr/bin/env python
+
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Evaluate a trained Llama checkpoint on downstream NLP benchmarks using lm-eval.
+
+Supports loading from:
+  1. A consolidated final_model directory (model.safetensors + config.json)
+  2. A distributed FSDP2 training checkpoint (step_N directory)
+
+Examples:
+    # From a consolidated final_model (single GPU, no torchrun needed):
+    python eval_downstream.py \
+        --checkpoint-path /path/to/ckpt_dir/train_fsdp2/final_model
+
+    # From a distributed FSDP2 checkpoint (needs torchrun for weight gathering):
+    torchrun --nproc_per_node=1 eval_downstream.py \
+        --checkpoint-path /path/to/ckpt_dir/train_fsdp2/step_60000 \
+        --from-distributed \
+        --model-config ./model_configs/lingua-1B
+
+    # Custom tasks and batch size:
+    python eval_downstream.py \
+        --checkpoint-path /path/to/final_model \
+        --tasks arc_easy,hellaswag \
+        --batch-size 16
+
+    # Save results to a file:
+    python eval_downstream.py \
+        --checkpoint-path /path/to/final_model \
+        --output-path ./eval_results
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+
+# Default benchmark suite handed to lm-eval via --tasks, as one comma-separated string.
+DOWNSTREAM_TASKS = "arc_challenge,arc_easy,boolq,copa,hellaswag,piqa,winogrande"
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse command line arguments.
+
+    Args:
+        argv: Argument strings to parse. When None (the default) argparse reads
+            ``sys.argv[1:]``, so calling ``parse_args()`` with no arguments behaves
+            exactly as before. Passing an explicit list lets other code drive the
+            parser without touching ``sys.argv``.
+
+    Returns:
+        The populated argparse namespace. All values except ``num_fewshot`` (int)
+        and ``from_distributed`` (bool) are strings or None.
+    """
+    parser = argparse.ArgumentParser(
+        description="Evaluate a trained checkpoint on downstream NLP tasks with lm-eval.",
+        # Raw formatter keeps the line breaks of the module docstring used as the epilog.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument(
+        "--checkpoint-path",
+        type=str,
+        required=True,
+        help="Path to checkpoint. Either a final_model dir (with model.safetensors) "
+        "or a step_N distributed checkpoint dir (with --from-distributed).",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        default="meta-llama/Meta-Llama-3-8B",
+        help="Tokenizer name or path (default: meta-llama/Meta-Llama-3-8B).",
+    )
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        default=DOWNSTREAM_TASKS,
+        help=f"Comma-separated lm-eval task names (default: {DOWNSTREAM_TASKS}).",
+    )
+    # Kept as a string because the value is either an integer or the literal "auto".
+    parser.add_argument(
+        "--batch-size",
+        type=str,
+        default="auto",
+        help="Batch size for lm-eval. Use 'auto' for automatic selection (default: auto).",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda:0",
+        help="Device for lm-eval inference (default: cuda:0).",
+    )
+    parser.add_argument(
+        "--eval-dir",
+        type=str,
+        default=None,
+        help="Directory to store the prepared eval checkpoint. Uses a temp directory if not set.",
+    )
+    parser.add_argument(
+        "--from-distributed",
+        action="store_true",
+        help="Treat --checkpoint-path as a distributed FSDP2 checkpoint. Requires torchrun.",
+    )
+    parser.add_argument(
+        "--model-config",
+        type=str,
+        default="./model_configs/lingua-1B",
+        help="Model config path for --from-distributed (default: ./model_configs/lingua-1B).",
+    )
+    parser.add_argument(
+        "--output-path",
+        type=str,
+        default=None,
+        help="Path to save lm-eval results JSON.",
+    )
+    parser.add_argument(
+        "--num-fewshot",
+        type=int,
+        default=None,
+        help="Number of few-shot examples (default: lm-eval task default).",
+    )
+    return parser.parse_args(argv)
+
+
+def export_distributed_checkpoint(checkpoint_path: str, model_config: str, output_path: str) -> bool:
+    """Load a distributed FSDP2 checkpoint and export consolidated weights.
+
+    Must be called inside a torchrun context. All ranks participate in loading
+    and gathering, but only rank 0 saves the exported model.
+
+    On the success path the function initializes the default process group
+    itself and destroys it again before returning.
+
+    Args:
+        checkpoint_path: Path to the step_N distributed checkpoint directory.
+        model_config: Path to model config (e.g. ./model_configs/lingua-1B).
+        output_path: Directory to save the consolidated model. It receives
+            model.safetensors and the files written by ``config.save_pretrained``.
+
+    Returns:
+        The value of ``DistributedConfig.is_main_process()``: True on the process
+        that wrote the export (should continue to evaluation), False otherwise.
+    """
+    # Imports are function-scoped, so torch and the recipe modules are only loaded
+    # when this export path is actually taken.
+    import torch
+    from safetensors.torch import save_file
+    from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict
+    from torch.distributed.checkpoint.state_dict_loader import load as dcp_load
+    from torch.distributed.device_mesh import init_device_mesh
+    from torch.distributed.fsdp import fully_shard
+
+    from checkpoint import AppState
+    from distributed_config import DistributedConfig
+    from modeling_llama_te import NVLlamaConfig, NVLlamaForCausalLM
+    from scheduler import get_cosine_annealing_schedule_with_warmup
+
+    dist_config = DistributedConfig()
+    device = torch.device(f"cuda:{dist_config.local_rank}")
+    # One process group using gloo for CPU tensors and NCCL for CUDA tensors,
+    # bound to this rank's GPU.
+    torch.distributed.init_process_group(backend="cpu:gloo,cuda:nccl", device_id=device)
+    torch.cuda.set_device(dist_config.local_rank)
+    # 1-D device mesh over all ranks; its single dimension is named "dp".
+    device_mesh = init_device_mesh("cuda", mesh_shape=(dist_config.world_size,), mesh_dim_names=("dp",))
+
+    print(f"[Rank {dist_config.rank}] Loading distributed checkpoint from {checkpoint_path}")
+
+    # The config is loaded in bfloat16 with the "thd" attention input format.
+    config = NVLlamaConfig.from_pretrained(model_config, dtype=torch.bfloat16, attn_input_format="thd")
+    # Build the module on the meta device so no parameter storage is allocated yet.
+    with torch.device("meta"):
+        model = NVLlamaForCausalLM(config)
+
+    # Shard every entry of model.model.layers first, then the root module,
+    # all over the "dp" mesh dimension.
+    for layer in model.model.layers:
+        fully_shard(layer, mesh=device_mesh["dp"])
+    fully_shard(model, mesh=device_mesh["dp"])
+
+    # NOTE(review): presumably materializes the meta-device parameters before the
+    # checkpoint is loaded into them -- confirm in modeling_llama_te.
+    model.init_empty_weights()
+
+    # The optimizer and scheduler are never stepped in this function; they are only
+    # built so an AppState can be constructed.
+    # NOTE(review): presumably AppState needs them to mirror what was saved during
+    # training -- confirm in checkpoint.py.
+    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+    scheduler = get_cosine_annealing_schedule_with_warmup(optimizer, num_warmup_steps=1, num_decay_steps=1)
+
+    app_state = AppState(model=model, optimizer=optimizer, scheduler=scheduler)
+    # The checkpoint is loaded in place into app_state, stored under the "app" key.
+    state_dict = {"app": app_state}
+    dcp_load(state_dict, checkpoint_id=checkpoint_path, process_group=device_mesh.get_group("dp"))
+
+    print(f"[Rank {dist_config.rank}] Loaded checkpoint at step {app_state.step}")
+
+    # Request an unsharded (full) state dict, offloaded to CPU. Every rank makes this call.
+    # NOTE(review): with both options set, torch is documented to populate the full
+    # dict on rank 0 only -- confirm for the torch version in use.
+    model_state_dict = get_model_state_dict(
+        model=model,
+        options=StateDictOptions(full_state_dict=True, cpu_offload=True),
+    )
+
+    # Only the main process writes the weights and the config to disk.
+    if dist_config.is_main_process():
+        os.makedirs(output_path, exist_ok=True)
+        save_file(model_state_dict, os.path.join(output_path, "model.safetensors"))
+        config.save_pretrained(output_path)
+        print(f"Exported consolidated model to {output_path}")
+
+    # Hold the other ranks until the main process has finished writing, then tear
+    # the process group down.
+    torch.distributed.barrier()
+    torch.distributed.destroy_process_group()
+
+    return dist_config.is_main_process()
+
+
+def prepare_eval_directory(checkpoint_path: str, output_path: str, tokenizer_name: str) -> str:
+    """Prepare a checkpoint directory with all files lm-eval needs.
+
+    Copies the top-level files of the checkpoint (sub-directories are skipped),
+    patches config.json with auto_map and inference-compatible attention settings,
+    copies modeling_llama_te.py next to the weights, and saves the tokenizer.
+
+    When ``output_path`` and ``checkpoint_path`` resolve to the same directory the
+    copy step is skipped and config.json is patched in place.
+
+    Args:
+        checkpoint_path: Source directory with model.safetensors + config.json.
+        output_path: Destination directory for the eval-ready checkpoint.
+        tokenizer_name: HuggingFace tokenizer name or local path.
+
+    Returns:
+        The output_path string.
+
+    Raises:
+        FileNotFoundError: If config.json is not present in the eval directory
+            once the copy step has run.
+    """
+    from transformers import AutoTokenizer
+
+    from modeling_llama_te import AUTO_MAP
+
+    checkpoint_path_obj = Path(checkpoint_path)
+    output_path_obj = Path(output_path)
+
+    # Compare resolved paths so differently spelled paths to the same directory
+    # do not trigger a copy of files onto themselves.
+    if output_path_obj.resolve() != checkpoint_path_obj.resolve():
+        os.makedirs(output_path, exist_ok=True)
+        for f in checkpoint_path_obj.iterdir():
+            if f.is_file():
+                shutil.copy2(f, output_path_obj / f.name)
+
+    config_file = output_path_obj / "config.json"
+    # Fail early with an actionable message instead of a bare open() error.
+    if not config_file.is_file():
+        raise FileNotFoundError(
+            f"{config_file} not found; expected config.json alongside the model weights in {checkpoint_path}."
+        )
+
+    # Explicit UTF-8 so parsing does not depend on the locale's default encoding.
+    with open(config_file, encoding="utf-8") as f:
+        config = json.load(f)
+
+    # auto_map points the config at the custom model code copied in below; the two
+    # attention keys are overridden for evaluation regardless of their stored values.
+    config["auto_map"] = AUTO_MAP
+    config["attn_input_format"] = "bshd"
+    config["self_attn_mask_type"] = "causal"
+
+    with open(config_file, "w", encoding="utf-8") as f:
+        json.dump(config, f, indent=2, sort_keys=True)
+
+    # The modeling file is taken from the directory that contains this script.
+    script_dir = Path(__file__).parent
+    shutil.copy2(script_dir / "modeling_llama_te.py", output_path_obj / "modeling_llama_te.py")
+
+    # Store the tokenizer in the same directory so lm-eval can load both from one path.
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+    tokenizer.save_pretrained(str(output_path_obj))
+
+    print(f"Prepared eval directory: {output_path}")
+    return output_path
+
+
+def run_lm_eval(
+    eval_dir: str,
+    tasks: str,
+    batch_size: str,
+    device: str,
+    output_path: str | None = None,
+    num_fewshot: int | None = None,
+) -> float:
+    """Run lm-eval on the prepared checkpoint directory.
+
+    lm-eval is launched as ``python -m lm_eval`` in a subprocess using the current
+    interpreter, with the ``hf`` model type and ``--trust_remote_code``. The same
+    directory is passed as both the model and the tokenizer location.
+
+    Args:
+        eval_dir: Path to the prepared eval checkpoint directory.
+        tasks: Comma-separated list of lm-eval task names.
+        batch_size: Batch size string (integer or "auto").
+        device: Device string (e.g. "cuda:0").
+        output_path: Optional path to save results JSON.
+        num_fewshot: Optional number of few-shot examples.
+
+    Returns:
+        Elapsed time of the lm-eval subprocess in seconds.
+
+    Raises:
+        SystemExit: With lm-eval's exit code when the subprocess exits non-zero.
+    """
+    # Function-scoped import: shlex is only needed to render the command for logging.
+    import shlex
+
+    cmd = [
+        sys.executable,
+        "-m",
+        "lm_eval",
+        "--model",
+        "hf",
+        "--model_args",
+        f"pretrained={eval_dir},tokenizer={eval_dir}",
+        "--trust_remote_code",
+        "--tasks",
+        tasks,
+        "--device",
+        device,
+        "--batch_size",
+        batch_size,
+    ]
+
+    if output_path:
+        cmd.extend(["--output_path", output_path])
+
+    # Compare against None so that an explicit 0 (zero-shot) is still forwarded.
+    if num_fewshot is not None:
+        cmd.extend(["--num_fewshot", str(num_fewshot)])
+
+    # shlex.join quotes arguments, so the logged command can be pasted into a shell
+    # even when a path contains spaces.
+    print(f"\nRunning lm-eval:\n  {shlex.join(cmd)}\n")
+    print("=" * 80)
+
+    # A monotonic clock is unaffected by system clock adjustments during a long run.
+    start_time = time.monotonic()
+    # check=False: the exit code is inspected below so the timing is always printed.
+    result = subprocess.run(cmd, check=False)
+    elapsed = time.monotonic() - start_time
+
+    print("=" * 80)
+    print(f"\nlm-eval completed in {elapsed:.1f}s ({elapsed / 60:.1f} min)")
+
+    if result.returncode != 0:
+        print(f"lm-eval failed with exit code {result.returncode}", file=sys.stderr)
+        sys.exit(result.returncode)
+
+    return elapsed
+
+
+def main() -> None:
+    """Main entry point.
+
+    Parses the command line, obtains a consolidated checkpoint (either by exporting
+    a distributed one or by validating the given final_model directory), prepares
+    the eval directory, and runs lm-eval on it. With --from-distributed, processes
+    for which the export reports a non-main rank return right after the export.
+
+    A temporary eval directory is removed on exit, including on errors and early
+    returns; a user-supplied --eval-dir is never deleted.
+    """
+    args = parse_args()
+    checkpoint_path = Path(args.checkpoint_path)
+
+    # One condition decides both "create a temp dir" and "clean it up". A missing and
+    # an empty --eval-dir are treated alike, so a temp dir is never created without
+    # also being reported and removed.
+    use_temp = not args.eval_dir
+    eval_dir = tempfile.mkdtemp(prefix="lm_eval_checkpoint_") if use_temp else args.eval_dir
+
+    if use_temp:
+        print(f"Using temporary eval directory: {eval_dir}")
+
+    try:
+        if args.from_distributed:
+            is_main = export_distributed_checkpoint(
+                checkpoint_path=str(checkpoint_path),
+                model_config=args.model_config,
+                output_path=eval_dir,
+            )
+            # Only the main process goes on to evaluate; the finally block still runs.
+            if not is_main:
+                return
+            # The export wrote straight into eval_dir, so it is also the source.
+            source_dir = eval_dir
+        else:
+            if not (checkpoint_path / "model.safetensors").exists():
+                print(
+                    f"Error: {checkpoint_path / 'model.safetensors'} not found.\n"
+                    "If this is a distributed FSDP2 checkpoint, use --from-distributed with torchrun.\n"
+                    "If this is a final_model directory, ensure it contains model.safetensors.",
+                    file=sys.stderr,
+                )
+                sys.exit(1)
+            source_dir = str(checkpoint_path)
+
+        prepare_eval_directory(
+            checkpoint_path=source_dir,
+            output_path=eval_dir,
+            tokenizer_name=args.tokenizer,
+        )
+
+        run_lm_eval(
+            eval_dir=eval_dir,
+            tasks=args.tasks,
+            batch_size=args.batch_size,
+            device=args.device,
+            output_path=args.output_path,
+            num_fewshot=args.num_fewshot,
+        )
+
+    finally:
+        if use_temp and os.path.exists(eval_dir):
+            print(f"\nCleaning up temporary directory: {eval_dir}")
+            shutil.rmtree(eval_dir)
+
+
+# Run the evaluation only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()