Model Registry

Model Registry


Model Registry 생성

  • Model Registry 추가
notion image
notion image
 

버전 생성

notion image
notion image
notion image
notion image
 

접속정보 확인

  • 접속 정보 확인 후 API Key 생성 필요
notion image

API Key 생성

  • 사용자 환경에 맞게 API Key 이름 작성
notion image
notion image
 

Key 확인

notion image
  • Model Registry SDK를 사용하기 위해 SDK 설치
pip install "ncloud-mlx[model-registry]" # double quotes are required
  • Model Registry에 학습된 Model을 저장하고 다운로드하기 위해 다음과 같은 예제 코드 사용
# ============================================================ # Fashion-MNIST TinyCNN # 1) Train # 2) Save artifacts # 3) Upload to MLX Model Registry (Notebook-safe async) # 4) Download from MLX Model Registry (async) # 5) Load weights and Evaluate on test set # 6) (Optional) Extra training + re-evaluate # ============================================================ import os import time import json import torch import torch.nn as nn import torch.optim as optim from torchvision import datasets from mlx.sdk.model_registry import ModelRegistryAPI # 공식문서에서 나온 Model Registy 환경 변수 주입 과정이 다음과 같이 사용 # ============================================================ # 0) MLX Model Registry 설정 # ============================================================ MLX_ENDPOINT_URL = "https://kpb4r.mlxp.ncloud.com/model-registry/api/v1" MLX_APIKEY = "mlx-N2ZhOTZhNzAtZjFkNi0xMWYwLWI5OTctMjQ2ZTk2NTkxYTM4OnloLXRlc3Q6cHVjN2Fra3oyODBkMmdkaQ==" MLX_PROJECT = "yuhwan-test" MODEL_NAME = "registry_test" MODEL_VERSION = "v1" client = ModelRegistryAPI(MLX_ENDPOINT_URL, MLX_APIKEY) # ============================================================ # 1) Torch / CUDA 설정 # ============================================================ assert torch.cuda.is_available(), "CUDA GPU not available" print("Visible GPUs:", torch.cuda.device_count()) device = torch.device("cuda:0") torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True torch.backends.cudnn.benchmark = True # ============================================================ # 2) Dataset helpers # ============================================================ DATA_ROOT = os.environ.get("DATA_ROOT", "/tmp/data") class FMNISTTensor(torch.utils.data.Dataset): def __init__(self, base_ds): self.x = base_ds.data self.y = base_ds.targets def __len__(self): return self.x.size(0) def __getitem__(self, idx): x = self.x[idx].unsqueeze(0).float().div_(255.0) y = self.y[idx] return x, y def make_train_loader(batch_size: int = 2048, 
num_workers: int = 0): base = datasets.FashionMNIST(root=DATA_ROOT, train=True, download=True, transform=None) ds = FMNISTTensor(base) return torch.utils.data.DataLoader( ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, ) def make_test_loader(batch_size: int = 2048, num_workers: int = 0): base = datasets.FashionMNIST(root=DATA_ROOT, train=False, download=True, transform=None) ds = FMNISTTensor(base) return torch.utils.data.DataLoader( ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, ) # ============================================================ # 3) Model # ============================================================ class TinyCNN(nn.Module): def __init__(self): super().__init__() self.net = nn.Sequential( nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Flatten(), nn.Linear(64 * 7 * 7, 10), ) def forward(self, x): return self.net(x) # ============================================================ # 4) Train # ============================================================ def train_model(model, train_loader, epochs: int = 10, lr: float = 1e-3, use_amp: bool = True): model.train() opt = optim.Adam(model.parameters(), lr=lr) loss_fn = nn.CrossEntropyLoss() scaler = torch.cuda.amp.GradScaler(enabled=use_amp) print(f"Start training: epochs={epochs}, batch_size={train_loader.batch_size}, amp={use_amp}") t0 = time.time() last_acc = 0.0 last_avg_loss = 0.0 for epoch in range(1, epochs + 1): running_loss = 0.0 correct = 0 total = 0 epoch_t0 = time.time() for x, y in train_loader: x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) opt.zero_grad(set_to_none=True) with torch.cuda.amp.autocast(enabled=use_amp, dtype=torch.float16): logits = model(x) loss = loss_fn(logits, y) scaler.scale(loss).backward() scaler.step(opt) scaler.update() running_loss += float(loss.item()) correct += 
int((logits.argmax(1) == y).sum().item()) total += int(y.size(0)) torch.cuda.synchronize() epoch_time = time.time() - epoch_t0 avg_loss = running_loss / max(1, len(train_loader)) acc = 100.0 * correct / max(1, total) last_acc = acc last_avg_loss = avg_loss print(f"Epoch {epoch:02d}/{epochs} | loss={avg_loss:.4f} | acc={acc:.2f}% | time={epoch_time:.2f}s") total_time = time.time() - t0 print(f"Done. total_time={total_time:.2f}s") # 이어학습에 유리하도록 optimizer/scaler 상태도 반환 return { "final_acc": float(last_acc), "final_loss": float(last_avg_loss), "total_time_sec": float(total_time), "opt_state": opt.state_dict(), "scaler_state": scaler.state_dict(), "lr": lr, "use_amp": use_amp, } # ============================================================ # 5) Save artifacts # ============================================================ def save_artifacts(model, train_summary: dict, logdir: str = "/home/irteam/data-vol1/tb_logs"): run_name = time.strftime("fmnist_%Y%m%d-%H%M%S") artifact_dir = os.path.join(logdir, "artifacts", run_name) os.makedirs(artifact_dir, exist_ok=True) model_path = os.path.join(artifact_dir, "model.pt") metrics_path = os.path.join(artifact_dir, "metrics.json") torch.save( { "run_name": run_name, "model_state_dict": model.state_dict(), "opt_state_dict": train_summary["opt_state"], "scaler_state_dict": train_summary["scaler_state"], "final_acc": train_summary["final_acc"], "final_loss": train_summary["final_loss"], "total_time_sec": train_summary["total_time_sec"], "lr": train_summary["lr"], "use_amp": train_summary["use_amp"], }, model_path, ) with open(metrics_path, "w") as f: json.dump( { "run_name": run_name, "final_loss": train_summary["final_loss"], "final_acc": train_summary["final_acc"], "total_time_sec": train_summary["total_time_sec"], }, f, indent=2, ) print("Saved model:", model_path) print("Saved metrics:", metrics_path) return run_name, artifact_dir, model_path, metrics_path # ============================================================ # 6) Upload 
/ Download (Notebook-safe async) # ============================================================ async def upload_artifacts(model_path: str, metrics_path: str): print(f"Uploading to registry: project={MLX_PROJECT}, model={MODEL_NAME}, version={MODEL_VERSION}") # 모델 파라미터 업로드 await client.file_api.upload( project_name=MLX_PROJECT, model_name=MODEL_NAME, version_name=MODEL_VERSION, local_path=model_path, remote_path="/model.pt", overwrite=True, ) await client.file_api.upload( project_name=MLX_PROJECT, model_name=MODEL_NAME, version_name=MODEL_VERSION, local_path=metrics_path, remote_path="/metrics.json", overwrite=True, ) print("Upload finished successfully.") async def download_artifacts(download_dir: str): os.makedirs(download_dir, exist_ok=True) # 모델 파라미터 다운로드 await client.file_api.download( project_name=MLX_PROJECT, model_name=MODEL_NAME, version_name=MODEL_VERSION, remote_path="/", local_path=download_dir, overwrite=True, ) dl_model = os.path.join(download_dir, "model.pt") dl_metrics = os.path.join(download_dir, "metrics.json") # 파일 생성 체크 if not os.path.exists(dl_model): raise FileNotFoundError(f"Downloaded file not found: {dl_model}") if not os.path.exists(dl_metrics): raise FileNotFoundError(f"Downloaded file not found: {dl_metrics}") print("Downloaded model:", dl_model, "| size:", os.path.getsize(dl_model), "bytes") print("Downloaded metrics:", dl_metrics, "| size:", os.path.getsize(dl_metrics), "bytes") return dl_model, dl_metrics # ============================================================ # 7) Evaluate # ============================================================ def evaluate_on_test(model, test_loader): model.eval() correct, total = 0, 0 with torch.no_grad(): for x, y in test_loader: x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) logits = model(x) pred = logits.argmax(1) correct += int((pred == y).sum().item()) total += int(y.size(0)) acc = 100.0 * correct / max(1, total) return acc # 
============================================================ # 8) Main flow # ============================================================ # (A) Train train_loader = make_train_loader(batch_size=2048, num_workers=0) model = TinyCNN().to(device) summary = train_model(model, train_loader, epochs=10, lr=1e-3, use_amp=True) # (B) Save run_name, artifact_dir, model_path, metrics_path = save_artifacts(model, summary) # (C) Upload await upload_artifacts(model_path, metrics_path) # (D) Download to a clean temp dir DOWNLOAD_DIR = f"/tmp/model_registry_download/{MODEL_NAME}/{MODEL_VERSION}/{run_name}" dl_model_path, dl_metrics_path = await download_artifacts(DOWNLOAD_DIR) # (E) Load downloaded model + Evaluate on test set ckpt = torch.load(dl_model_path, map_location="cpu") loaded = TinyCNN().to(device) loaded.load_state_dict(ckpt["model_state_dict"], strict=True) test_loader = make_test_loader(batch_size=2048, num_workers=0) test_acc = evaluate_on_test(loaded, test_loader) print(f"Test Accuracy (downloaded model): {test_acc:.2f}%") # (F) Optional: extra training (fine-tune) + re-evaluate EPOCHS_MORE = 3 LR_MORE = 5e-4 loaded.train() opt = optim.Adam(loaded.parameters(), lr=LR_MORE) loss_fn = nn.CrossEntropyLoss() # 이전 optimizer/scaler 상태 복원 if "opt_state_dict" in ckpt and isinstance(ckpt["opt_state_dict"], dict): try: opt.load_state_dict(ckpt["opt_state_dict"]) print("Restored optimizer state from checkpoint.") except Exception as e: print("Could not restore optimizer state:", e) use_amp_more = True scaler = torch.cuda.amp.GradScaler(enabled=use_amp_more) if "scaler_state_dict" in ckpt and isinstance(ckpt["scaler_state_dict"], dict): try: scaler.load_state_dict(ckpt["scaler_state_dict"]) print("Restored GradScaler state from checkpoint.") except Exception as e: print("Could not restore GradScaler state:", e) print(f"Extra training: epochs={EPOCHS_MORE}, lr={LR_MORE}, amp={use_amp_more}") for epoch in range(1, EPOCHS_MORE + 1): running_loss, correct, total = 0.0, 0, 0 for x, 
y in train_loader: x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) opt.zero_grad(set_to_none=True) with torch.cuda.amp.autocast(enabled=use_amp_more, dtype=torch.float16): logits = loaded(x) loss = loss_fn(logits, y) scaler.scale(loss).backward() scaler.step(opt) scaler.update() running_loss += float(loss.item()) correct += int((logits.argmax(1) == y).sum().item()) total += int(y.size(0)) avg_loss = running_loss / max(1, len(train_loader)) train_acc = 100.0 * correct / max(1, total) print(f"[MORE] Epoch {epoch:02d}/{EPOCHS_MORE} | loss={avg_loss:.4f} | train_acc={train_acc:.2f}%") new_test_acc = evaluate_on_test(loaded, test_loader) print(f"Test Accuracy (after extra training): {new_test_acc:.2f}%") print("All done.")

실행 결과

Visible GPUs: 1 Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /tmp/data/FashionMNIST/raw/train-images-idx3-ubyte.gz 100%|██████████| 26421880/26421880 [00:03<00:00, 7375699.50it/s] Extracting /tmp/data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /tmp/data/FashionMNIST/raw Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /tmp/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz 100%|██████████| 29515/29515 [00:00<00:00, 112462.49it/s] Extracting /tmp/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to /tmp/data/FashionMNIST/raw Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to /tmp/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz 100%|██████████| 4422102/4422102 [00:02<00:00, 2114313.55it/s] Extracting /tmp/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to /tmp/data/FashionMNIST/raw Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /tmp/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz 100%|██████████| 5148/5148 [00:00<00:00, 9673959.23it/s] Extracting /tmp/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /tmp/data/FashionMNIST/raw Start training: epochs=10, batch_size=2048, amp=True Epoch 01/10 | loss=1.2257 | acc=63.57% | time=20.76s Epoch 02/10 | loss=0.6292 | acc=77.07% | time=21.53s Epoch 03/10 | loss=0.5165 | acc=81.50% | time=24.50s Epoch 04/10 | loss=0.4590 | acc=83.65% | time=19.70s Epoch 05/10 | loss=0.4300 | acc=84.62% | time=19.00s Epoch 06/10 | loss=0.3986 | acc=85.95% 
| time=19.10s Epoch 07/10 | loss=0.3840 | acc=86.52% | time=18.90s Epoch 08/10 | loss=0.3679 | acc=86.91% | time=19.20s Epoch 09/10 | loss=0.3513 | acc=87.62% | time=23.30s Epoch 10/10 | loss=0.3430 | acc=87.89% | time=23.00s Done. total_time=208.99s Saved model: /home/irteam/data-vol1/tb_logs/artifacts/fmnist_20260211-135212/model.pt Saved metrics: /home/irteam/data-vol1/tb_logs/artifacts/fmnist_20260211-135212/metrics.json Uploading to registry: project=yuhwan-test, model=registry_test, version=v1 [2K Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 609.1/609.1 kB • ? • 0:00:00 • 0:00:00 Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 609.1/609.1 kB • ? • 0:00:00 • 0:00:00 [13:52:13] Uploaded ( 609.1 kB) - /model.pt Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 609.1/609.1 kB • ? • 0:00:00 • 0:00:00 [2K Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 141/141 bytes • ? • 0:00:00 • 0:00:00 Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 141/141 bytes • ? • 0:00:00 • 0:00:00 [13:52:14] Uploaded (141 bytes) - /metrics.json Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 141/141 bytes • ? • 0:00:00 • 0:00:00 Upload finished successfully. ⠹ Downloading ━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7% • 162.7/609.3 kB • 2.2 MB/s • 0:00:01 • 0:00:00 ⠹ Downloading ━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7% • 162.7/609.3 kB • 2.2 MB/s • 0:00:01 • 0:00:00 [13:52:14] Downloaded (141 bytes) - /tmp/model_registry_download/registry_test/v1/fmnist_20260211-135212/metrics.json ⠙ Downloading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0% • 0.0/609.3 kB • ? 
• -:--:-- • 0:00:00 Downloaded ( 609.1 kB) - /tmp/model_registry_download/registry_test/v1/fmnist_20260211-135212/model.pt ⠹ Downloading ━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7% • 162.7/609.3 kB • 2.2 MB/s • 0:00:01 • 0:00:00 Downloaded model: /tmp/model_registry_download/registry_test/v1/fmnist_20260211-135212/model.pt | size: 609110 bytes Downloaded metrics: /tmp/model_registry_download/registry_test/v1/fmnist_20260211-135212/metrics.json | size: 141 bytes Test Accuracy (downloaded model): 87.22% Restored optimizer state from checkpoint. Restored GradScaler state from checkpoint. Extra training: epochs=3, lr=0.0005, amp=True [MORE] Epoch 01/3 | loss=0.3333 | train_acc=88.33% [MORE] Epoch 02/3 | loss=0.3272 | train_acc=88.47% [MORE] Epoch 03/3 | loss=0.3198 | train_acc=88.84% Test Accuracy (after extra training): 87.48% All done.