Model Registry
Model Registry 생성
버전 생성
접속정보 확인
API Key 생성
Key 확인
- Model Registry SDK를 사용하기 위해 SDK 설치
pip install "ncloud-mlx[model-registry]" # double quotes are required
- Model Registry에 학습된 Model을 저장하고 다운로드하기 위해 다음과 같은 예제 코드를 사용
# ============================================================
# Fashion-MNIST TinyCNN
# 1) Train
# 2) Save artifacts
# 3) Upload to MLX Model Registry (Notebook-safe async)
# 4) Download from MLX Model Registry (async)
# 5) Load weights and Evaluate on test set
# 6) (Optional) Extra training + re-evaluate
# ============================================================
import os
import time
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets
from mlx.sdk.model_registry import ModelRegistryAPI
# MLX Model Registry connection settings, injected as module-level constants
# following the pattern shown in the official documentation.
# ============================================================
# 0) MLX Model Registry configuration
# ============================================================
MLX_ENDPOINT_URL = "https://kpb4r.mlxp.ncloud.com/model-registry/api/v1"
# NOTE(review): hardcoded API key committed to source — this should be read
# from an environment variable or a secrets manager, and the exposed key
# should be rotated.
MLX_APIKEY = "mlx-N2ZhOTZhNzAtZjFkNi0xMWYwLWI5OTctMjQ2ZTk2NTkxYTM4OnloLXRlc3Q6cHVjN2Fra3oyODBkMmdkaQ=="
MLX_PROJECT = "yuhwan-test"
MODEL_NAME = "registry_test"
MODEL_VERSION = "v1"
# Single shared registry client used by the async upload/download helpers below.
client = ModelRegistryAPI(MLX_ENDPOINT_URL, MLX_APIKEY)
# ============================================================
# 1) Torch / CUDA setup
# ============================================================
# Fail fast when no GPU is visible — everything below assumes cuda:0.
# NOTE(review): `assert` is stripped under `python -O`; an explicit raise
# would be more robust if this ever runs outside a notebook.
assert torch.cuda.is_available(), "CUDA GPU not available"
print("Visible GPUs:", torch.cuda.device_count())
device = torch.device("cuda:0")
# Enable TF32 matmul/conv and cuDNN autotuning: faster on Ampere+ GPUs at a
# small, acceptable precision cost for this workload.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
# ============================================================
# 2) Dataset helpers
# ============================================================
# Download/cache location for FashionMNIST; override with $DATA_ROOT.
DATA_ROOT = os.environ.get("DATA_ROOT", "/tmp/data")
class FMNISTTensor(torch.utils.data.Dataset):
    """In-memory FashionMNIST wrapper serving normalized float samples.

    Holds the dataset's raw image tensor (``base_ds.data``, shape (N, 28, 28))
    and label tensor (``base_ds.targets``) and returns each sample as a
    ``(1, 28, 28)`` float32 tensor scaled to [0, 1] plus its label.
    """

    def __init__(self, base_ds):
        # base_ds must expose `.data` and `.targets` (torchvision convention).
        self.x = base_ds.data
        self.y = base_ds.targets

    def __len__(self):
        return self.x.size(0)

    def __getitem__(self, idx):
        # Bug fix: the original used `.float().div_(255.0)`. If the stored
        # tensor is already floating point, `.float()` returns the *same*
        # tensor object, so the in-place `div_` would silently rescale the
        # cached dataset on every access. Out-of-place division always
        # produces a fresh tensor regardless of the source dtype.
        x = self.x[idx].unsqueeze(0).float() / 255.0
        y = self.y[idx]
        return x, y
def make_train_loader(batch_size: int = 2048, num_workers: int = 0):
    """Build a shuffled DataLoader over the FashionMNIST training split.

    Downloads the dataset into DATA_ROOT on first use; samples are served
    through FMNISTTensor as normalized float tensors.
    """
    raw = datasets.FashionMNIST(root=DATA_ROOT, train=True, download=True, transform=None)
    loader_kwargs = dict(
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
    )
    return torch.utils.data.DataLoader(FMNISTTensor(raw), **loader_kwargs)
def make_test_loader(batch_size: int = 2048, num_workers: int = 0):
    """Build an unshuffled DataLoader over the FashionMNIST test split.

    Downloads the dataset into DATA_ROOT on first use; samples are served
    through FMNISTTensor as normalized float tensors.
    """
    raw = datasets.FashionMNIST(root=DATA_ROOT, train=False, download=True, transform=None)
    loader_kwargs = dict(
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )
    return torch.utils.data.DataLoader(FMNISTTensor(raw), **loader_kwargs)
# ============================================================
# 3) Model
# ============================================================
class TinyCNN(nn.Module):
    """Small two-conv-block CNN: (N, 1, 28, 28) input -> (N, 10) logits."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(1, 32, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),   # 28x28 -> 14x14
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),   # 14x14 -> 7x7
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 10),
        ]
        # Single `net` Sequential so state_dict keys (net.0.*, net.3.*, net.7.*)
        # stay compatible with previously saved checkpoints.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
# ============================================================
# 4) Train
# ============================================================
def train_model(model, train_loader, epochs: int = 10, lr: float = 1e-3, use_amp: bool = True):
    """Train `model` on `train_loader` with Adam and optional fp16 autocast.

    Returns a summary dict containing the final epoch's loss/accuracy, the
    total wall-clock time, and the optimizer/GradScaler state dicts so a
    later fine-tuning run can resume from this state.
    """
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    print(f"Start training: epochs={epochs}, batch_size={train_loader.batch_size}, amp={use_amp}")
    start = time.time()
    final_acc = 0.0
    final_loss = 0.0
    for epoch in range(1, epochs + 1):
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0
        epoch_start = time.time()
        for inputs, labels in train_loader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            # Forward in fp16 under autocast; backward/step through the scaler.
            with torch.cuda.amp.autocast(enabled=use_amp, dtype=torch.float16):
                logits = model(inputs)
                loss = criterion(logits, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            loss_sum += float(loss.item())
            n_correct += int((logits.argmax(1) == labels).sum().item())
            n_seen += int(labels.size(0))
        # Synchronize so the epoch timing reflects actual GPU work.
        torch.cuda.synchronize()
        epoch_time = time.time() - epoch_start
        avg_loss = loss_sum / max(1, len(train_loader))
        acc = 100.0 * n_correct / max(1, n_seen)
        final_acc = acc
        final_loss = avg_loss
        print(f"Epoch {epoch:02d}/{epochs} | loss={avg_loss:.4f} | acc={acc:.2f}% | time={epoch_time:.2f}s")
    total_time = time.time() - start
    print(f"Done. total_time={total_time:.2f}s")
    # Also return optimizer/scaler state so training can be resumed later.
    return {
        "final_acc": float(final_acc),
        "final_loss": float(final_loss),
        "total_time_sec": float(total_time),
        "opt_state": optimizer.state_dict(),
        "scaler_state": scaler.state_dict(),
        "lr": lr,
        "use_amp": use_amp,
    }
# ============================================================
# 5) Save artifacts
# ============================================================
def save_artifacts(model, train_summary: dict, logdir: str = "/home/irteam/data-vol1/tb_logs"):
    """Persist a resumable checkpoint and a small metrics JSON under `logdir`.

    Creates `<logdir>/artifacts/<run_name>/` with a timestamped run name and
    writes `model.pt` (weights + optimizer/scaler state + metadata) and
    `metrics.json`. Returns (run_name, artifact_dir, model_path, metrics_path).
    """
    run_name = time.strftime("fmnist_%Y%m%d-%H%M%S")
    artifact_dir = os.path.join(logdir, "artifacts", run_name)
    os.makedirs(artifact_dir, exist_ok=True)
    model_path = os.path.join(artifact_dir, "model.pt")
    metrics_path = os.path.join(artifact_dir, "metrics.json")
    # Full checkpoint: weights plus optimizer/scaler state and run metadata,
    # enough to resume fine-tuning from this exact point.
    checkpoint = {
        "run_name": run_name,
        "model_state_dict": model.state_dict(),
        "opt_state_dict": train_summary["opt_state"],
        "scaler_state_dict": train_summary["scaler_state"],
        "final_acc": train_summary["final_acc"],
        "final_loss": train_summary["final_loss"],
        "total_time_sec": train_summary["total_time_sec"],
        "lr": train_summary["lr"],
        "use_amp": train_summary["use_amp"],
    }
    torch.save(checkpoint, model_path)
    # Lightweight human-readable summary kept next to the checkpoint.
    metrics = {
        "run_name": run_name,
        "final_loss": train_summary["final_loss"],
        "final_acc": train_summary["final_acc"],
        "total_time_sec": train_summary["total_time_sec"],
    }
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=2)
    print("Saved model:", model_path)
    print("Saved metrics:", metrics_path)
    return run_name, artifact_dir, model_path, metrics_path
# ============================================================
# 6) Upload / Download (Notebook-safe async)
# ============================================================
async def upload_artifacts(model_path: str, metrics_path: str):
    """Upload the checkpoint and metrics files to the MLX Model Registry.

    Pushes both files to fixed remote paths under the configured
    project/model/version, model checkpoint first, overwriting any
    existing remote files.
    """
    print(f"Uploading to registry: project={MLX_PROJECT}, model={MODEL_NAME}, version={MODEL_VERSION}")
    transfers = (
        (model_path, "/model.pt"),
        (metrics_path, "/metrics.json"),
    )
    for local_path, remote_path in transfers:
        await client.file_api.upload(
            project_name=MLX_PROJECT,
            model_name=MODEL_NAME,
            version_name=MODEL_VERSION,
            local_path=local_path,
            remote_path=remote_path,
            overwrite=True,
        )
    print("Upload finished successfully.")
async def download_artifacts(download_dir: str):
    """Download all artifacts of the registered model version into `download_dir`.

    Pulls the entire remote version directory in a single call, then verifies
    that both expected files landed. Returns (model_path, metrics_path).

    Raises:
        FileNotFoundError: if model.pt or metrics.json is missing afterwards.
    """
    os.makedirs(download_dir, exist_ok=True)
    # One call fetches the whole version tree rooted at "/".
    await client.file_api.download(
        project_name=MLX_PROJECT,
        model_name=MODEL_NAME,
        version_name=MODEL_VERSION,
        remote_path="/",
        local_path=download_dir,
        overwrite=True,
    )
    expected = [os.path.join(download_dir, name) for name in ("model.pt", "metrics.json")]
    for path in expected:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Downloaded file not found: {path}")
    dl_model, dl_metrics = expected
    print("Downloaded model:", dl_model, "| size:", os.path.getsize(dl_model), "bytes")
    print("Downloaded metrics:", dl_metrics, "| size:", os.path.getsize(dl_metrics), "bytes")
    return dl_model, dl_metrics
# ============================================================
# 7) Evaluate
# ============================================================
def evaluate_on_test(model, test_loader):
    """Return the model's top-1 accuracy (percent) over `test_loader`."""
    model.eval()
    n_correct = 0
    n_total = 0
    # No gradients needed for evaluation.
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            predictions = model(inputs).argmax(1)
            n_correct += int((predictions == labels).sum().item())
            n_total += int(labels.size(0))
    return 100.0 * n_correct / max(1, n_total)
# ============================================================
# 8) Main flow
# ============================================================
# (A) Train
train_loader = make_train_loader(batch_size=2048, num_workers=0)
model = TinyCNN().to(device)
summary = train_model(model, train_loader, epochs=10, lr=1e-3, use_amp=True)
# (B) Save
run_name, artifact_dir, model_path, metrics_path = save_artifacts(model, summary)
# (C) Upload
await upload_artifacts(model_path, metrics_path)
# (D) Download to a clean temp dir
DOWNLOAD_DIR = f"/tmp/model_registry_download/{MODEL_NAME}/{MODEL_VERSION}/{run_name}"
dl_model_path, dl_metrics_path = await download_artifacts(DOWNLOAD_DIR)
# (E) Load downloaded model + Evaluate on test set
ckpt = torch.load(dl_model_path, map_location="cpu")
loaded = TinyCNN().to(device)
loaded.load_state_dict(ckpt["model_state_dict"], strict=True)
test_loader = make_test_loader(batch_size=2048, num_workers=0)
test_acc = evaluate_on_test(loaded, test_loader)
print(f"Test Accuracy (downloaded model): {test_acc:.2f}%")
# (F) Optional: extra training (fine-tune) + re-evaluate
EPOCHS_MORE = 3
LR_MORE = 5e-4
loaded.train()
opt = optim.Adam(loaded.parameters(), lr=LR_MORE)
loss_fn = nn.CrossEntropyLoss()
# 이전 optimizer/scaler 상태 복원 if "opt_state_dict" in ckpt and isinstance(ckpt["opt_state_dict"], dict):
try:
opt.load_state_dict(ckpt["opt_state_dict"])
print("Restored optimizer state from checkpoint.")
except Exception as e:
print("Could not restore optimizer state:", e)
use_amp_more = True
scaler = torch.cuda.amp.GradScaler(enabled=use_amp_more)
if "scaler_state_dict" in ckpt and isinstance(ckpt["scaler_state_dict"], dict):
try:
scaler.load_state_dict(ckpt["scaler_state_dict"])
print("Restored GradScaler state from checkpoint.")
except Exception as e:
print("Could not restore GradScaler state:", e)
print(f"Extra training: epochs={EPOCHS_MORE}, lr={LR_MORE}, amp={use_amp_more}")
for epoch in range(1, EPOCHS_MORE + 1):
running_loss, correct, total = 0.0, 0, 0
for x, y in train_loader:
x = x.to(device, non_blocking=True)
y = y.to(device, non_blocking=True)
opt.zero_grad(set_to_none=True)
with torch.cuda.amp.autocast(enabled=use_amp_more, dtype=torch.float16):
logits = loaded(x)
loss = loss_fn(logits, y)
scaler.scale(loss).backward()
scaler.step(opt)
scaler.update()
running_loss += float(loss.item())
correct += int((logits.argmax(1) == y).sum().item())
total += int(y.size(0))
avg_loss = running_loss / max(1, len(train_loader))
train_acc = 100.0 * correct / max(1, total)
print(f"[MORE] Epoch {epoch:02d}/{EPOCHS_MORE} | loss={avg_loss:.4f} | train_acc={train_acc:.2f}%")
new_test_acc = evaluate_on_test(loaded, test_loader)
print(f"Test Accuracy (after extra training): {new_test_acc:.2f}%")
print("All done.")
실행 결과
Visible GPUs: 1
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /tmp/data/FashionMNIST/raw/train-images-idx3-ubyte.gz
100%|██████████| 26421880/26421880 [00:03<00:00, 7375699.50it/s]
Extracting /tmp/data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /tmp/data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /tmp/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz
100%|██████████| 29515/29515 [00:00<00:00, 112462.49it/s]
Extracting /tmp/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to /tmp/data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to /tmp/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz
100%|██████████| 4422102/4422102 [00:02<00:00, 2114313.55it/s]
Extracting /tmp/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to /tmp/data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /tmp/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
100%|██████████| 5148/5148 [00:00<00:00, 9673959.23it/s]
Extracting /tmp/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /tmp/data/FashionMNIST/raw
Start training: epochs=10, batch_size=2048, amp=True
Epoch 01/10 | loss=1.2257 | acc=63.57% | time=20.76s
Epoch 02/10 | loss=0.6292 | acc=77.07% | time=21.53s
Epoch 03/10 | loss=0.5165 | acc=81.50% | time=24.50s
Epoch 04/10 | loss=0.4590 | acc=83.65% | time=19.70s
Epoch 05/10 | loss=0.4300 | acc=84.62% | time=19.00s
Epoch 06/10 | loss=0.3986 | acc=85.95% | time=19.10s
Epoch 07/10 | loss=0.3840 | acc=86.52% | time=18.90s
Epoch 08/10 | loss=0.3679 | acc=86.91% | time=19.20s
Epoch 09/10 | loss=0.3513 | acc=87.62% | time=23.30s
Epoch 10/10 | loss=0.3430 | acc=87.89% | time=23.00s
Done. total_time=208.99s
Saved model: /home/irteam/data-vol1/tb_logs/artifacts/fmnist_20260211-135212/model.pt
Saved metrics: /home/irteam/data-vol1/tb_logs/artifacts/fmnist_20260211-135212/metrics.json
Uploading to registry: project=yuhwan-test, model=registry_test, version=v1
[2K Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 609.1/609.1 kB • ? • 0:00:00 • 0:00:00
Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 609.1/609.1 kB • ? • 0:00:00 • 0:00:00
[13:52:13] Uploaded ( 609.1 kB) - /model.pt
Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 609.1/609.1 kB • ? • 0:00:00 • 0:00:00
[2K Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 141/141 bytes • ? • 0:00:00 • 0:00:00
Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 141/141 bytes • ? • 0:00:00 • 0:00:00
[13:52:14] Uploaded (141 bytes) - /metrics.json
Uploading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 141/141 bytes • ? • 0:00:00 • 0:00:00
Upload finished successfully.
⠹ Downloading ━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7% • 162.7/609.3 kB • 2.2 MB/s • 0:00:01 • 0:00:00
⠹ Downloading ━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7% • 162.7/609.3 kB • 2.2 MB/s • 0:00:01 • 0:00:00
[13:52:14] Downloaded (141 bytes) -
/tmp/model_registry_download/registry_test/v1/fmnist_20260211-135212/metrics.json
⠙ Downloading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0% • 0.0/609.3 kB • ? • -:--:-- • 0:00:00
Downloaded ( 609.1 kB) - /tmp/model_registry_download/registry_test/v1/fmnist_20260211-135212/model.pt
⠹ Downloading ━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7% • 162.7/609.3 kB • 2.2 MB/s • 0:00:01 • 0:00:00
Downloaded model: /tmp/model_registry_download/registry_test/v1/fmnist_20260211-135212/model.pt | size: 609110 bytes
Downloaded metrics: /tmp/model_registry_download/registry_test/v1/fmnist_20260211-135212/metrics.json | size: 141 bytes
Test Accuracy (downloaded model): 87.22%
Restored optimizer state from checkpoint.
Restored GradScaler state from checkpoint.
Extra training: epochs=3, lr=0.0005, amp=True
[MORE] Epoch 01/3 | loss=0.3333 | train_acc=88.33%
[MORE] Epoch 02/3 | loss=0.3272 | train_acc=88.47%
[MORE] Epoch 03/3 | loss=0.3198 | train_acc=88.84%
Test Accuracy (after extra training): 87.48%
All done.