TensorBoard 생성
학습 로그 저장 경로 확인
- TensorBoard 생성 전, 학습할 Notebook/PyTorchJob 생성 필수
(예시)
LOGDIR = "/home/irteam/tb_logs"
- TensorBoard를 연결하기 위한 코드 주석 참고
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets
from torch.utils.tensorboard import SummaryWriter
# Save TensorBoard logs under a writable PVC path.
# /home -> /home1 (symlink); /home/irteam is mounted read-write over NFS.
LOGDIR = "/home/irteam/tb_logs"

# Fail fast with explicit exceptions instead of `assert`: asserts are
# stripped when Python runs with -O, which would silently skip the check.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA GPU not available")
if torch.cuda.device_count() < 4:
    raise RuntimeError("Need at least 4 GPUs")

device = torch.device("cuda")
# Allow TF32 matmuls (Ampere+) for faster fp32 math in this throughput test.
torch.backends.cuda.matmul.allow_tf32 = True
print("Visible GPUs:", torch.cuda.device_count())

# One sub-directory per run so each run appears separately in TensorBoard.
run_name = time.strftime("run-%Y%m%d-%H%M%S")
log_path = os.path.join(LOGDIR, run_name)
os.makedirs(log_path, exist_ok=True)
writer = SummaryWriter(log_dir=log_path)
print("TensorBoard logdir:", log_path)
# Download FashionMNIST (train split) once to local disk.
# transform=None because the custom Dataset below normalizes the raw
# uint8 tensors itself.
raw = datasets.FashionMNIST(
root="/tmp/data",
train=True,
download=True,
transform=None
)
class FMNISTTensor(torch.utils.data.Dataset):
    """Thin Dataset over FashionMNIST's raw tensors.

    Wraps the dataset's uint8 image tensor and label tensor directly,
    returning per-sample (1, 28, 28) float images scaled to [0, 1].
    """

    def __init__(self, base):
        # `base` is expected to expose .data (N, 28, 28 uint8) and
        # .targets (N,) — the attributes torchvision's FashionMNIST provides.
        self.x = base.data
        self.y = base.targets

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # Add the channel axis, then scale uint8 [0, 255] -> float [0, 1].
        image = self.x[idx].float().unsqueeze(0) / 255.0
        label = self.y[idx]
        return image, label
train_dataset = FMNISTTensor(raw)
# pin_memory=True enables faster async host-to-device copies, paired with
# the non_blocking=True .to() calls in the training loop below.
train_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=1024, # total batch size (256 per GPU across 4 GPUs)
shuffle=True,
num_workers=0,
pin_memory=True
)
# Small CNN classifier: (N, 1, 28, 28) input -> 10-class logits.
# After Conv(64) -> Conv(128) -> MaxPool(2), the 28x28 map is 128 x 14 x 14,
# which fixes the first Linear layer's input size.
_layers = [
    nn.Conv2d(1, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(64, 128, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Flatten(),
    nn.Linear(128 * 14 * 14, 512),
    nn.ReLU(),
    nn.Linear(512, 10),
]
model = nn.Sequential(*_layers)
# Single-process 4-GPU data parallelism: DataParallel splits each batch
# across GPUs 0-3 and gathers the outputs on GPU 0.
# NOTE(review): DistributedDataParallel is generally recommended over
# DataParallel for real training; DP is adequate for this monitoring test.
model = nn.DataParallel(model, device_ids=[0, 1, 2, 3]).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Gradient scaler for fp16 mixed precision: scales the loss so small
# gradients do not underflow in half precision.
# NOTE(review): recent torch versions prefer torch.amp.GradScaler("cuda")
# over torch.cuda.amp.GradScaler — confirm the installed version.
scaler = torch.cuda.amp.GradScaler()
print("Start training (4-GPU monitoring test, ~10 min)")

# Train for a fixed wall-clock budget (10 minutes) rather than a fixed
# number of epochs; the deadline is also re-checked inside the batch loop
# so a long epoch cannot overshoot it.
end = time.time() + 600
step = 0
model.train()

while time.time() < end:
    for x, y in train_loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        # fp16 autocast covers both the forward pass and the loss.
        with torch.cuda.amp.autocast(dtype=torch.float16):
            out = model(x)
            loss = criterion(out, y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        step += 1

        # TensorBoard logging: training progress plus GPU-memory metrics.
        if step % 10 == 0:
            writer.add_scalar("train/loss", float(loss.item()), step)
            writer.add_scalar("train/lr", float(optimizer.param_groups[0]["lr"]), step)
            # GPU memory metrics (device 0), converted from bytes to MiB.
            mem_metrics = {
                "gpu/mem_allocated_MiB": torch.cuda.memory_allocated(),
                "gpu/mem_reserved_MiB": torch.cuda.memory_reserved(),
                "gpu/max_mem_allocated_MiB": torch.cuda.max_memory_allocated(),
            }
            for tag, num_bytes in mem_metrics.items():
                writer.add_scalar(tag, num_bytes / 1024 / 1024, step)

        if step % 50 == 0:
            print(f"step={step}, loss={loss.item():.4f}")

        # Stop mid-epoch once the wall-clock budget is exhausted.
        if time.time() >= end:
            break

# Wait for outstanding GPU work before reading final memory stats.
torch.cuda.synchronize()

# Flush and close the writer so all pending events reach disk.
writer.flush()
writer.close()
print("Done")
print("TensorBoard logdir:", log_path)
print("Max GPU Mem (MiB):",
      torch.cuda.max_memory_allocated() / 1024 / 1024)
TensorBoard Create 버튼 클릭
New TensorBoard 팝업에서 Mount Path 설정
- TensorBoard의 Mount Path는 학습 코드의 LOGDIR 경로와 동일하게 설정
- TensorBoard의 CRD와 Pod가 생성되며 Connect 버튼이 활성화
kim@Clush:~$ k get tensorboards
NAME AGE
test 103m
test2 93m # 생성했던 Tensorboard
kim@Clush:~$ k get po
NAME READY STATUS RESTARTS AGE
gpu-burn-3m-worker-0 0/1 Completed 0 2d3h
ml-pipeline-ui-artifact-7cf99c78c4-2422m 2/2 Running 0 3d4h
ml-pipeline-visualizationserver-74dd4d6b77-z2h8t 2/2 Running 0 3d4h
monitoring-test-0 2/2 Running 0 153m
test-1-0 2/2 Running 0 2d7h
test-8794f985-f2x9b 2/2 Running 0 103m
test-gpu-0 2/2 Running 0 2d4h
test-with-config-0 0/2 Pending 0 6h33m
test2-766c68c4cf-x642n 2/2 Running 0 93m
volume-test-1-0 0/2 Pending 0 128m
kim@Clush:~$
Connect 버튼 클릭 후 TensorBoard 활성화