Hi. Can anybody help me? PyTorch is using all CPU cores. How do I stop this? I have num_workers=1.
My PyTorch code is occupying a lot of CPU memory even though I am training on the GPU.
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer, scheduler, net_name, num_epochs):
    """Train and evaluate a net.

    Every epoch runs a 'train' phase followed by a 'val' phase. After each
    validation phase the current state is written to ``last{fold}.tar``; when
    the validation accuracy improves, ``best{fold}.tar`` and
    ``metrics{fold}.json`` are written as well (all under ``args.model_dir``).

    Relies on the module-level ``args`` and ``logging_process`` objects that
    are created in the ``__main__`` section.

    Args:
        fold: Fold number, used to name log, checkpoint and metrics files.
        dataloaders: Mapping with 'train' and 'val' entries, each iterable
            over batches whose last two elements are ``(inputs, labels)``.
        dataset_sizes: Mapping from phase name to its number of samples.
        net: Network to train; it is moved to the selected device.
        criterion: Loss, called as ``criterion(outputs, targets)`` where
            ``targets`` is the argmax of ``labels`` along dimension 1.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler; how it is stepped depends on the
            prefix of ``net_name``.
        net_name: Network name ('vgg', 'resnet', 'efficientnetb' and
            'inceptionv' prefixes are recognised).
        num_epochs: Number of the last epoch to run (epochs count from 1).
    """
    # Initialize logs
    logging_train = myutils.setup_logger(os.path.join(args.model_dir, f'train{fold}.log'))
    logging_lr = myutils.setup_logger(os.path.join(args.model_dir, f'lr{fold}.log'))

    # Reproducibility
    myutils.myseed(seed=42)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = net.to(device)
    # No in-memory copy of the best weights is kept: they are written to
    # best_path as soon as they are found, and a deepcopy of the state dict
    # would only hold extra memory.
    best_acc, start_epoch = 0.0, 1

    # .tar files used to save settings
    last_path = os.path.join(args.model_dir, f'last{fold}.tar')
    best_path = os.path.join(args.model_dir, f'best{fold}.tar')

    # To resume training for more epochs
    if args.resume:
        try:
            # map_location lets a checkpoint saved on one device be resumed
            # on another (e.g. saved on GPU, resumed on CPU).
            last_checkpoint = torch.load(last_path, map_location=device)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # Since the last epoch was saved we start with the next one
            start_epoch = last_checkpoint['epoch'] + 1
            logging_process.info(f'Model: {args.model_dir}\tLast epoch saved: {start_epoch-1}, resumming training since epoch: {start_epoch}')
            # Best accuracy so far, so that only real improvements overwrite best_path
            best_checkpoint = torch.load(best_path, map_location=device)
            best_acc = best_checkpoint['acc']
        except FileNotFoundError as err:
            # This error happens when folds are present: if training was
            # interrupted on fold 1, the checkpoints for fold 2 do not exist
            # yet. Log it and train that fold from scratch.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(start_epoch, num_epochs + 1):
        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                net.train()  # Set net to training mode
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for batch in dataloaders[phase]:
                # A dataset may prepend the sample index to each item
                # (index, image, label); only the last two fields are used.
                *_, inputs, labels = batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward; track history only in train
                with torch.set_grad_enabled(is_train):
                    outputs = net(inputs)
                    _, targets = torch.max(labels, 1)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, targets)

                    # Backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Batch statistics (loss is weighted by the batch size)
                running_loss += loss.detach().item() * inputs.size(0)
                running_corrects += torch.sum(preds == targets)

            # Schedulers that are stepped per training epoch
            if is_train:
                if net_name.startswith('efficientnetb'):
                    scheduler.step()
                elif net_name.startswith('inceptionv') and epoch % 2 == 0:
                    scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # Save last settings to .tar file
                torch.save({
                    'epoch': epoch,
                    'net_state_dict': net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': epoch_loss
                }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    # Save best settings to .tar file
                    torch.save({
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss,
                        'acc': best_acc
                    }, best_path)
                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        json.dump(best_metrics, f)

                # Schedulers that react to the validation metrics
                if net_name.startswith('vgg'):
                    scheduler.step(best_acc)
                if net_name.startswith('resnet'):
                    scheduler.step(epoch_loss)

    print('Best val Acc: {:4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:4f}'.format(args.model_dir, fold, best_acc))
if __name__ == '__main__':
    # Entry point: validate the command-line paths, set up the process log,
    # then build data, network, optimizer, scheduler and loss for every fold
    # and hand them to train_eval.
    args = parser.parse_args()

    # Explicit checks instead of assert, which is stripped under `python -O`
    for directory, message in (
        (args.data_dir, "Could not find the dataset at {}"),
        (args.model_dir, "Could not find the model at {}"),
        (args.net_dir, "Could not find the network at {}"),
    ):
        if not os.path.isdir(directory):
            raise NotADirectoryError(message.format(directory))

    # Initialize main log folder (exist_ok avoids the check-then-create race)
    logs_dir_path = os.path.join(os.getcwd(), 'Logs')
    os.makedirs(logs_dir_path, exist_ok=True)

    # Initialize main log file
    log_file = os.path.join(logs_dir_path, 'process.log')
    logging_process = myutils.setup_logger(log_file, date=True)

    # Save commandline settings to log
    script_activated = ' '.join(sys.argv)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging_process.info(f'Script: {script_activated}, device: {device}')

    # Get the experiment parameters
    params_file = os.path.join(args.model_dir, 'params.json')
    if not os.path.isfile(params_file):
        raise FileNotFoundError("No json configuration file found at {}".format(params_file))
    params = myutils.Params(params_file)

    # FOLD LOOP
    dfs = {}
    for fold in range(args.fold_start, args.folds + 1):
        # Load data from .csv files; the fold number is part of the file
        # name only when more than one fold is used.
        suffix = fold if args.folds > 1 else ''
        tname = os.path.join(args.data_dir, f'train{suffix}.csv')
        vname = os.path.join(args.data_dir, f'val{suffix}.csv')
        train = pd.read_csv(tname)
        print(tname)
        val = pd.read_csv(vname)
        dfs['train'] = train
        dfs['val'] = val
        mean, std = myutils.get_stats(train, params.size)
        logging_process.info(f'Model: {args.model_dir}\tFold: {fold}\tTrain: {tname}\tMean: {mean}\tStd: {std}')

        # NETWORK SETTINGS
        # Data
        loaders = myutils.get_module(args.net_dir, 'loaders')
        dataloaders, dataset_sizes = loaders.get_loaders(
            dfs, mean, std,
            size=params.size,
            batch_size=params.batch_size,
            num_workers=params.num_workers,
        )

        # Net
        net = myutils.get_network(args.net_dir, params.network)
        optimizer = myutils.get_optimizer(params.optimizer, net, params.learning_rate, params.momentum, params.weight_decay)

        # Schedulers
        if params.network.startswith('vgg'):
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, mode='max')
        elif params.network.startswith('efficientnetb'):
            # "Decay by 0.97 every 2.4 epochs". StepLR documents step_size as
            # an int, so the fractional period is expressed as the equivalent
            # per-epoch factor instead.
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.97 ** (1 / 2.4))
        elif params.network.startswith('resnet'):
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, mode='min')
        elif params.network.startswith('inception'):
            # NOTE(review): train_eval only steps this scheduler for names
            # starting with 'inceptionv' - confirm the prefixes are meant to differ.
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.94)
        else:
            # Previously this fell through and failed later with a NameError
            raise ValueError(f'No learning-rate scheduler is defined for network {params.network!r}')

        # Loss function
        weight = myutils.get_weight(train)
        weight = weight.to(device)
        criterion = myutils.get_loss_fn(args.net_dir, params.network, weight)
        # Log the file that was actually read (train.csv when there is a single fold)
        logging_process.info(f'Model: {args.model_dir}\tFile: {os.path.basename(tname)}\tWeight: {weight}')

        # Train
        print(f'Fold {fold}')
        print('-' * 10)
        logging_process.info(f'Model: {args.model_dir}\tFold: {fold}, training has started for {params.num_epochs} epochs')
        train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer, scheduler, net_name=params.network, num_epochs=params.num_epochs)
        logging_process.info(f'Model: {args.model_dir}\tFold: {fold}, training has ended')
Yes. I am seeing around 30% GPU usage with gpustat. It varies from 0% to 30%.
I was using these settings to train.
"size":224,
"batch_size":32,
"num_workers":1,
"network":"vgg16_ft",
"optimizer":"SGD",
"momentum":0.9,
"weight_decay":0.005,
"learning_rate":0.01,
"num_epochs":50
I tried the tests in isolation. The leak happens in the data loading. This is my dataloader code:
import torch
import torchvision
import myutils
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
class Derm(Dataset):
    """Dataset backed by a pandas dataframe of image paths and labels.

    The image path is read from the 'filenames' column and the label vector
    from the third column onwards. Defined at module level so that instances
    can be pickled when DataLoader workers are started with the spawn method.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(index, image, label)`` for the row at position ``index``."""
        # Positional lookup, consistent with the iloc-based label lookup below
        path = self.df['filenames'].iloc[index]
        try:
            # The context manager closes the underlying file right away;
            # convert() returns an independent, fully loaded image.
            with Image.open(path) as img:
                X = img.convert('RGB')
        except IOError as err:
            # Name the offending file instead of failing later on an unbound X
            raise IOError(f'Could not load image {path!r} (row {index})') from err
        y = torch.tensor(self.df.iloc[index, 2:])
        if self.transform is not None:
            X = self.transform(X)
        return index, X, y


def build_loader(dataset, split, batch_size, num_workers):
    """Wrap ``dataset`` in a DataLoader; only the 'train' split is shuffled.

    Arguments are passed by keyword: DataLoader's third positional parameter
    is ``shuffle``, so passing ``num_workers`` positionally never enabled any
    worker and shuffled every split instead.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(split == 'train'),
        num_workers=num_workers,
        pin_memory=False,
    )


def get_loaders(dfs, mean, std, size, batch_size, num_workers):
    """Build the dataloaders and dataset sizes for a dictionary of dataframes.

    Args:
        dfs: Mapping from split name (e.g. 'train', 'val') to a dataframe
            with a 'filenames' column and the labels from the third column on.
        mean: Per-channel mean passed to ``transforms.Normalize``.
        std: Per-channel standard deviation passed to ``transforms.Normalize``.
        size: Target size used for both the resize and the square centre crop.
        batch_size: Number of samples per batch.
        num_workers: Number of DataLoader worker processes.

    Returns:
        A ``(dataloaders, dataset_sizes)`` pair of dictionaries keyed like ``dfs``.
    """
    # Reproducibility
    myutils.myseed(seed=42)

    # Every split uses the same deterministic preprocessing pipeline
    preprocess = transforms.Compose([
        transforms.Resize(size),
        transforms.CenterCrop((size, size)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    # Sets
    image_datasets = {split: Derm(df, transform=preprocess) for split, df in dfs.items()}
    # Sizes
    dataset_sizes = {split: len(dataset) for split, dataset in image_datasets.items()}
    # Loaders
    dataloaders = {
        split: build_loader(dataset, split, batch_size, num_workers)
        for split, dataset in image_datasets.items()
    }
    return dataloaders, dataset_sizes
I also deleted:
best_net_wts = copy.deepcopy(net.state_dict())
because it is a variable that I was not using. This helped a bit, but the overhead is still high.
It seems that PIL is using all CPU cores.
I don’t know, if PIL is able to use all cores for the processing, but you could verify it by running PIL transformations in isolation and check the CPU usage.
Did you replace the “vanilla” PIL with any other drop-in library?
CC @fmassa: in case you’ve seen this behavior before.
Thanks for the isolation advice. It did come down to the PIL image opening using all the cores. I read that this is more of a Python issue than a PIL one.
I had conda install pillow=6.2.1
And I did find a very similar open issue on GitHub for Pillow:
github.com/python-pillow/Pillow
Memory is slowly leaking when process many image files one by one.
Leak size:
100K images --> ~100MB
300K images --> ~200MB
1M images -->...
Memory
NumPy
Platform
I found a solution:
https://pillow.readthedocs.io/en/stable/installation.html?highlight=cpu
Environment variable: MAX_CONCURRENCY=n . Pillow can use multiprocessing to build the extension. Setting MAX_CONCURRENCY sets the number of CPUs to use, or can disable parallel building by using a setting of 1. By default, it uses 4 CPUs, or if 4 are not available, as many as are present.
I also moved all resizing and preprocessing outside of torch.