本文整理汇总了Python中torch.distributed.get_world_size函数的典型用法代码示例。如果您正苦于以下问题:Python get_world_size函数的具体用法?Python get_world_size怎么用?Python get_world_size使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了get_world_size函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: flat_dist_call
def flat_dist_call(tensors, call, extra_args=None):
    """Apply a collective ``call`` to ``tensors`` in flattened, per-type buckets.

    Tensors are grouped by the string returned from ``tensor.type()``. Each
    group is flattened into one buffer, passed to ``call`` (followed by
    ``extra_args`` when given), divided in place by ``dist.get_world_size()``
    and finally copied back into the original tensors.

    Args:
        tensors: Iterable of tensors to synchronize in place.
        call: Callable invoked as ``call(coalesced, *extra_args)``.
        extra_args: Optional sequence of extra positional arguments for
            ``call``; ``None`` means ``call(coalesced)``.
    """
    # Create the warn-once flag only if it does not exist yet. Re-assigning
    # True on every call would defeat the "print the warning once" logic.
    if not hasattr(flat_dist_call, "warn_on_half"):
        flat_dist_call.warn_on_half = True

    buckets = {}
    for tensor in tensors:
        buckets.setdefault(tensor.type(), []).append(tensor)

    # Bucket keys are type-name strings, so the membership test has to use the
    # name; the class object ``torch.cuda.HalfTensor`` never equals a str key.
    if flat_dist_call.warn_on_half and "torch.cuda.HalfTensor" in buckets:
        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
              " It is recommended to use the NCCL backend in this case.")
        flat_dist_call.warn_on_half = False

    for bucket in buckets.values():
        coalesced = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(coalesced, *extra_args)
        else:
            call(coalesced)
        # Divide by the world size after the collective call.
        coalesced /= dist.get_world_size()
        for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
            buf.copy_(synced)
开发者ID:Henley13,项目名称:imagenet-fast,代码行数:25,代码来源:distributed.py
示例2: _process_batch
def _process_batch():
    """Reduce one queued batch of per-device gradients and write the result back.

    Pulls ``(dev_grad_batch, dev_events, job_event)`` from ``queue``, flattens
    each device's gradients, reduces them onto ``device_ids[0]`` with
    ``nccl.reduce``, divides by the world size, all-reduces across processes
    and copies the result into the first device's gradient tensors. Finally
    ``job_event`` is set.

    ``queue``, ``device_ids``, ``reduction_streams``, ``nccl_streams`` and
    ``group_id`` are not defined in this block; they are presumably provided
    by the enclosing scope.
    """
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            # Make the stream wait for this device's event before flattening.
            stream.wait_event(event)
            coalesced = _flatten_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    nccl.reduce(dev_coalesced, root=device_ids[0], streams=nccl_streams)
    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        # Order the work below after the NCCL reduction on the first device.
        reduce_stream.wait_stream(nccl_streams[0])
        # Divide before the all-reduce so the reduced result is already scaled.
        coalesced /= dist.get_world_size()
        dist.all_reduce(coalesced, group=group_id)
        for grad, reduced in zip(grad_batch, _unflatten_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    # NOTE(review): indentation was lost in this copy; setting the event after
    # the stream context is the assumed placement -- confirm against upstream.
    job_event.set()
开发者ID:athiwatp,项目名称:pytorch,代码行数:25,代码来源:distributed.py
示例3: test_send_recv
def test_send_recv(self):
    """Send a tensor to every other rank, then receive and verify one from each."""
    my_rank = dist.get_rank()
    outgoing = _build_tensor(my_rank + 1)

    # Send phase: every peer except ourselves gets the same tensor.
    for peer in range(0, dist.get_world_size()):
        if peer != my_rank:
            dist.send(outgoing, peer)

    # Receive phase: the buffer starts at -1 so stale data cannot pass.
    for peer in range(0, dist.get_world_size()):
        if peer == my_rank:
            continue
        received = _build_tensor(peer + 1, value=-1)
        expected = _build_tensor(peer + 1)
        dist.recv(received, peer)
        self.assertEqual(received, expected)

    self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:17,代码来源:test_distributed.py
示例4: test_send_recv_any_source
def test_send_recv_any_source(self):
    """Send to all peers, receive without naming a source, and count distinct senders."""
    my_rank = dist.get_rank()
    payload = _build_tensor(10, my_rank)

    for peer in range(0, dist.get_world_size()):
        if peer != my_rank:
            dist.send(payload, peer)

    seen_ranks = set()
    for peer in range(0, dist.get_world_size()):
        if peer == my_rank:
            continue
        incoming = _build_tensor(10, value=-1)
        # No source argument: accept the message from whichever rank sends.
        dist.recv(incoming)
        seen_ranks.add(incoming.resize_(1)[0])

    self.assertEqual(len(seen_ranks), dist.get_world_size() - 1)
    self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:18,代码来源:test_distributed.py
示例5: reduction_fn_nccl
def reduction_fn_nccl():
    """All-reduce the gradients of every module replica in buckets, once per pass.

    Does nothing unless ``self.need_reduction`` is set. Otherwise it collects
    ``param.grad.data`` per replica, buckets them with ``_take_tensors``,
    flattens each bucket per device, runs ``dist.all_reduce_multigpu`` on the
    flattened buffers, divides the first device's buffer by the world size and
    copies it back into the first replica's gradients. Gradients and data of
    the remaining replicas are cleared afterwards.

    ``self`` is not a parameter of this function; it is presumably captured
    from the enclosing method.

    Raises:
        RuntimeError: If a parameter's gradient itself requires grad.
    """
    # This function only needs to be called once
    if not self.need_reduction:
        return
    self.need_reduction = False
    all_grads = [[] for _ in range(len(self._module_copies))]
    all_grads_buckets_iters = []
    # Bucketing all the gradients
    for dev_idx, module in enumerate(self._module_copies):
        for param in module.parameters():
            if not param.requires_grad or param.grad is None:
                continue
            if param.grad.requires_grad:
                raise RuntimeError("DistributedDataParallel only works "
                                   "with gradients that don't require "
                                   "grad")
            # Adding the gradients for reduction
            all_grads[dev_idx].append(param.grad.data)
        # Now bucketing the parameters
        dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                          self.nccl_reduce_bucket_size)
        all_grads_buckets_iters.append(dev_grads_buckets)
    # Now reduce each bucket one after another
    for grads_batch in zip(*all_grads_buckets_iters):
        grads_batch_coalesced = []
        # Coalesce each bucket
        for dev_idx, dev_grads_batch in enumerate(grads_batch):
            dev_id = self.device_ids[dev_idx]
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)
        # We will only use device 0's results, but this single op should be
        # faster than doing the following two operation sequentially:
        # (1) intra-node reduce to lead GPU, followed by
        # (2) inter-node allreduce for all the first lead GPUs in all nodes
        dist.all_reduce_multigpu(grads_batch_coalesced,
                                 group=self.nccl_reduction_group_id)
        # Now only work on the first device of self.device_ids, uncoalesce
        # the gradients for each bucket
        grads_batch_coalesced[0] /= dist.get_world_size()
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0])
        for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
            grad.copy_(reduced)
    # clear the gradients and save memory for replicas
    for module in self._module_copies[1:]:
        for param in module.parameters():
            if param.requires_grad:
                param.grad = None
                param.data.set_()
开发者ID:inkawhich,项目名称:pytorch,代码行数:57,代码来源:distributed.py
示例6: __init__
def __init__(self, num_replicas=None, rank=None):
    """Store the replica count and rank, defaulting to the process group's values.

    Args:
        num_replicas: Number of replicas; ``get_world_size()`` when ``None``.
        rank: Rank of this replica; ``get_rank()`` when ``None``.
    """
    replica_count = get_world_size() if num_replicas is None else num_replicas
    own_rank = get_rank() if rank is None else rank

    self.num_replicas = replica_count
    self.rank = own_rank
    self.epoch = 0
    self.extra = 0
开发者ID:AhlamMD,项目名称:decaNLP,代码行数:9,代码来源:iterator.py
示例7: test_mpi
def test_mpi():
    """Initialize the MPI backend and all-reduce a one-hot rank vector as a smoke test."""
    dist.init_process_group('mpi')
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    # One-hot encoding of this rank.
    one_hot = [0] * world_size
    one_hot[rank] = 1
    summed = torch.DoubleTensor(one_hot)
    dist.all_reduce(summed, op=dist.reduce_op.SUM)

    print("Host {} : Rank {} : {}".format(get_hostname(), rank, summed))
开发者ID:mlbench,项目名称:mlbench,代码行数:11,代码来源:main.py
示例8: __init__
def __init__(self, dataset, num_replicas=None, rank=None):
    """Set up per-replica sampling bookkeeping for ``dataset``.

    Args:
        dataset: Sized dataset to sample from.
        num_replicas: Number of replicas; ``get_world_size()`` when ``None``.
        rank: Rank of this replica; ``get_rank()`` when ``None``.
    """
    num_replicas = get_world_size() if num_replicas is None else num_replicas
    rank = get_rank() if rank is None else rank

    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0

    # Round up so every replica is assigned the same number of samples.
    per_replica = math.ceil(len(self.dataset) / self.num_replicas)
    self.num_samples = int(per_replica)
    self.total_size = self.num_samples * self.num_replicas
开发者ID:athiwatp,项目名称:pytorch,代码行数:11,代码来源:distributed.py
示例9: allreduce_params
def allreduce_params():
    """All-reduce and average the module's gradients, grouped by tensor type.

    Runs only while ``self.needs_reduction`` is set and clears the flag first.
    ``self`` comes from the enclosing scope.
    """
    if not self.needs_reduction:
        return
    self.needs_reduction = False

    params_by_type = defaultdict(list)
    for p in self.module.parameters():
        if p.requires_grad and p.grad is not None:
            params_by_type[type(p.data)].append(p)

    for same_type_params in params_by_type.values():
        grads = [p.grad.data for p in same_type_params]
        flat = _flatten_dense_tensors(grads)
        dist.all_reduce(flat)
        flat /= dist.get_world_size()
        # Scatter the averaged buffer back into the individual gradients.
        for target, averaged in zip(grads, _unflatten_dense_tensors(flat, grads)):
            target.copy_(averaged)
开发者ID:RichieMay,项目名称:pytorch,代码行数:16,代码来源:distributed_cpu.py
示例10: _init_multigpu_helper
def _init_multigpu_helper(self):
    """Split the visible GPUs evenly across ranks and return the assignment.

    Multi-GPU tests simulate several nodes with several GPUs each; on a single
    node every process gets an equally sized, contiguous subset of the
    visible devices.

    Returns:
        dict mapping each rank to the list of GPU indices it may use.
    """
    gpu_count = torch.cuda.device_count()
    world_size = dist.get_world_size()
    all_devices = range(gpu_count)

    if BACKEND == 'nccl':
        apply_hack_for_nccl()

    per_process = gpu_count // world_size
    rank_to_gpu = {}
    for rank in range(world_size):
        begin = rank * per_process
        end = (rank + 1) * per_process
        rank_to_gpu[rank] = list(all_devices[begin:end])
    return rank_to_gpu
开发者ID:xiongyw,项目名称:pytorch,代码行数:17,代码来源:test_distributed.py
示例11: config_pytorch
def config_pytorch(options):
    """Configure PyTorch for a run described by ``options``.

    Optionally enables deterministic cuDNN, seeds ``random`` and ``torch``,
    stores the distributed rank and world size on ``options``, builds the
    ``FCGraph`` and, when CUDA is requested, enables cuDNN and its benchmark
    mode.

    :param options: A global object containing specified options. It is
        modified in place (``rank``, ``world_size`` and ``graph`` are set).
    :type options: argparse.Namespace
    """
    # Setting `cudnn.deterministic = True` will turn on
    # CUDNN deterministic setting which can slow down training considerably.
    # Unexpected behavior may also be observed from checkpoint.
    # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    if options.cudnn_deterministic:
        cudnn.deterministic = True
        log.warning('You have chosen to seed training. '
                    'This will turn on the CUDNN deterministic setting, '
                    'which can slow down your training considerably! '
                    'You may see unexpected behavior when restarting '
                    'from checkpoints.', 0)
    # Seed Python's and torch's random number generators when a seed is given.
    if options.seed is not None:
        random.seed(options.seed)
        torch.manual_seed(options.seed)
    # define the graph for the computation.
    if options.use_cuda:
        assert torch.cuda.is_available()
    options.rank = dist.get_rank()
    options.world_size = dist.get_world_size()
    options.graph = FCGraph(options)
    # enable cudnn accelerator if we are using cuda.
    if options.use_cuda:
        options.graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
        if torch.backends.cudnn.version() is None:
            log.warning("CUDNN not found on device.")
        # NOTE(review): indentation was lost in this copy; this summary log is
        # assumed to sit inside the CUDA branch because it queries
        # torch.cuda.current_device() -- confirm against the upstream file.
        log.info("World size={}, Rank={}, hostname={}, cuda_available={}, cuda_device={}".format(
            options.world_size, options.rank, socket.gethostname(), torch.cuda.is_available(),
            torch.cuda.current_device()))
开发者ID:mlbench,项目名称:mlbench,代码行数:46,代码来源:config.py
示例12: __init__
def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None):
    """
    Group consecutive sample indices into fixed-size bins (the data is assumed
    to be ordered by size) and record how many bins each replica receives.
    """
    super(DistributedBucketingSampler, self).__init__(data_source)
    num_replicas = get_world_size() if num_replicas is None else num_replicas
    rank = get_rank() if rank is None else rank

    self.data_source = data_source
    self.ids = list(range(0, len(data_source)))
    self.batch_size = batch_size

    # Slice the id list into bins of ``batch_size``; the last may be shorter.
    bin_starts = range(0, len(self.ids), batch_size)
    self.bins = [self.ids[start:start + batch_size] for start in bin_starts]

    self.num_replicas = num_replicas
    self.rank = rank
    bins_per_replica = math.ceil(len(self.bins) * 1.0 / self.num_replicas)
    self.num_samples = int(bins_per_replica)
    self.total_size = self.num_samples * self.num_replicas
开发者ID:vamsimynam,项目名称:deepspeech.pytorch,代码行数:17,代码来源:data_loader.py
示例13: test_isend
def test_isend(self):
    """Rank 0 sends asynchronously to every other rank; the others receive and verify."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank != 0:
        received = _build_tensor(rank, -1)
        dist.recv(received, 0)
        self.assertEqual(received, _build_tensor(rank, 10))
    else:
        pending = []
        for dest in range(1, world_size):
            pending.append(dist.isend(_build_tensor(dest, 10), dest))
        for req in pending:
            req.wait()
            self.assertTrue(req.is_completed())

    self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:17,代码来源:test_distributed.py
示例14: test_irecv
def test_irecv(self):
    """Rank 0 posts asynchronous receives from every other rank and verifies the data."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank != 0:
        dist.send(_build_tensor(rank, 10), 0)
    else:
        sources = range(1, world_size)
        # Buffers start at -1 so an unfinished receive cannot look correct.
        buffers = [_build_tensor(src, -1) for src in sources]
        pending = [dist.irecv(buf, src) for src, buf in zip(sources, buffers)]
        for src, buf, req in zip(sources, buffers, pending):
            req.wait()
            self.assertTrue(req.is_completed())
            self.assertEqual(buf, _build_tensor(src, 10))

    self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:19,代码来源:test_distributed.py
示例15: test_get_rank
def test_get_rank(self):
    """Check that every process reports a distinct rank, using one file per process."""
    rank_dir = os.path.join(TEMP_DIR, 'test_dir')
    own_pid = str(os.getpid())
    expected_count = dist.get_world_size()

    # Each process records its rank in a file named after its pid.
    with open(os.path.join(rank_dir, own_pid), 'w') as out:
        out.write(str(dist.get_rank()))
    self._barrier()

    ranks_seen = set()
    for entry in os.listdir(rank_dir):
        with open(os.path.join(rank_dir, entry), 'r') as src:
            ranks_seen.add(int(src.read()))
    self.assertEqual(len(ranks_seen), expected_count)
    self._barrier()

    # Rank 0 removes the files once everybody has finished reading them.
    if dist.get_rank() == 0:
        for entry in os.listdir(rank_dir):
            os.unlink(os.path.join(rank_dir, entry))
    self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:22,代码来源:test_distributed.py
示例16: sync
def sync(cls, timeout=5):
    """File-based barrier: block until every process has reached this barrier id.

    Each process writes the current ``cls.barrier_id`` to a file named after
    its pid, then polls the barrier directory until the number of files
    holding an id at least that large equals the world size.

    Args:
        timeout: Seconds to wait before giving up.

    Raises:
        RuntimeError: If the barrier is not reached within ``timeout`` seconds.
    """
    cls.barrier_id += 1
    barrier_dir = os.path.join(TEMP_DIR, 'barrier')
    own_file = os.path.join(barrier_dir, str(os.getpid()))
    with _lock():
        with open(own_file, 'w') as handle:
            handle.write(str(cls.barrier_id))

    began = time.time()
    while True:
        arrived = 0
        with _lock():
            for entry in os.listdir(barrier_dir):
                with open(os.path.join(barrier_dir, entry), 'r') as handle:
                    recorded = int(handle.read())
                if recorded >= cls.barrier_id:
                    arrived += 1
        if arrived == dist.get_world_size():
            break
        if time.time() - began > timeout:
            raise RuntimeError("barrier timeout")
        time.sleep(0.1)
开发者ID:athiwatp,项目名称:pytorch,代码行数:24,代码来源:test_distributed.py
示例17: _init_multigpu_helper
def _init_multigpu_helper(self):
    """Split the visible GPUs evenly across ranks and return the assignment.

    Multi-GPU tests simulate several nodes with several GPUs each; on a single
    node every process gets an equally sized, contiguous subset of the
    visible devices.

    Returns:
        dict mapping each rank to a slice of the visible device range.
    """
    gpu_count = torch.cuda.device_count()
    world_size = dist.get_world_size()
    all_devices = range(gpu_count)

    # This is a hack for a known NCCL issue using multiprocess
    # in conjunction with multiple threads to manage different GPUs which
    # may cause ncclCommInitRank to fail.
    # http://docs.nvidia.com/deeplearning/sdk/nccl-release-notes/rel_2.1.4.html#rel_2.1.4
    # It slows down the performance of collective operations.
    # Without this setting NCCL might throw unhandled error.
    os.environ['NCCL_MAX_NRINGS'] = '1'

    per_process = int(gpu_count / world_size)
    return {
        rank: all_devices[rank * per_process: (rank + 1) * per_process]
        for rank in range(world_size)
    }
开发者ID:Jsmilemsj,项目名称:pytorch,代码行数:24,代码来源:test_distributed.py
示例18: allreduce_params
def allreduce_params():
    """All-reduce and average the module's gradients, grouped by tensor type.

    Runs only while ``self.needs_reduction`` is set and clears the flag first.
    Prints a one-time warning when a ``torch.cuda.HalfTensor`` bucket exists.
    ``self`` comes from the enclosing scope.
    """
    if not self.needs_reduction:
        return
    self.needs_reduction = False

    params_by_type = {}
    for p in self.module.parameters():
        if p.requires_grad and p.grad is not None:
            params_by_type.setdefault(type(p.data), []).append(p)

    if self.warn_on_half and torch.cuda.HalfTensor in params_by_type:
        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
              " It is recommended to use the NCCL backend in this case.")
        self.warn_on_half = False

    for same_type_params in params_by_type.values():
        grads = [p.grad.data for p in same_type_params]
        flat = _flatten_dense_tensors(grads)
        dist.all_reduce(flat)
        flat /= dist.get_world_size()
        # Scatter the averaged buffer back into the individual gradients.
        for target, averaged in zip(grads, _unflatten_dense_tensors(flat, grads)):
            target.copy_(averaged)
开发者ID:vamsimynam,项目名称:deepspeech.pytorch,代码行数:24,代码来源:distributed.py
示例19: _train
def _train(args):
    """Train ``Net`` on CIFAR-10 and save the resulting model.

    Initializes ``torch.distributed`` when more than one host and a backend
    are configured, builds the train/test loaders, runs SGD for
    ``args.epochs`` epochs and returns whatever ``_save_model`` returns.

    Args:
        args: Parsed options; the code reads ``hosts``, ``current_host``,
            ``dist_backend``, ``num_gpus``, ``data_dir``, ``batch_size``,
            ``workers``, ``lr``, ``momentum``, ``epochs`` and ``model_dir``.
    """
    is_distributed = len(args.hosts) > 1 and args.dist_backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size)
        backend_part = 'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            args.dist_backend, dist.get_world_size())
        rank_part = 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
            dist.get_rank(), torch.cuda.is_available(), args.num_gpus)
        logger.info(backend_part + rank_part)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info("Device Type: {}".format(device))

    logger.info("Loading Cifar10 dataset")
    normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    trainset = torchvision.datasets.CIFAR10(root=args.data_dir, train=True,
                                            download=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers)

    testset = torchvision.datasets.CIFAR10(root=args.data_dir, train=False,
                                           download=False, transform=transform)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size,
                                              shuffle=False, num_workers=args.workers)

    logger.info("Model loaded")
    model = Net()

    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(0, args.epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            # Move the batch to the training device.
            inputs, labels = inputs.to(device), labels.to(device)

            # Reset gradients, then forward + backward + optimizer step.
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Report the mean loss every 2000 mini-batches.
            running_loss += loss.item()
            if (i + 1) % 2000 == 0:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

    print('Finished Training')
    return _save_model(model, args.model_dir)
开发者ID:ingonader,项目名称:amazon-sagemaker-examples-clone,代码行数:70,代码来源:cifar10.py
示例20: test_synchronize_sgd
def test_synchronize_sgd():
    """Train a two-layer network with plain SGD, averaging weights across ranks each step.

    Every rank draws the same data and initial weights (fixed seed), keeps the
    rows ``rank::world_size`` of the batch, and after each local update
    all-reduces both weight matrices and divides them by the world size.
    """
    torch.manual_seed(42)
    dist.init_process_group('mpi')
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    device = torch.device('cpu')
    # device = torch.device('cuda') # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Random inputs and targets; each rank keeps its own strided shard.
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)
    x = x[rank::world_size]
    y = y[rank::world_size]

    # Weights tracked by autograd so backward() fills in their .grad.
    w1 = torch.randn(D_in, H, device=device, requires_grad=True)
    w2 = torch.randn(H, D_out, device=device, requires_grad=True)
    weights = (w1, w2)

    learning_rate = 1e-6
    for t in range(500):
        # Forward pass: linear -> ReLU (clamp at zero) -> linear.
        hidden = x.mm(w1).clamp(min=0)
        y_pred = hidden.mm(w2)

        # Sum of squared errors; only rank 0 reports it.
        loss = (y_pred - y).pow(2).sum()
        if rank == 0:
            print("Iter {} : {:10.3e}".format(t, loss.item()))

        # Backward pass populates w1.grad and w2.grad.
        loss.backward()

        # In-place updates on leaf tensors must not be recorded by autograd.
        with torch.no_grad():
            for w in weights:
                w -= learning_rate * w.grad
            # Manually zero the gradients after the update.
            for w in weights:
                w.grad.zero_()
            # Synchronize weights: sum over all ranks, then average.
            for w in weights:
                dist.all_reduce(w, op=dist.reduce_op.SUM)
            for w in weights:
                w /= world_size
开发者ID:mlbench,项目名称:mlbench,代码行数:64,代码来源:main.py
注:本文中的torch.distributed.get_world_size函数示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论