/* Perform a raw tape operation on remote tape connection HANDLE.
Return the results of the ioctl, or -1 on error. */
int
rmt_ioctl__ (int handle, int operation, char *argument)
{
switch (operation)
{
default:
errno = EOPNOTSUPP;
return -1;
#ifdef MTIOCTOP
case MTIOCTOP:
{
char command_buffer[COMMAND_BUFFER_SIZE];
char operand_buffer[UINTMAX_STRSIZE_BOUND];
uintmax_t u = (((struct mtop *) argument)->mt_count < 0
? - (uintmax_t) ((struct mtop *) argument)->mt_count
: (uintmax_t) ((struct mtop *) argument)->mt_count);
char *p = operand_buffer + sizeof operand_buffer;
*--p = 0;
do
*--p = '0' + (int) (u % 10);
while ((u /= 10) != 0);
if (((struct mtop *) argument)->mt_count < 0)
*--p = '-';
/* MTIOCTOP is the easy one. Nothing is transferred in binary. */
sprintf (command_buffer, "I%d\n%s\n",
((struct mtop *) argument)->mt_op, p);
if (do_command (handle, command_buffer) == -1)
return -1;
return get_status (handle);
}
#endif /* MTIOCTOP */
#ifdef MTIOCGET
case MTIOCGET:
{
ssize_t status;
size_t counter;
/* Grab the status and read it directly into the structure. This
assumes that the status buffer is not padded and that 2 shorts
fit in a long without any word alignment problems; i.e., the
whole struct is contiguous. NOTE - this is probably NOT a good
assumption. */
if (do_command (handle, "S") == -1
|| (status = get_status (handle), status == -1))
return -1;
if (status > sizeof (struct mtget))
{
errno = EOVERFLOW;
return -1;
}
for (; status > 0; status -= counter, argument += counter)
{
counter = safe_read (READ_SIDE (handle), argument, status);
if (counter == SAFE_READ_ERROR || counter == 0)
{
_rmt_shutdown (handle, EIO);
return -1;
}
}
/* Check for byte position. mt_type (or mt_model) is a small integer
field (normally) so we will check its magnitude. If it is larger
than 256, we will assume that the bytes are swapped and go through
and reverse all the bytes. */
if (((struct mtget *) argument)->MTIO_CHECK_FIELD < 256)
return 0;
for (counter = 0; counter < status; counter += 2)
{
char copy = argument[counter];
argument[counter] = argument[counter + 1];
argument[counter + 1] = copy;
}
return 0;
}
#endif /* MTIOCGET */
}
}
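For context, here is a minimal caller-side sketch of the MTIOCTOP path above: fill a struct mtop from <sys/mtio.h> and pass it as the opaque argument. It assumes the handle comes from the library's remote-open routine (rmt_open__ in GNU tar) and omits error handling.
#include <sys/mtio.h>   /* struct mtop, MTIOCTOP, MTREW */
/* Sketch only: rewind a remote tape over an already-open rmt connection. */
static int
rewind_remote_tape (int handle)
{
  struct mtop op;
  op.mt_op = MTREW;     /* rewind to beginning of tape */
  op.mt_count = 1;      /* encoded in decimal by rmt_ioctl__ */
  return rmt_ioctl__ (handle, MTIOCTOP, (char *) &op);
}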
static int
_handle_signal_task_local(int fd, slurmd_job_t *job, uid_t uid)
{
int rc = SLURM_SUCCESS;
int signal;
int ltaskid; /* local task index */
debug("_handle_signal_task_local for job %u.%u",
job->jobid, job->stepid);
safe_read(fd, &signal, sizeof(int));
safe_read(fd, &ltaskid, sizeof(int));
debug3(" uid = %d", uid);
if (uid != job->uid && !_slurm_authorized_user(uid)) {
debug("kill req from uid %ld for job %u.%u owned by uid %ld",
(long)uid, job->jobid, job->stepid, (long)job->uid);
rc = EPERM;
goto done;
}
/*
* Sanity checks
*/
if (ltaskid < 0 || ltaskid >= job->node_tasks) {
debug("step %u.%u invalid local task id %d",
job->jobid, job->stepid, ltaskid);
rc = SLURM_ERROR;
goto done;
}
if (!job->task
|| !job->task[ltaskid]) {
debug("step %u.%u no task info for task id %d",
job->jobid, job->stepid, ltaskid);
rc = SLURM_ERROR;
goto done;
}
if (job->task[ltaskid]->pid <= 1) {
debug("step %u.%u invalid pid %d for task %d",
job->jobid, job->stepid,
job->task[ltaskid]->pid, ltaskid);
rc = SLURM_ERROR;
goto done;
}
/*
* Signal the task
*/
pthread_mutex_lock(&suspend_mutex);
if (suspended) {
rc = ESLURMD_STEP_SUSPENDED;
pthread_mutex_unlock(&suspend_mutex);
goto done;
}
if (kill(job->task[ltaskid]->pid, signal) == -1) {
rc = -1;
verbose("Error sending signal %d to %u.%u, pid %d: %m",
signal, job->jobid, job->stepid,
job->task[ltaskid]->pid);
} else {
verbose("Sent signal %d to %u.%u, pid %d",
signal, job->jobid, job->stepid,
job->task[ltaskid]->pid);
}
pthread_mutex_unlock(&suspend_mutex);
done:
/* Send the return code */
safe_write(fd, &rc, sizeof(int));
return SLURM_SUCCESS;
rwfail:
return SLURM_FAILURE;
}
Developer: VURM, Project: slurm, Lines: 74, Source: req.c
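A hypothetical sender-side sketch of the message body this handler consumes: two ints (the signal number and the local task index), answered by a single int return code. The request-type integer is read by the dispatcher before the handler runs, so it is not repeated here; real SLURM clients go through the stepd_* helpers rather than raw read/write, and short-read/short-write handling is abbreviated.
#include <unistd.h>
/* Illustrative only; not the actual SLURM client API. */
static int
send_signal_task_local(int fd, int signal, int ltaskid)
{
	int rc;
	if (write(fd, &signal, sizeof(int)) != sizeof(int) ||
	    write(fd, &ltaskid, sizeof(int)) != sizeof(int))
		return -1;
	if (read(fd, &rc, sizeof(int)) != sizeof(int))
		return -1;
	return rc;	/* SLURM_SUCCESS, EPERM, or SLURM_ERROR */
}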
Example 3: _handle_checkpoint_tasks
static int
_handle_checkpoint_tasks(int fd, slurmd_job_t *job, uid_t uid)
{
int rc = SLURM_SUCCESS;
time_t timestamp;
int len;
char *image_dir = NULL;
debug3("_handle_checkpoint_tasks for job %u.%u",
job->jobid, job->stepid);
safe_read(fd, &timestamp, sizeof(time_t));
safe_read(fd, &len, sizeof(int));
if (len) {
image_dir = xmalloc (len);
safe_read(fd, image_dir, len); /* '\0' terminated */
}
debug3(" uid = %d", uid);
if (uid != job->uid && !_slurm_authorized_user(uid)) {
debug("checkpoint req from uid %ld for job %u.%u "
"owned by uid %ld",
(long)uid, job->jobid, job->stepid, (long)job->uid);
rc = EPERM;
goto done;
}
if (job->ckpt_timestamp &&
timestamp == job->ckpt_timestamp) {
debug("duplicate checkpoint req for job %u.%u, "
"timestamp %ld. discarded.",
job->jobid, job->stepid, (long)timestamp);
rc = ESLURM_ALREADY_DONE; /* EINPROGRESS? */
goto done;
}
/*
* Sanity checks
*/
if (job->pgid <= (pid_t)1) {
debug ("step %u.%u invalid [jmgr_pid:%d pgid:%u]",
job->jobid, job->stepid, job->jmgr_pid, job->pgid);
rc = ESLURMD_JOB_NOTRUNNING;
goto done;
}
/*
* Signal the process group
*/
pthread_mutex_lock(&suspend_mutex);
if (suspended) {
rc = ESLURMD_STEP_SUSPENDED;
pthread_mutex_unlock(&suspend_mutex);
goto done;
}
/* set timestamp in case another request comes */
job->ckpt_timestamp = timestamp;
/* TODO: do we need job->ckpt_dir any more,
* except for checkpoint/xlch? */
/* if (! image_dir) { */
/* image_dir = xstrdup(job->ckpt_dir); */
/* } */
/* call the plugin to send the request */
if (checkpoint_signal_tasks(job, image_dir) != SLURM_SUCCESS) {
rc = -1;
verbose("Error sending checkpoint request to %u.%u: %s",
job->jobid, job->stepid, slurm_strerror(rc));
} else {
verbose("Sent checkpoint request to %u.%u",
job->jobid, job->stepid);
}
pthread_mutex_unlock(&suspend_mutex);
done:
/* Send the return code */
safe_write(fd, &rc, sizeof(int));
xfree(image_dir);
return SLURM_SUCCESS;
rwfail:
return SLURM_FAILURE;
}
Developer: VURM, Project: slurm, Lines: 85, Source: req.c
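Likewise, a hypothetical sender-side sketch of the checkpoint request read above: a time_t timestamp, an int length, and, when the length is non-zero, the NUL-terminated image directory. Names and error handling are illustrative only.
#include <string.h>
#include <time.h>
#include <unistd.h>
/* Illustrative only; not the actual SLURM client API. */
static int
send_checkpoint_request(int fd, time_t timestamp, const char *image_dir)
{
	int rc;
	int len = image_dir ? (int) strlen(image_dir) + 1 : 0;
	if (write(fd, &timestamp, sizeof(time_t)) != sizeof(time_t) ||
	    write(fd, &len, sizeof(int)) != sizeof(int))
		return -1;
	if (len && write(fd, image_dir, len) != len)
		return -1;
	if (read(fd, &rc, sizeof(int)) != sizeof(int))
		return -1;
	return rc;
}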
Example 4: _handle_attach
static int
_handle_attach(int fd, slurmd_job_t *job, uid_t uid)
{
srun_info_t *srun;
int rc = SLURM_SUCCESS;
debug("_handle_attach for job %u.%u", job->jobid, job->stepid);
srun = xmalloc(sizeof(srun_info_t));
srun->key = (srun_key_t *)xmalloc(SLURM_IO_KEY_SIZE);
debug("sizeof(srun_info_t) = %d, sizeof(slurm_addr_t) = %d",
(int) sizeof(srun_info_t), (int) sizeof(slurm_addr_t));
safe_read(fd, &srun->ioaddr, sizeof(slurm_addr_t));
safe_read(fd, &srun->resp_addr, sizeof(slurm_addr_t));
safe_read(fd, srun->key, SLURM_IO_KEY_SIZE);
/*
* Check if jobstep is actually running.
*/
if (job->state != SLURMSTEPD_STEP_RUNNING) {
rc = ESLURMD_JOB_NOTRUNNING;
goto done;
}
/*
* At the moment, it only makes sense for the slurmd to make this
* call, so only _slurm_authorized_user is allowed.
*/
if (!_slurm_authorized_user(uid)) {
error("uid %ld attempt to attach to job %u.%u owned by %ld",
(long) uid, job->jobid, job->stepid, (long)job->uid);
rc = EPERM;
goto done;
}
list_prepend(job->sruns, (void *) srun);
rc = io_client_connect(srun, job);
debug(" back from io_client_connect, rc = %d", rc);
done:
/* Send the return code */
safe_write(fd, &rc, sizeof(int));
debug(" in _handle_attach rc = %d", rc);
if (rc == SLURM_SUCCESS) {
/* Send response info */
uint32_t *pids, *gtids;
int len, i;
debug(" in _handle_attach sending response info");
len = job->node_tasks * sizeof(uint32_t);
pids = xmalloc(len);
gtids = xmalloc(len);
if (job->task != NULL) {
for (i = 0; i < job->node_tasks; i++) {
if (job->task[i] == NULL)
continue;
pids[i] = (uint32_t)job->task[i]->pid;
gtids[i] = job->task[i]->gtid;
}
}
safe_write(fd, &job->node_tasks, sizeof(uint32_t));
safe_write(fd, pids, len);
safe_write(fd, gtids, len);
xfree(pids);
xfree(gtids);
for (i = 0; i < job->node_tasks; i++) {
len = strlen(job->task[i]->argv[0]) + 1;
safe_write(fd, &len, sizeof(int));
safe_write(fd, job->task[i]->argv[0], len);
}
}
return SLURM_SUCCESS;
rwfail:
return SLURM_FAILURE;
}
Developer: VURM, Project: slurm, Lines: 80, Source: req.c
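On success this handler answers with a uint32_t task count, a pid array, a gtid array, and then one length-prefixed argv[0] string per task. The following is a hypothetical reader-side sketch of that layout; names, buffer management, and partial-read handling are simplified for illustration.
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
/* Illustrative only; not the actual SLURM client API. */
static int
read_attach_response(int fd)
{
	uint32_t ntasks, i;
	if (read(fd, &ntasks, sizeof(uint32_t)) != sizeof(uint32_t))
		return -1;
	uint32_t *pids  = malloc(ntasks * sizeof(uint32_t));
	uint32_t *gtids = malloc(ntasks * sizeof(uint32_t));
	if (!pids || !gtids ||
	    read(fd, pids,  ntasks * sizeof(uint32_t)) < 0 ||
	    read(fd, gtids, ntasks * sizeof(uint32_t)) < 0)
		goto fail;
	for (i = 0; i < ntasks; i++) {
		int len;
		char *name;
		if (read(fd, &len, sizeof(int)) != sizeof(int) || len <= 0)
			goto fail;
		name = malloc(len);	/* argv[0] arrives NUL-terminated */
		if (!name || read(fd, name, len) != len) {
			free(name);
			goto fail;
		}
		printf("task %"PRIu32": pid=%"PRIu32" gtid=%"PRIu32" argv0=%s\n",
		       i, pids[i], gtids[i], name);
		free(name);
	}
	free(pids);
	free(gtids);
	return 0;
fail:
	free(pids);
	free(gtids);
	return -1;
}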
Example 5: _handle_accept
static void *
_handle_accept(void *arg)
{
/*struct request_params *param = (struct request_params *)arg;*/
int fd = ((struct request_params *)arg)->fd;
slurmd_job_t *job = ((struct request_params *)arg)->job;
int req;
int len;
Buf buffer;
void *auth_cred;
int rc;
uid_t uid;
gid_t gid;
debug3("Entering _handle_accept (new thread)");
xfree(arg);
safe_read(fd, &req, sizeof(int));
if (req != REQUEST_CONNECT) {
error("First message must be REQUEST_CONNECT");
goto fail;
}
safe_read(fd, &len, sizeof(int));
buffer = init_buf(len);
safe_read(fd, get_buf_data(buffer), len);
/* Unpack and verify the auth credential */
auth_cred = g_slurm_auth_unpack(buffer);
if (auth_cred == NULL) {
error("Unpacking authentication credential: %s",
g_slurm_auth_errstr(g_slurm_auth_errno(NULL)));
free_buf(buffer);
goto fail;
}
rc = g_slurm_auth_verify(auth_cred, NULL, 2, NULL);
if (rc != SLURM_SUCCESS) {
error("Verifying authentication credential: %s",
g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred)));
(void) g_slurm_auth_destroy(auth_cred);
free_buf(buffer);
goto fail;
}
/* Get the uid & gid from the credential, then destroy it. */
uid = g_slurm_auth_get_uid(auth_cred, NULL);
gid = g_slurm_auth_get_gid(auth_cred, NULL);
debug3(" Identity: uid=%d, gid=%d", uid, gid);
g_slurm_auth_destroy(auth_cred);
free_buf(buffer);
rc = SLURM_SUCCESS;
safe_write(fd, &rc, sizeof(int));
while (1) {
rc = _handle_request(fd, job, uid, gid);
if (rc != SLURM_SUCCESS)
break;
}
if (close(fd) == -1)
error("Closing accepted fd: %m");
pthread_mutex_lock(&message_lock);
message_connections--;
pthread_cond_signal(&message_cond);
pthread_mutex_unlock(&message_lock);
debug3("Leaving _handle_accept");
return NULL;
fail:
rc = SLURM_FAILURE;
safe_write(fd, &rc, sizeof(int));
rwfail:
if (close(fd) == -1)
error("Closing accepted fd after error: %m");
debug("Leaving _handle_accept on an error");
return NULL;
}
Developer: VURM, Project: slurm, Lines: 80, Source: req.c
Example 6: read_key
//......... (portion of code omitted here) .........
/* '[','1',';','3','B' |0x80,KEYCODE_ALT_DOWN , - unused */
'[','1',';','3','C' |0x80,KEYCODE_ALT_RIGHT,
'[','1',';','3','D' |0x80,KEYCODE_ALT_LEFT ,
/* '[','3',';','3','~' |0x80,KEYCODE_ALT_DELETE, - unused */
0
};
pfd.fd = fd;
pfd.events = POLLIN;
buffer++; /* saved chars counter is in buffer[-1] now */
start_over:
errno = 0;
n = (unsigned char)buffer[-1];
if (n == 0) {
/* If no data, wait for input.
* If requested, wait TIMEOUT ms. TIMEOUT = -1 is useful
* if fd can be in non-blocking mode.
*/
if (timeout >= -1) {
if (safe_poll(&pfd, 1, timeout) == 0) {
/* Timed out */
errno = EAGAIN;
return -1;
}
}
/* It is tempting to read more than one byte here,
* but it breaks pasting. Example: at shell prompt,
* user presses "c","a","t" and then pastes "\nline\n".
* When we were reading 3 bytes here, we were eating
* "li" too, and cat was getting wrong input.
*/
n = safe_read(fd, buffer, 1);
if (n <= 0)
return -1;
}
{
unsigned char c = buffer[0];
n--;
if (n)
memmove(buffer, buffer + 1, n);
/* Only ESC starts ESC sequences */
if (c != 27) {
buffer[-1] = n;
return c;
}
}
/* Loop through known ESC sequences */
seq = esccmds;
while (*seq != '\0') {
/* n - position in sequence we did not read yet */
int i = 0; /* position in sequence to compare */
/* Loop through chars in this sequence */
while (1) {
/* So far escape sequence matched up to [i-1] */
if (n <= i) {
/* Need more chars, read another one if it wouldn't block.
* Note that escape sequences come in as a unit,
* so if we block for long it's not really an escape sequence.
* Timeout is needed to reconnect escape sequences
* split up by transmission over a serial console. */
if (safe_poll(&pfd, 1, 50) == 0) {
//......... (remainder of this listing omitted in the original) .........
/*
* This function is very close to the cache_read function of the caching
* library, which is used in the caching daemon. It reads cached data with the
* specified key from the cache entry with entry_name.
*/
int
__cached_read(struct cached_connection_ *connection, const char *entry_name,
const char *key, size_t key_size, char *data, size_t *data_size)
{
size_t name_size, result_size;
int error_code, rec_error_code;
int result;
assert(connection != NULL);
result = 0;
error_code = -1;
result = send_credentials(connection, CET_READ_REQUEST);
if (result != 0)
goto fin;
name_size = strlen(entry_name);
result = safe_write(connection, &name_size, sizeof(size_t));
if (result != 0)
goto fin;
result = safe_write(connection, &key_size, sizeof(size_t));
if (result != 0)
goto fin;
result = safe_write(connection, entry_name, name_size);
if (result != 0)
goto fin;
result = safe_write(connection, key, key_size);
if (result != 0)
goto fin;
result = safe_read(connection, &rec_error_code, sizeof(int));
if (result != 0)
goto fin;
if (rec_error_code != 0) {
error_code = rec_error_code;
goto fin;
}
result = safe_read(connection, &result_size, sizeof(size_t));
if (result != 0)
goto fin;
if (result_size > *data_size) {
*data_size = result_size;
error_code = -2;
goto fin;
}
result = safe_read(connection, data, result_size);
if (result != 0)
goto fin;
*data_size = result_size;
error_code = 0;
fin:
return (error_code);
}
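A brief usage sketch for __cached_read under the conventions visible above: 0 means success, -1 a transport or protocol failure, -2 a too-small buffer (with *data_size updated to the required size), and any other value is the error code relayed by the daemon. The connection handle, entry name, and key below are purely illustrative.
#include <stdio.h>
/* Sketch only: look one key up in a named cache entry. */
static int
lookup_cached_value(struct cached_connection_ *connection)
{
	char value[256];
	size_t value_size = sizeof(value);
	const char key[] = "example-key";	/* hypothetical key */
	int rc;
	rc = __cached_read(connection, "example-entry", key, sizeof(key),
			   value, &value_size);
	if (rc == 0)
		printf("got %zu bytes\n", value_size);
	else if (rc == -2)
		printf("buffer too small, need %zu bytes\n", value_size);
	return rc;
}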
static int
_handle_completion(int fd, stepd_step_rec_t *job, uid_t uid)
{
int rc = SLURM_SUCCESS;
int errnum = 0;
int first;
int last;
jobacctinfo_t *jobacct = NULL;
int step_rc;
char* buf;
int len;
Buf buffer;
int version; /* For future use */
bool lock_set = false;
debug("_handle_completion for job %u.%u",
job->jobid, job->stepid);
debug3(" uid = %d", uid);
if (!_slurm_authorized_user(uid)) {
debug("step completion message from uid %ld for job %u.%u ",
(long)uid, job->jobid, job->stepid);
rc = -1;
errnum = EPERM;
/* Send the return code and errno */
safe_write(fd, &rc, sizeof(int));
safe_write(fd, &errnum, sizeof(int));
return SLURM_SUCCESS;
}
safe_read(fd, &version, sizeof(int));
safe_read(fd, &first, sizeof(int));
safe_read(fd, &last, sizeof(int));
safe_read(fd, &step_rc, sizeof(int));
/*
* We must not use getinfo over a pipe with slurmd here.
* Indeed, slurmstepd makes heavy use of setinfo over a pipe
* with slurmd, and doing the reverse can result in a deadlock
* scenario with slurmd:
* slurmd(lockforread,write)/slurmstepd(write,lockforread)
* Do pack/unpack instead to be sure of the independence of
* slurmd and slurmstepd
*/
safe_read(fd, &len, sizeof(int));
buf = xmalloc(len);
safe_read(fd, buf, len);
buffer = create_buf(buf, len);
jobacctinfo_unpack(&jobacct, SLURM_PROTOCOL_VERSION,
PROTOCOL_TYPE_SLURM, buffer, 1);
free_buf(buffer);
/*
* Record the completed nodes
*/
pthread_mutex_lock(&step_complete.lock);
lock_set = true;
if (! step_complete.wait_children) {
rc = -1;
errnum = ETIMEDOUT; /* not used anyway */
goto timeout;
}
/* SlurmUser or root can craft a launch without a valid credential
* ("srun --no-alloc ...") and no tree information can be built
* without the hostlist from the credential. */
if (step_complete.rank >= 0) {
#if 0
char bits_string[128];
debug2("Setting range %d (bit %d) through %d(bit %d)",
first, first-(step_complete.rank+1),
last, last-(step_complete.rank+1));
bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
debug2(" before bits: %s", bits_string);
#endif
bit_nset(step_complete.bits,
first - (step_complete.rank+1),
last - (step_complete.rank+1));
#if 0
bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
debug2(" after bits: %s", bits_string);
#endif
}
step_complete.step_rc = MAX(step_complete.step_rc, step_rc);
/************* acct stuff ********************/
jobacctinfo_aggregate(step_complete.jobacct, jobacct);
timeout:
jobacctinfo_destroy(jobacct);
/*********************************************/
/* Send the return code and errno, we do this within the locked
* region to ensure that the stepd doesn't exit before we can
* perform this send. */
safe_write(fd, &rc, sizeof(int));
safe_write(fd, &errnum, sizeof(int));
pthread_cond_signal(&step_complete.cond);
pthread_mutex_unlock(&step_complete.lock);
return SLURM_SUCCESS;
//......... (portion of code omitted here) .........
Developer: birc-aeh, Project: slurm, Lines: 101, Source: req.c
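A hedged sketch of the sending side implied by the comment block above: pack the accounting data into a Buf, then write its length followed by the packed bytes, so slurmstepd never has to answer a getinfo request over the same pipe. It assumes a jobacctinfo_pack counterpart whose arguments mirror the jobacctinfo_unpack call above, the usual Buf helpers (init_buf, get_buf_offset, get_buf_data, free_buf), and the safe_write/rwfail convention used throughout this file; verify the exact signatures against the SLURM tree in use.
/* Sketch only; not taken verbatim from SLURM. */
static int
send_jobacct_over_pipe(int fd, jobacctinfo_t *jobacct)
{
	Buf buffer = init_buf(1024);
	int len;
	jobacctinfo_pack(jobacct, SLURM_PROTOCOL_VERSION,
			 PROTOCOL_TYPE_SLURM, buffer);
	len = get_buf_offset(buffer);	/* number of bytes actually packed */
	safe_write(fd, &len, sizeof(int));
	safe_write(fd, get_buf_data(buffer), len);
	free_buf(buffer);
	return SLURM_SUCCESS;
rwfail:
	free_buf(buffer);
	return SLURM_FAILURE;
}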
Example 11: _handle_signal_container
static int
_handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid)
{
int rc = SLURM_SUCCESS;
int errnum = 0;
int sig;
static int msg_sent = 0;
char *ptr = NULL;
int target_node_id = 0;
stepd_step_task_info_t *task;
uint32_t i;
safe_read(fd, &sig, sizeof(int));
debug("_handle_signal_container for step=%u.%u uid=%d signal=%d",
job->jobid, job->stepid, (int) uid, sig);
if ((uid != job->uid) && !_slurm_authorized_user(uid)) {
error("signal container req from uid %ld for step=%u.%u "
"owned by uid %ld",
(long)uid, job->jobid, job->stepid, (long)job->uid);
rc = -1;
errnum = EPERM;
goto done;
}
/*
* Sanity checks
*/
if (job->cont_id == 0) {
debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
job->jobid, job->stepid, job->cont_id);
rc = -1;
errnum = ESLURMD_JOB_NOTRUNNING;
goto done;
}
if ((sig == SIGTERM) || (sig == SIGKILL)) {
/* cycle thru the tasks and mark those that have not
* called abort and/or terminated as killed_by_cmd
*/
for (i = 0; i < job->node_tasks; i++) {
if (NULL == (task = job->task[i])) {
continue;
}
if (task->aborted || task->exited) {
continue;
}
/* mark that this task is going to be killed by
* cmd so we ignore its exit status - otherwise,
* we will probably report the final exit status
* as SIGKILL
*/
task->killed_by_cmd = true;
}
}
ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID");
if (ptr)
target_node_id = atoi(ptr);
if ((job->nodeid == target_node_id) && (msg_sent == 0) &&
(job->state < SLURMSTEPD_STEP_ENDING)) {
time_t now = time(NULL);
char entity[24], time_str[24];
if (job->stepid == SLURM_BATCH_SCRIPT) {
snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
} else {
snprintf(entity, sizeof(entity), "STEP %u.%u",
job->jobid, job->stepid);
}
slurm_make_time_str(&now, time_str, sizeof(time_str));
/* Not really errors,
* but we want messages displayed by default */
if (sig == SIG_TIME_LIMIT) {
error("*** %s CANCELLED AT %s DUE TO TIME LIMIT ***",
entity, time_str);
msg_sent = 1;
} else if (sig == SIG_PREEMPTED) {
error("*** %s CANCELLED AT %s DUE TO PREEMPTION ***",
entity, time_str);
msg_sent = 1;
} else if (sig == SIG_NODE_FAIL) {
error("*** %s CANCELLED AT %s DUE TO NODE FAILURE ***",
entity, time_str);
msg_sent = 1;
} else if (sig == SIG_FAILURE) {
error("*** %s FAILED (non-zero exit code or other "
"failure mode) ***", entity);
msg_sent = 1;
} else if ((sig == SIGTERM) || (sig == SIGKILL)) {
error("*** %s CANCELLED AT %s ***", entity, time_str);
msg_sent = 1;
}
}
if ((sig == SIG_TIME_LIMIT) || (sig == SIG_NODE_FAIL) ||
(sig == SIG_PREEMPTED) || (sig == SIG_FAILURE))
goto done;
if (sig == SIG_ABORT) {
sig = SIGKILL;
job->aborted = true;
//......... (portion of code omitted here) .........
Developer: birc-aeh, Project: slurm, Lines: 101, Source: req.c
Example 12: _handle_resume
static int
_handle_resume(int fd, stepd_step_rec_t *job, uid_t uid)
{
int rc = SLURM_SUCCESS;
int errnum = 0;
uint16_t job_core_spec = (uint16_t) NO_VAL;
safe_read(fd, &job_core_spec, sizeof(uint16_t));
debug("_handle_resume for step:%u.%u uid:%ld core_spec:%u",
job->jobid, job->stepid, (long)uid, job_core_spec);
if (!_slurm_authorized_user(uid)) {
debug("job step resume request from uid %ld for job %u.%u ",
(long)uid, job->jobid, job->stepid);
rc = -1;
errnum = EPERM;
goto done;
}
if (job->cont_id == 0) {
debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
job->jobid, job->stepid, job->cont_id);
rc = -1;
errnum = ESLURMD_JOB_NOTRUNNING;
goto done;
}
acct_gather_resume_poll();
/*
* Signal the container
*/
pthread_mutex_lock(&suspend_mutex);
if (!suspended) {
rc = -1;
errnum = ESLURMD_STEP_NOTSUSPENDED;
pthread_mutex_unlock(&suspend_mutex);
goto done;
} else {
if (!job->batch && switch_g_job_step_pre_resume(job))
error("switch_g_job_step_pre_resume: %m");
if (!job->batch && core_spec_g_resume(job->cont_id,
job_core_spec))
error("core_spec_g_resume: %m");
if (proctrack_g_signal(job->cont_id, SIGCONT) < 0) {
verbose("Error resuming %u.%u: %m",
job->jobid, job->stepid);
} else {
verbose("Resumed %u.%u", job->jobid, job->stepid);
}
suspended = false;
}
if (!job->batch && switch_g_job_step_post_resume(job))
error("switch_g_job_step_post_resume: %m");
/* set the cpu frequencies if cpu_freq option used */
if (job->cpu_freq != NO_VAL)
cpu_freq_set(job);
pthread_mutex_unlock(&suspend_mutex);
done:
/* Send the return code and errno */
safe_write(fd, &rc, sizeof(int));
safe_write(fd, &errnum, sizeof(int));
return SLURM_SUCCESS;
rwfail:
return SLURM_FAILURE;
}
Developer: birc-aeh, Project: slurm, Lines: 68, Source: req.c
Example 13: _handle_suspend
static int
_handle_suspend(int fd, stepd_step_rec_t *job, uid_t uid)
{
static int launch_poe = -1;
int rc = SLURM_SUCCESS;
int errnum = 0;
uint16_t job_core_spec = (uint16_t) NO_VAL;
safe_read(fd, &job_core_spec, sizeof(uint16_t));
debug("_handle_suspend for step:%u.%u uid:%ld core_spec:%u",
job->jobid, job->stepid, (long)uid, job_core_spec);
if (!_slurm_authorized_user(uid)) {
debug("job step suspend request from uid %ld for job %u.%u ",
(long)uid, job->jobid, job->stepid);
rc = -1;
errnum = EPERM;
goto done;
}
if (job->cont_id == 0) {
debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
job->jobid, job->stepid, job->cont_id);
rc = -1;
errnum = ESLURMD_JOB_NOTRUNNING;
goto done;
}
acct_gather_suspend_poll();
if (launch_poe == -1) {
char *launch_type = slurm_get_launch_type();
if (!strcmp(launch_type, "launch/poe"))
launch_poe = 1;
else
launch_poe = 0;
xfree(launch_type);
}
/*
* Signal the container
*/
pthread_mutex_lock(&suspend_mutex);
if (suspended) {
rc = -1;
errnum = ESLURMD_STEP_SUSPENDED;
pthread_mutex_unlock(&suspend_mutex);
goto done;
} else {
if (!job->batch && switch_g_job_step_pre_suspend(job))
error("switch_g_job_step_pre_suspend: %m");
/* SIGTSTP is sent first to let MPI daemons stop their tasks,
* then wait 2 seconds, then send SIGSTOP to the spawned
* process's container to stop everything else.
*
* In some cases, 1 second has proven insufficient. Longer
* delays may help ensure that all MPI tasks have been stopped
* (that depends upon the MPI implementation used), but will
* also permit longer time periods when more than one job can
* be running on each resource (not good). */
if (launch_poe == 0) {
/* IBM MPI seems to periodically hang upon receipt
* of SIGTSTP. */
if (proctrack_g_signal(job->cont_id, SIGTSTP) < 0) {
verbose("Error suspending %u.%u (SIGTSTP): %m",
job->jobid, job->stepid);
} else
sleep(2);
}
if (proctrack_g_signal(job->cont_id, SIGSTOP) < 0) {
verbose("Error suspending %u.%u (SIGSTOP): %m",
job->jobid, job->stepid);
} else {
verbose("Suspended %u.%u", job->jobid, job->stepid);
}
suspended = true;
}
if (!job->batch && switch_g_job_step_post_suspend(job))
error("switch_g_job_step_post_suspend: %m");
if (!job->batch && core_spec_g_suspend(job->cont_id, job_core_spec))
error("core_spec_g_suspend: %m");
pthread_mutex_unlock(&suspend_mutex);
done:
/* Send the return code and errno */
safe_write(fd, &rc, sizeof(int));
safe_write(fd, &errnum, sizeof(int));
return SLURM_SUCCESS;
rwfail:
return SLURM_FAILURE;
}
Developer: birc-aeh, Project: slurm, Lines: 94, Source: req.c
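The SIGTSTP-then-SIGSTOP ordering described in the comments above can be illustrated with plain POSIX calls, independent of SLURM's proctrack plugin. This is a generic sketch, not the plugin implementation; the 2-second pause mirrors the handler above.
#include <signal.h>
#include <unistd.h>
/* Sketch only: two-phase stop of an entire process group. */
static int
two_phase_stop(pid_t pgid)
{
	if (kill(-pgid, SIGTSTP) < 0)	/* polite stop; lets MPI daemons park their tasks */
		return -1;
	sleep(2);			/* give the daemons time to react */
	return kill(-pgid, SIGSTOP);	/* then stop everything unconditionally */
}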
Example 14: jobacctinfo_getinfo
extern int jobacctinfo_getinfo(
jobacctinfo_t *jobacct, enum jobacct_data_type type, void *data,
uint16_t protocol_version)
{
int rc = SLURM_SUCCESS;
int *fd = (int *)data;
uint32_t *uint32 = (uint32_t *) data;
uint64_t *uint64 = (uint64_t *) data;
double *dub = (double *) data;
jobacct_id_t *jobacct_id = (jobacct_id_t *) data;
struct rusage *rusage = (struct rusage *)data;
struct jobacctinfo *send = (struct jobacctinfo *) data;
if (!plugin_polling)
return SLURM_SUCCESS;
/* jobacct needs to be allocated before this is called. */
xassert(jobacct);
switch (type) {
case JOBACCT_DATA_TOTAL:
memcpy(send, jobacct, sizeof(struct jobacctinfo));
break;
case JOBACCT_DATA_PIPE:
if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
char* buf;
int len;
Buf buffer;
safe_read(*fd, &len, sizeof(int));
buf = xmalloc(len);
safe_read(*fd, buf, len);
buffer = create_buf(buf, len);
jobacctinfo_unpack(&jobacct, protocol_version,
PROTOCOL_TYPE_SLURM, buffer, 0);
free_buf(buffer);
}
break;
case JOBACCT_DATA_RUSAGE:
memset(rusage, 0, sizeof(struct rusage));
rusage->ru_utime.tv_sec = jobacct->user_cpu_sec;
rusage->ru_utime.tv_usec = jobacct->user_cpu_usec;
rusage->ru_stime.tv_sec = jobacct->sys_cpu_sec;
rusage->ru_stime.tv_usec = jobacct->sys_cpu_usec;
break;
case JOBACCT_DATA_MAX_RSS:
*uint64 = jobacct->max_rss;
break;
case JOBACCT_DATA_MAX_RSS_ID:
*jobacct_id = jobacct->max_rss_id;
break;
case JOBACCT_DATA_TOT_RSS:
*uint64 = jobacct->tot_rss;
break;
case JOBACCT_DATA_MAX_VSIZE:
*uint64 = jobacct->max_vsize;
break;
case JOBACCT_DATA_MAX_VSIZE_ID:
*jobacct_id = jobacct->max_vsize_id;
break;
case JOBACCT_DATA_TOT_VSIZE:
*uint64 = jobacct->tot_vsize;
break;
case JOBACCT_DATA_MAX_PAGES:
*uint64 = jobacct->max_pages;
break;
case JOBACCT_DATA_MAX_PAGES_ID:
*jobacct_id = jobacct->max_pages_id;
break;
case JOBACCT_DATA_TOT_PAGES:
*uint64 = jobacct->tot_pages;
break;
case JOBACCT_DATA_MIN_CPU:
*uint32 = jobacct->min_cpu;
break;
case JOBACCT_DATA_MIN_CPU_ID:
*jobacct_id = jobacct->min_cpu_id;
break;
case JOBACCT_DATA_TOT_CPU:
*dub = jobacct->tot_cpu;
break;
case JOBACCT_DATA_ACT_CPUFREQ:
*uint32 = jobacct->act_cpufreq;
break;
case JOBACCT_DATA_CONSUMED_ENERGY:
*uint64 = jobacct->energy.consumed_energy;
break;
case JOBACCT_DATA_MAX_DISK_READ:
*dub = jobacct->max_disk_read;
break;
case JOBACCT_DATA_MAX_DISK_READ_ID:
*jobacct_id = jobacct->max_disk_read_id;
break;
case JOBACCT_DATA_TOT_DISK_READ:
*dub = jobacct->tot_disk_read;
break;
case JOBACCT_DATA_MAX_DISK_WRITE:
*dub = jobacct->max_disk_write;
break;
//......... (portion of code omitted here) .........