.TH "Slurm API" "3" "Slurm job step launch functions" "April 2015" "Slurm job step launch functions" .SH "NAME" slurm_step_launch_params_t_init, slurm_step_launch, slurm_step_launch_fwd_signal, slurm_step_launch_wait_start, slurm_step_launch_wait_finish, slurm_step_launch_abort \- Slurm job step launch functions .SH "SYNTAX" .LP #include <slurm/slurm.h> .LP .LP void \fBslurm_step_launch_params_t_init\fR ( .br slurm_step_launch_params_t *\fIlaunch_req\fP .br ); .LP int \fBslurm_step_launch\fR ( .br slurm_step_ctx \fIctx\fP, .br const slurm_step_launch_params_t *\fIlaunch_req\fP, .br const slurm_step_launch_callbacks_t \fIcallbacks\fP .br ); .LP void \fBslurm_step_launch_fwd_signal\fR ( .br slurm_step_ctx \fIctx\fP, .br int \fIsigno\fP .br ); .LP int \fBslurm_step_launch_wait_start\fR ( .br slurm_step_ctx \fIctx\fP .br ); .LP void \fBslurm_step_launch_wait_finish\fR ( .br slurm_step_ctx \fIctx\fP .br ); .LP void \fBslurm_step_launch_abort\fR ( .br slurm_step_ctx \fIctx\fP .br ); .SH "ARGUMENTS" .LP .TP \fIcallbacks\fP Identify functions to be called when various events occur. .TP \fIctx\fP Job step context. Created by \fBslurm_step_ctx_create\fR, used in subsequent function calls, and destroyed by \fBslurm_step_ctx_destroy\fR. .TP \fIlaunch_req\fP Pointer to a structure allocated by the user containing specifications of the job step to be launched. .SH "DESCRIPTION" .LP \fBslurm_step_launch_params_t_init\fR Initialize a user-allocated slurm_step_launch_params_t structure with default values. This function will NOT allocate any new memory. .LP \fBslurm_step_launch\fR Launch a parallel job step. .LP \fBslurm_step_launch_fwd_signal\fR Forward a signal to all those nodes with running tasks. .LP \fBslurm_step_launch_wait_start\fR Block until all tasks have started. .LP \fBslurm_step_launch_wait_finish\fR Block until all tasks have finished (or failed to start altogether). 
.LP \fBslurm_step_launch_abort\fR Abort an in-progress launch, or terminate the fully launched job step. Can be called from a signal handler. .SH "IO Redirection" .LP Use the \fIlocal_fds\fR entry in \fIslurm_step_launch_params_t\fR to specify file descriptors to be used for standard input, output and error. Any \fIlocal_fds\fR not specified will result in the launched tasks using the calling process's standard input, output and error. Threads created by \fBslurm_step_launch\fR will completely handle copying data between the remote processes and the specified local file descriptors. .LP Use the substructure in \fIslurm_step_io_fds_t\fR to restrict the redirection of I/O to a specific node or task ID. For example, to redirect standard output only from task 0, set .LP .nf params.local_fds.out.taskid=0; .fi .LP Use the \fIremote_*_filename\fR fields in \fIslurm_step_launch_params_t\fR to have launched tasks read and/or write directly to local files rather than transferring data over the network to the calling process. These strings support many of the same format options as the \fBsrun\fR command. Any \fIremote_*_filename\fR fields set will supersede the corresponding \fIlocal_fds\fR entries. For example, the following code will direct each task to write standard output and standard error to local files with names containing the task ID (e.g. "/home/bob/test_output/run1.out.0" and "/home/bob/test_output/run1.err.0" for task 0). .LP .nf params.remote_output_filename = "/home/bob/test_output/run1.out.%t"; params.remote_error_filename = "/home/bob/test_output/run1.err.%t"; .fi .SH "RETURN VALUE" .LP \fBslurm_step_launch\fR and \fBslurm_step_launch_wait_start\fR will return SLURM_SUCCESS when all tasks have successfully started, or SLURM_ERROR if the job step is aborted during launch. .SH "ERRORS" .LP \fBEINVAL\fR Invalid argument .LP \fBSLURM_PROTOCOL_VERSION_ERROR\fR Protocol version has changed, re\-link your code. 
.LP \fBESLURM_INVALID_JOB_ID\fR the requested job id does not exist. .LP \fBESLURM_ALREADY_DONE\fR the specified job has already completed and can not be modified. .LP \fBESLURM_ACCESS_DENIED\fR the requesting user lacks authorization for the requested action (e.g. trying to delete or modify another user's job). .LP \fBESLURM_INTERCONNECT_FAILURE\fR failed to configure the node interconnect. .LP \fBESLURM_BAD_DIST\fR task distribution specification is invalid. .LP \fBSLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT\fR Timeout in communicating with Slurm controller. .SH "EXAMPLE" .LP .nf /* * To compile: * gcc test.c \-o test \-g \-pthread \-lslurm * * Or if Slurm is not in your default search paths: * gcc test.c \-o test \-g \-pthread \-I{$SLURM_DIR}/include \\ * \-Wl,\-\-rpath={$SLURM_DIR}/lib \-L{$SLURM_DIR}/lib \-lslurm */ #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <slurm/slurm.h> #include <slurm/slurm_errno.h> static void _task_start(launch_tasks_response_msg_t *msg) { printf("%d tasks started on node %s\\n", msg->count_of_pids, msg->node_name); } static void _task_finish(task_exit_msg_t *msg) { printf("%d tasks finished\\n", msg->num_tasks); } int main (int argc, char *argv[]) { slurm_step_ctx_params_t step_params; slurm_step_ctx step_ctx; slurm_step_launch_params_t params; slurm_step_launch_callbacks_t callbacks; uint32_t job_id, step_id; slurm_step_ctx_params_t_init(&step_params); step_params.node_count = 1; step_params.task_count = 4; step_params.overcommit = true; step_ctx = slurm_step_ctx_create(&step_params); if (step_ctx == NULL) { slurm_perror("slurm_step_ctx_create"); exit(1); } slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_JOBID, &job_id); slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_STEPID, &step_id); printf("Ready to start job %u step %u\\n", job_id, step_id); slurm_step_launch_params_t_init(&params); params.argc = argc \- 1; params.argv = argv + 1; callbacks.task_start = _task_start; callbacks.task_finish = _task_finish; if (slurm_step_launch(step_ctx, NULL, &params, &callbacks) != SLURM_SUCCESS) { 
slurm_perror("slurm_step_launch"); exit(1); } printf("Sent step launch RPC\\n"); if (slurm_step_launch_wait_start(step_ctx) != SLURM_SUCCESS) { fprintf(stderr, "job step was aborted during launch\\n"); } else { printf("All tasks have started\\n"); } slurm_step_launch_wait_finish(step_ctx); printf("All tasks have finished\\n"); slurm_step_ctx_destroy(step_ctx); exit(0); } .fi .SH "NOTE" These functions are included in the libslurm library, which must be linked to your process for use (e.g. "cc \-lslurm myprog.c"). .SH "COPYING" Copyright (C) 2006-2007 The Regents of the University of California. Copyright (C) 2008 Lawrence Livermore National Security. Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). CODE\-OCEC\-09\-009. All rights reserved. .LP This file is part of Slurm, a resource management program. For details, see <https://slurm.schedmd.com/>. .LP Slurm is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. .LP Slurm is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. .SH "SEE ALSO" .LP \fBslurm_step_ctx_create\fR(3), \fBslurm_step_ctx_destroy\fR(3), \fBslurm_get_errno\fR(3), \fBslurm_perror\fR(3), \fBslurm_strerror\fR(3), \fBsalloc\fR(1), \fBsrun\fR(1)