bionic (3) slurm_job_step_launch_t_init.3.gz

Provided by: libslurm-dev_17.11.2-1build1_amd64 bug

NAME

       slurm_step_launch_params_t_init,             slurm_step_launch,             slurm_step_launch_fwd_signal,
       slurm_step_launch_wait_start, slurm_step_launch_wait_finish, slurm_step_launch_abort  -  Slurm  job  step
       launch functions

SYNTAX

       #include <slurm/slurm.h>

       void slurm_step_launch_params_t_init (
            slurm_step_launch_params_t *launch_req
       );

       int slurm_step_launch (
            slurm_step_ctx ctx,
            const slurm_step_launch_params_t *launch_req,
            const slurm_step_launch_callbacks_t *callbacks
       );

       void slurm_step_launch_fwd_signal (
            slurm_step_ctx ctx,
            int signo
       );

       int slurm_step_launch_wait_start (
            slurm_step_ctx ctx
       );

       void slurm_step_launch_wait_finish (
            slurm_step_ctx ctx
       );

       void slurm_step_launch_abort (
            slurm_step_ctx ctx
       );

ARGUMENTS

       callbacks
              Identify functions to be called when various events occur.

       ctx    Job  step  context.  Created  by  slurm_step_ctx_create,  used  in  subsequent function calls, and
              destroyed by slurm_step_ctx_destroy.

       launch_req
              Pointer to a structure allocated by the user containing specifications  of  the  job  step  to  be
              launched.

DESCRIPTION

       slurm_step_launch_params_t_init Initialize a user-allocated slurm_step_launch_params_t structure with
       default values.  This function will NOT allocate any new memory.

       slurm_step_launch Launch a parallel job step.

       slurm_step_launch_fwd_signal Forward a signal to all those nodes with running tasks.

       slurm_step_launch_wait_start Block until all tasks have started.

       slurm_step_launch_wait_finish Block until all tasks have finished (or failed to start altogether).

       slurm_step_launch_abort Abort an in-progress launch, or terminate the fully launched  job  step.  Can  be
       called from a signal handler.

IO Redirection

       Use  the  local_fds  entry  in   slurm_step_launch_params_t  to  specify  file descriptors to be used for
       standard input, output and error. Any local_fds not specified will result in the launched tasks using the
       calling process's standard input, output and error.  Threads created by slurm_step_launch will completely
       handle copying data between the remote processes and the specified local file descriptors.

       Use the substructure in slurm_step_io_fds_t to restrict the redirection of I/O to a specific node or task
       ID. For example, to redirect standard output only from task 0, set

       params.local_fds.out.taskid = 0;

       Use  the  remote_*_filename fields in slurm_step_launch_params_t to have launched tasks read and/or write
       directly to local files rather than transferring data over the network to  the  calling  process.   These
       strings  support  many  of  the same format options as the srun command. Any remote_*_filename fields set
       will supersede the corresponding local_fds entries. For example, the following code will direct each task
       to  write  standard  output  and  standard  error  to local files with names containing the task ID (e.g.
       "/home/bob/test_output/run1.out.0" and "/home/bob/test_output/run1.err.0" for task 0).

       params.remote_output_filename = "/home/bob/test_output/run1.out.%t";
       params.remote_error_filename  = "/home/bob/test_output/run1.err.%t";

RETURN VALUE

       slurm_step_launch  and  slurm_step_launch_wait_start  will  return  SLURM_SUCCESS  when  all  tasks  have
       successfully started, or SLURM_ERROR if the job step is aborted during launch.

ERRORS

       EINVAL Invalid argument

       SLURM_PROTOCOL_VERSION_ERROR Protocol version has changed, re-link your code.

       ESLURM_INVALID_JOB_ID the requested job id does not exist.

       ESLURM_ALREADY_DONE the specified job has already completed and can not be modified.

       ESLURM_ACCESS_DENIED  the  requesting  user  lacks authorization for the requested action (e.g. trying to
       delete or modify another user's job).

       ESLURM_INTERCONNECT_FAILURE failed to configure the node interconnect.

       ESLURM_BAD_DIST task distribution specification is invalid.

       SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT Timeout in communicating with Slurm controller.

EXAMPLE

       /*
        * To compile:
        * gcc test.c -o test -g -pthread -lslurm
        *
        * Or if Slurm is not in your default search paths:
        * gcc test.c -o test -g -pthread -I${SLURM_DIR}/include \
        *     -Wl,--rpath=${SLURM_DIR}/lib -L${SLURM_DIR}/lib -lslurm
        */
       #include <stdio.h>
       #include <stdlib.h>
       #include <string.h>
       #include <slurm/slurm.h>
       #include <slurm/slurm_errno.h>

       /*
        * Installed as the task_start callback by main(): prints how many
        * tasks the response message reports and the node name it carries.
        */
       static void _task_start(launch_tasks_response_msg_t *msg)
       {
            fprintf(stdout, "%d tasks started on node %s\n",
                    msg->count_of_pids,
                    msg->node_name);
       }

       /*
        * Installed as the task_finish callback by main(): prints the task
        * count carried by the exit message.
        */
       static void _task_finish(task_exit_msg_t *msg)
       {
            fprintf(stdout, "%d tasks finished\n",
                    msg->num_tasks);
       }

       /*
        * Create a job step context for 4 tasks on 1 node, launch the command
        * given on this program's command line as that step, wait for the tasks
        * to start and then to finish, and destroy the context.
        *
        * Exit status: 0 if all tasks started, 1 if the context could not be
        * created, the launch failed, or the step was aborted during launch.
        */
       int main (int argc, char *argv[])
       {
            slurm_step_ctx_params_t step_params;
            slurm_step_ctx step_ctx;
            slurm_step_launch_params_t params;
            slurm_step_launch_callbacks_t callbacks;
            /* Zeroed so the printf below is well defined even if a
             * slurm_step_ctx_get call leaves its output untouched. */
            uint32_t job_id = 0, step_id = 0;
            int rc = 0;

            /* Start from library defaults, then override what we need. */
            slurm_step_ctx_params_t_init(&step_params);
            step_params.node_count = 1;
            step_params.task_count = 4;
            step_params.overcommit = true;

            step_ctx = slurm_step_ctx_create(&step_params);
            if (step_ctx == NULL) {
                 slurm_perror("slurm_step_ctx_create");
                 exit(1);
            }
            slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_JOBID, &job_id);
            slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_STEPID, &step_id);
            printf("Ready to start job %u step %u\n", job_id, step_id);

            /* Launch everything after our own program name. */
            slurm_step_launch_params_t_init(&params);
            params.argc = argc - 1;
            params.argv = argv + 1;

            /* Zero the whole struct first so that any member not assigned
             * below is zero rather than an indeterminate stack value. */
            memset(&callbacks, 0, sizeof(callbacks));
            callbacks.task_start = _task_start;
            callbacks.task_finish = _task_finish;

            /* Argument list follows the prototype in the SYNTAX section:
             * (ctx, launch_req, callbacks). */
            if (slurm_step_launch(step_ctx, &params, &callbacks)
                      != SLURM_SUCCESS) {
                 slurm_perror("slurm_step_launch");
                 /* Do not leave the step context behind on failure. */
                 slurm_step_ctx_destroy(step_ctx);
                 exit(1);
            }
            printf("Sent step launch RPC\n");

            if (slurm_step_launch_wait_start(step_ctx) != SLURM_SUCCESS) {
                 fprintf(stderr, "job step was aborted during launch\n");
                 rc = 1;     /* report the abort through the exit status */
            } else {
                 printf("All tasks have started\n");
            }

            /* Also returns when the tasks failed to start altogether, so it
             * is called on both paths above. */
            slurm_step_launch_wait_finish(step_ctx);
            printf("All tasks have finished\n");

            slurm_step_ctx_destroy(step_ctx);
            exit(rc);
       }

NOTE

       These functions are included in the libslurm library, which must be linked to your process for use  (e.g.
       "cc myprog.c -lslurm").

COPYING

       Copyright  (C)  2006-2007  The  Regents  of  the  University  of California.  Copyright (C) 2008 Lawrence
       Livermore National Security.  Produced  at  Lawrence  Livermore  National  Laboratory  (cf,  DISCLAIMER).
       CODE-OCEC-09-009. All rights reserved.

       This    file    is    part    of    Slurm    a   resource   management   program.    For   details,   see
       <https://slurm.schedmd.com/>.

       Slurm is free software; you can redistribute it and/or modify it under  the  terms  of  the  GNU  General
       Public License as published by the Free Software Foundation; either version 2 of the License, or (at your
       option) any later version.

       Slurm is distributed in the hope that it will be useful, but  WITHOUT  ANY  WARRANTY;  without  even  the
       implied  warranty  of  MERCHANTABILITY  or  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
       License for more details.

SEE ALSO

       slurm_step_ctx_create(3),      slurm_step_ctx_destroy(3),      slurm_get_errno(3),       slurm_perror(3),
       slurm_strerror(3), salloc(1), srun(1)