From 22701b2db48f716ac7004282b4d17a24857a0bd3 Mon Sep 17 00:00:00 2001 From: Dennis van Dok Date: Thu, 12 May 2016 16:28:01 +0200 Subject: [PATCH] The preq variable passed to svr_strtjob2 is the wrong request. It should have been the AsyrunJob request, but it is now the CopyFiles request used to stage the files to the mom, and that request has already been completed and dealt with. Usually the later ack to this request will not go through, as the file descriptor with which it was associated is already closed, but in some cases the fd has been reopened to start more jobs on the same worker node. In that case the ack is misinterpreted as a Connect request by the mom and the job commit will not go through, leaving jobs in a strange state. The fix in this commit mitigates the problem by preventing the ack from being sent. --- src/server/req_runjob.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/server/req_runjob.c b/src/server/req_runjob.c index e0a6a63302..2d3b4c08d7 100644 --- a/src/server/req_runjob.c +++ b/src/server/req_runjob.c @@ -738,7 +738,11 @@ void post_stagein( else { /* continue to start job running */ - + /* this is actually completely the wrong preq to pass (i.e. type==PBS_BATCH_CopyFiles) + as it has already been handled completely by the mom. We should pass the original + AsyrunJob request, but we no longer have it. Any acks or denies are potential + hazards, so make sure we set noreply. */ + preq->rq_noreply = 1; svr_strtjob2(&pjob, preq); /* svr_strjob2 would call finish_sendmom which would free preq @@ -1542,8 +1546,18 @@ void finish_sendmom( pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART; if (preq != NULL) - reply_ack(preq); - + { + if (preq->rq_type == PBS_BATCH_CopyFiles) /* via post_stagein! bad! 
*/ + { + sprintf(log_buf, "about to ack req type=%d, sock=%d, %s@%s, noreply=%d, but I'm not going to", + preq->rq_type,preq->rq_conn,preq->rq_user,preq->rq_host,preq->rq_noreply); + log_event(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,job_id,log_buf); + } + else + { + reply_ack(preq); + } + } /* record start time for accounting */ pjob->ji_qs.ji_stime = time_now;