
    Vh)9                     n   d Z ddlmZmZ ddlmZmZ ddlZddlm	Z	 ddl
Z
ddlZddlZddlZddlZddlmZ ddlZddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZm Z   ej!        e"          Z# G d d          Z$ G d d          Z%de&fdZ'e G d d                      Z( G d d          Z)dS )z)Scheduling and job monitoring utilities.
    )contextmanager	ExitStack)	dataclassfieldN)Path)SlurmJob   )git_save)SlurmConfigSubmitRules)get_distrib_spec)DecoratedMain)try_load)XP_get_sigc                   :    e Zd Zdedej        e         fdZd ZdS )_SubmitItTargetmainargvc                     |                     |          | _        t                      }t          |j                  t
          j        d<   |t          j        dd <    |             d S )NRANKr	   )	get_xpxpr   strrankosenvironsysr   )selfr   r   specs       O/var/www/html/movieo_spanner_bot/venv/lib/python3.11/site-packages/dora/shep.py__call__z_SubmitItTarget.__call__#   sT    ++d##!! !^^
6    c                     t                      j        dk    r<| j        j                                        r| j        j                                         t          j        j        | g|R i |S )Nr   )	r   r   r   rendezvous_fileexistsunlinksubmitithelpersDelayedSubmission)r   argskwargss      r!   
checkpointz_SubmitItTarget.checkpoint,   sj    "a''w&--// 1'..0001$HHHHHHHr#   N)	__name__
__module____qualname__r   tpSequencer   r"   r-    r#   r!   r   r   "   sQ        ] "+c2B    I I I I Ir#   r   c                   f    e Zd ZdZdefdZedefd            ZddZ	ddZ
ed	             Zd
 ZdS )Sheepz[
    A Sheep is a specific run for a given XP. Sheeps are managed
    by the Shepherd.
    r   c                     || _         d | _        d | _        | j                                        rCt          | j                  }t          |t                    r|\  | _        | _        d S || _        d S d S N)r   job_other_jobs	_job_filer&   r   
isinstancetuple)r   r   contents      r!   __init__zSheep.__init__9   sy    37DH>  "" 	#t~..G'5)) #-4*$***"	# 	#r#   returnc                 H    | j         j        | j         j        j        j        z  S r7   )r   folderdorashepjob_filer   s    r!   r:   zSheep._job_fileE   s    w~ 1 :::r#   standardc                     | j         dS | j         j                            | j         j        |          }|dk    r'| j        r t          d | j        D                       rd}|                    d          rdS |S )z1Return the current state of the `Sheep`.
        NUNKNOWNc              3   ,   K   | ]}|j         d k    V  dS )rH   N)state.0r8   s     r!   	<genexpr>zSheep.state.<locals>.<genexpr>P   s)      FFc39	)FFFFFFr#   MISSING	CANCELLED)r8   watcher	get_statejob_idr9   any
startswith)r   moderJ   s      r!   rJ   zSheep.stateI   s     84 **48?DAAI$"2FFT5EFFFFF " "K(( 	;r#   c                 h    | j         dS | j         j                            | j         j        |          S )zDReturn True if the job is no longer running on the cluster.
        NT)r8   rP   is_donerR   )r   rU   s     r!   rW   zSheep.is_doneY   s0     84x''>>>r#   c                 L    | j         | j        j        | j         j         dz  S dS )z)Return the path to the main log.
        Nz
_0_log.out)r8   r   r(   rR   rE   s    r!   logz	Sheep.log`   s.     87#&D&D&DDDtr#   c                     d| j         j         d|                                  d}| j        |d| j        j         dz  }|d| j         j         dz  }|S )NzSheep(z, state=z, zsid=zargv=))r   sigrJ   r8   rR   r   )r   outs     r!   __repr__zSheep.__repr__h   sg    <tw{<<DJJLL<<<8-$(/----C&tw|&&&&
r#   N)rF   )r.   r/   r0   __doc__r   r>   propertyr   r:   rJ   rW   rY   r^   r3   r#   r!   r5   r5   4   s         
#2 
# 
# 
# 
# ;4 ; ; ; X;    ? ? ? ?   X    r#   r5   xc                     dS )z7No logging logging function, passed to `Shepherd`.
    Nr3   )ra   s    r!   no_logrc   q   s	     	Dr#   c                   R    e Zd ZU eed<    ee          Zej	        e
         ed<   dS )	_JobArrayslurm_config)default_factorysheepsN)r.   r/   r0   r   __annotations__r   listrh   r1   Listr5   r3   r#   r!   re   re   w   sA         "U4888FBGEN88888r#   re   c                      e Zd ZdZefdedej        egdf         fdZ	dej
        e         defdZd	edej        e         fd
Zdedej        e         fdZd Zedefd            ZdededefdZdej        fdZd Zedefd            Zedefd            Zedefd            Zdej        e         fdZ dedededej!        fdZ"d Z#edefd            Z$d e%fd!Z&dS )"Shepherdz
    Takes care of the little jobs.

    Args:
        main (DecoratedMain): main function decorated by Dora.
        log (callable): log function, if provided should take a single string
            argument.
    r   rY   Nc                 *   || _         | j                            dd           | j                            dd           | j                            dd           || _        d| _        d | _        g | _        g | _	        | 
                                 d S )NT)exist_okparentsF)r   _by_idmkdir_orphans_arraysrY   _in_job_array_existing_git_clone
_to_cancel
_to_submit_check_orphans)r   r   rY   s      r!   r>   zShepherd.__init__   s    	4666T4888D$777#(6: 68.0r#   r   r?   c                     t          |t                    rJ | j                            |          }t	          |          S )z
        Given a list of of arguments, return the matching `Sheep`,
        which will contain both information on the `dora.xp.XP`, and on
        the latest job associated with that XP.
        )r;   r   r   r   r5   )r   r   r   s      r!   get_sheep_from_argvzShepherd.get_sheep_from_argv   s;     dC(((((Yd##Ryyr#   r\   c                 T    | j                             |          }t          |          S )zj
        Returns a `Sheep` given the XP signature, if any exists, otherwise
        returns None.
        )r   get_xp_from_sigr5   )r   r\   r   s      r!   get_sheep_from_sigzShepherd.get_sheep_from_sig   s%    
 Y&&s++Ryyr#   rR   c                     | j         |z  }|                                rB|                                j        }| j                            |          }t          |          S dS )zu
        Returns the `Sheep` associated with the given `job_id`. If no sheep
        is found, returns None.
        N)rq   
is_symlinkresolvenamer   r}   r5   )r   rR   linkr\   r   s        r!   get_sheep_from_job_idzShepherd.get_sheep_from_job_id   sX    
 {V#?? 	,,..%C**3//B99tr#   c                 B    t           j                                         dS )zB
        Force an update of all job states with submitit.
        N)r   rP   updaterE   s    r!   r   zShepherd.update   s     	!!!!!r#   rf   c              #      K   | j         rJ | j                            t          |                     d| _         	 dV  d| _         dS # d| _         w xY w)z*Context manager to launch XP in job array.TNF)ru   rx   appendre   )r   rf   s     r!   	job_arrayzShepherd.job_array   sm       %%%%y66777!	'EEE!&DD&&&&s   A 	Asheeprulesc                    |j         |                                }|dk    r6|j        r.t                              d|j         j                    d|_         n|dv r7t                              d|j         j         d           |j        rd|_         nR|j        rKt                              d|j         j         d|            |                     |j                    d|_         |j         m| j	        s'| j
                            t          |                     || j
        d	         j        k    sJ | j
        d	         j                            |           dS dS )
z
        Decides whether to schedule a new job for the given sheep, based on the rules
        given in `rules`.
        Jobs are actually only scheduled once the `commit()` method is called.
        N	COMPLETEDz"Ignoring previously completed job )FAILEDrO   OUT_OF_MEMORYTIMEOUTrN   	NODE_FAILzPrevious job z failed or was canceledzCancelling previous job z with status )r8   rJ   replace_doneloggerdebugrR   retryreplacecancel_lazyru   rx   r   re   rf   rh   )r   r   rf   r   rJ   s        r!   maybe_submit_lazyzShepherd.maybe_submit_lazy   sg    9 KKMME##% %LL!XeiFV!X!XYYY $EI ( ( (VUY-=VVVWWW; % $EI= %LL!bEI<L!b!b[`!b!bccc$$UY/// $EI9% @&&y'>'>???4?2#6#CCCCCOB&--e44444	 r#   r8   c                 :    | j                             |           dS )z]
        Cancel a job. The job is actually cancelled only when `commit()` is called.
        N)rw   r   )r   r8   s     r!   r   zShepherd.cancel_lazy   s      	s#####r#   c                     | j         r!|                     | j                    g | _         d| _        | j        r8| j                            d          }|                     |           | j        6dS dS )zu
        Commit all changes registered so far with either `maybe_submit_lazy()`
        and `cancel_lazy()`.
        Nr   )rw   _cancelrv   rx   pop_submit)r   r   s     r!   commitzShepherd.commit   s    
 ? 	!LL))) DO#' o 	$++A..ILL### o 	$ 	$ 	$ 	$ 	$r#   c                 R    | j         j        j        | j         j        j        j        z  S r7   )r   rB   dirrC   by_idrE   s    r!   rq   zShepherd._by_id   s    y~!DIN$7$===r#   c                 R    | j         j        j        | j         j        j        j        z  S r7   )r   rB   r   rC   orphansrE   s    r!   rs   zShepherd._orphans   s    y~!DIN$7$???r#   c                 R    | j         j        j        | j         j        j        j        z  S r7   )r   rB   r   rC   arraysrE   s    r!   rt   zShepherd._arrays   s    y~!DIN$7$>>>r#   jobsc                     dgd |D             z   }t                               dd                    |                     t          j        |d           d S )Nscancelc                     g | ]	}|j         
S r3   )rR   rK   s     r!   
<listcomp>z$Shepherd._cancel.<locals>.<listcomp>  s    #?#?#?3CJ#?#?#?r#   z
Running %s Tcheck)r   r   joinsprun)r   r   
cancel_cmds      r!   r   zShepherd._cancel   sX    [#?#?$#?#?#??
\388J#7#7888
z&&&&&&r#   r   rA   c                 .   dt           j        d<   t          |j                  }t	          j        ||                    d                    }|j        }|dk    r#|dz  dk    rt          d          |dz  |d<   d}n|}d	|d<   |j	        }|r|j	        |z  }	|	 d
|d<   d| |d<   |j
        rd	|d<   |j        ||j        z  |d<   n||d<   |j        
|j        |d<   |d= |d= |d= |d= t                              d|            |j        d|dd| |S )N1SLURM_KILL_BAD_EXITmax_num_timeout)rA   r      r   z.Can only take <= 8 gpus, or multiple of 8 gpusnodesr	   GBmemzgpu:gresntasks_per_nodecpus_per_taskgpusmem_per_gpucpus_per_gpuone_task_per_nodezSlurm parameters %rT)job_namestderr_to_stdoutr3   )r   r   dict__dict__r(   SlurmExecutorr   r   
ValueErrorr   r   r   r   r   r   update_parameters)
r   r   rA   rf   r,   executorr   gpus_per_noder   r   s
             r!   _get_submitit_executorzShepherd._get_submitit_executor  s   ,/
()l+,,)6::6G+H+HJ J J !88ax1}} !QRRR"aiF7OMM MF7O". 	'*]:C"JJJF5M///v) 	D()F$%)1*7,:S*S'(5F$%)1*6*C'6N=!>"&'*F333"" 	!	 	 	 	 	 r#   c                    | j                                         D ]}|j        }t                              d| d           t          j        ddt          j                    d|dddgd	d	
          }d |j	        
                                                                                    d          D             }|r8t                              d| d           t          j        dg|z   d	           |                                 dS )zCheck for orphaned jobs.zFound dirty tag z`, meaning a job might have been scheduled but Dora or Slurm crashed before the job id was saved.squeuez-uz-nz-oz%iz-hT)capture_outputr   c                     g | ]}||S r3   r3   )rL   lines     r!   r   z+Shepherd._check_orphans.<locals>.<listcomp>5  s    UUUDPTU4UUUr#   
zFound orphan job ids z, will cancelr   r   N)rs   iterdirr   r   warningr   r   r   getloginstdoutdecodestripsplitr'   )r   dirtyr   procidss        r!   ry   zShepherd._check_orphans-  s$   ]**,, 
	 
	E:DNN Td T T T U U U68T2;==$dDRVW)-T; ; ;DUUDK$6$6$8$8$>$>$@$@$F$Ft$L$LUUUC 6IsIIIJJJ	{S(5555LLNNNN
	 
	r#   c              #      K   | j         |z  }|                                 	 dV  |                                 dS # |                                 w xY w)z,Context manager to enter a potential orphan.N)rs   touchr'   )r   r   tokens      r!   _enter_orphanzShepherd._enter_orphan;  sR       $	EEELLNNNNNELLNNNNs	   < Ar   c           
      	   |j         }|j        }|sd S t          |          dk    }|d         }| j                            |j                   |j        j        j        t          fd|D                       s
J d            |r't          t          d |D                                 }n|j        j        }|r| j        j        dz   |z   }n| j        j        dz   |z   }|r| j        |z  }n|j        j        }|                    d	           |D ]U}	|	j        }
| j                            |
           |
j                                        r|
j                                         V|                     |||          }g }r%| j        t          j        | j                  | _        |                     |          5  t/                      5 }r5| j        J |                    t          j        | j                             |r'|                    |                                           |j         D ]q}	r(| j        J t          j        |	j        | j                   |                    |                    t=                      | j        |	j        j                             r	 d d d            n# 1 swxY w Y   tA          ||          D ]\  }	}tC          j"        ||ftG          |	j$        d
                     tJ          &                    d|j'                   ||	_(        ||	_)        | j*        |j'        z  }|}|+                    |	j        j,        -                                           |rj|	j        j,        |j        z  }|                                r-|-                                |-                                k    sJ n|+                    |           |	j        j.        }|                                r|                                 |+                    |           | j        /                    |	j                  }| 0                    d|j'         d|	j        j         d|            	 d d d            d S # 1 swxY w Y   d S )Nr	   r   c              3   B   K   | ]}|j         j        j        k    V  d S r7   )r   rB   r
   )rL   otheruse_git_saves     r!   rM   z#Shepherd._submit.<locals>.<genexpr>O  s/      NNe58=)\9NNNNNNr#   z?All jobs inside an array must have the same value for git_save.c                 &    g | ]}|j         j        S r3   )r   r\   )rL   r   s     r!   r   z$Shepherd._submit.<locals>.<listcomp>S  s    'I'I'I'I'I'Ir#   _array__T)ro   wbzCreated job with id %szScheduled job z for sheep /)1rh   rf   lenr   init_xpr   rB   r
   allr   sortedr\   r   rt   _xp_submititrr   r%   r&   r'   r   rv   get_new_cloner   r   enter_contextenter_clonebatchassign_cloner   submitr   r   zippickledumpopenr:   r   r   rR   r8   r9   rq   
symlink_torA   r   _latest_submititget_namerY   )r   r   rh   rf   is_arrayfirstname_sigr   submitit_folderr   r   r   r   stackr8   r   submitit_linklatestr   s                     @r!   r   zShepherd._submitE  sD   ! - 	Fv;;?q		%(###x}-NNNNvNNNNN 	P 	PO	P 	PN  	$'I'I&'I'I'I J JKKHHx|H 	39>I-8DD9>C'(2D 	4"lT1OO#h3Ot,,, 	, 	,EBIb!!!!((** ,"))+++..t_lSS&( 	ID4<'/'=di'H'HD$%% $	X $	X 
^ X3???''(<T=U(V(VWWW :''(8(8999&- ^ ^E# R#7CCC -eh8PQQQKK0A0A49ehm \ \]]]]	^
^ 
^ 
^ 
^ 
^ 
^ 
^ 
^ 
^ 
^ 
^ 
^ 
^ 
^ 
^ "&$// X X
sS$Keot)D)DEEE5szBBB	$(!{SZ/ 7 7 9 9::: B &+X_7K%KM$++-- B,4466/:Q:Q:S:SSSSSS%00AAA2==?? $MMOOO!!/222y))%(33V#*VVVVPTVVWWWW/X$	X $	X $	X $	X $	X $	X $	X $	X $	X $	X $	X $	X $	X $	X $	X $	X $	X $	Xs8   <Q>CJ2&Q>2J6	6Q>9J6	:F6Q>>RR)'r.   r/   r0   r_   rc   r   r1   Callabler   r>   r2   r5   r{   Optionalr~   r   r   r   r   r   r   r   r(   r   r   r   r`   r   rq   rs   rt   rk   r   r   r   ry   r   re   r   r3   r#   r!   rm   rm   }   s         MS  ] cUD[1I    C(8 U    c bk%.@    
C 
BK4F 
 
 
 
" " " 'k ' ' ' ^'5u 5K 5P[ 5 5 5 5:$x0 $ $ $ $$ $ $ > > > > X> @$ @ @ @ X@ ? ? ? ? X?'BGH- ' ' ' '
&3 & &-8&=E=S& & & &P   #    ^JX JX JX JX JX JX JXr#   rm   )*r_   
contextlibr   r   dataclassesr   r   loggingpathlibr   r   r   
subprocessr   r   typingr1   r(   r    r
   confr   r   distribr   r   r   utilsr   r   r   r   	getLoggerr.   r   r   r5   r   rc   re   rm   r3   r#   r!   <module>r     s$    0 0 0 0 0 0 0 0 ( ( ( ( ( ( ( (         				     



                  * * * * * * * * % % % % % %                     
	8	$	$I I I I I I I I$: : : : : : : :z	c 	 	 	 	 9 9 9 9 9 9 9 9
RX RX RX RX RX RX RX RX RX RXr#   