
    WhYI                      h   d Z ddlmZ ddlZddlmZmZ ddlZddlZ	ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ 	 ddlmZ n# e$ r dZY nw xY weZej        j        Zej        j        Zdededede def
dZ!dededede def
dZ"dededede def
dZ#dededede def
dZ$dededede de%e         f
dZ&dededede def
dZ'dedededede f
dZ(dedededede de)e*eef         e*eef         e*eef         e*eef         f         fdZ+dej,        de fdZ- eed          	 d>d ed!ed"eded#edededed$e.de dej,        de)eeef         fd%            Z/ eej0        d&'          d ed!ed"ed(e*eef         d)e*eef         d*e*eef         d+e*eef         d#edededed$e.de de)eeef         fd,            Z1d#ede)e)eef         ef         fd-Z2d.ed#efd/Z3d0ed#edefd1Z4d ed!ed"ed2ed#edededed$e.de dej,        fd3Z5dededed$e.de d4e fd5Z6d6 Z7 ej8        d7          Z9d8e9_:        e9;                     eej<        e9                     e9=                    e6           er ej>        e9ej?        d9:           dededed$e.de dej,        fd;Z@dededed$e.de d4e fd<ZA ej8        d=          ZBd8eB_:        eB;                     eej<        eB                     eB=                    eA           er ej>        eBejC        d9:           e/D                    e5e@           dS )?a
  `jax.experimental.rnn`: GPU accelerated RNN

----------------------------------------------

This module provides experimental support to CUDNN-backed LSTM.

Currently, the only supported RNN flavor is LSTM with double-bias. We use
notations and variable names similar to
https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM

and CUDNN_LSTM entry in
https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t.

Note that a bidirectional LSTM is treated as having twice the number of layers,
where a forward layer i is followed by a reverse layer i. Each direction has
its own associated weights. We use pseudo-layer to denote such layers
following CUDNN documentation
https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetRNNWeightParams.

CUDNN takes an opaque 1D weight array that densely packs all the weight arrays
in a sparsely documented layout. Through trial-and-error and testing, we believe
the layout is the following. Assume 2-layer bi-LSTM with double-bias, so 4
pseudo-layers in total (forward-0, reverse-0, forward-1, reverse-1).

There are 4 kinds of weights: W_ih, W_hh, b_ih and b_hh, where

W_ih = (W_ii, W_if, W_ig, W_io) concatenated on leading axis,
W_hh = (W_hi, W_hf, W_hg, W_ho) concatenated on leading axis,
b_ih = (b_ii, b_if, b_ig, b_io) concatenated on leading axis,
b_hh = (b_hi, b_hf, b_hg, b_ho) concatenated on leading axis.

Say W_ih^0 denotates W_ih from pseudo-layer 0. The linear weights are packed
together from all pseudo-layers followed by bias weights from all pseudo-layers.
In particular, for each layer, W_ih is followed by W_hh and b_ih by b_hh.

(W_ih^0, W_hh^0, W_ih^1, W_hh^1, W_ih^2, W_hh^2, W_ih^3, W_hh^3,
 b_ih^0, b_hh^0, b_ih^1, b_hh^1, b_ih^2, b_hh^2, b_ih^3, b_hh^3)

See `get_params_shapes_in_lstm`.

Example usage:
```
  x = jax.random.normal(
      k1, (batch_size, seq_len, input_size), dtype=jnp.float32)
  h_0 = jax.random.normal(
      k2, (num_directions * num_layers, batch_size, hidden_size),
      dtype=jnp.float32)
  c_0 = jax.random.normal(
      k3, (num_directions * num_layers, batch_size, hidden_size),
      dtype=jnp.float32)
  seq_lengths = jnp.ones((batch_size,), dtype=jnp.int32) * seq_len
  weights = rnn.init_lstm_weight(k4, input_size, hidden_size, num_layers,
                                 bidirectional)
  y, h_n, c_n = rnn.lstm(
      x,
      h_0,
      c_0,
      weights,
      seq_lengths=seq_lengths,
      input_size=input_size,
      hidden_size=hidden_size,
      num_layers=num_layers,
      dropout=False,
      bidirectional=bidirectional)
```

TODO:
  - Add support for input and weight dtypes other than float32.
  - Support ragged inputs.
  - Support RNNs other than LSTM.
    )partialN)castAny)core)mlir)xla)
custom_vjp)ArrayShape)lax)gpu_rnnlayer_i
input_sizehidden_sizebidirectionalreturnc                 L    | dk    s| dk    r	|rd|z  |fS |rdnd}d|z  ||z  fS )zSShape of W_ii|W_if|W_ig|W_io.

  Note that layer_i is an index of pseudo-layers.
  r             )r   r   r   r   num_directionss        Z/var/www/html/movieo_spanner_bot/venv/lib/python3.11/site-packages/jax/experimental/rnn.py_W_ih_lr   l   sJ     \\gll}lOZ(('.QQQNO^k9::    c                     d|z  |fS )zShape of W_hi|W_hf|W_hg|W_ho.r   r   r   r   r   r   s       r   _W_hh_lr   y   s     k/;	''r   c                     d|z  fS )zShape of b_ii|b_if|b_ig|b_io.r   r   r   s       r   _b_ih_lr            k/	r   c                     d|z  fS )zShape of b_hi|b_hf|b_hg|b_ho.r   r   r   s       r   _b_hh_lr#      r!   r   
num_layersc                 @   g }|rdnd}||z  }t           t          g}t          |          D ]*}|D ]%}	 |	|| ||          }
|                    |
           &+t          t
          g}t          |          D ]*}|D ]%}	 |	|| ||          }
|                    |
           &+|S )z?Get flat param shapes in LSTM. See module docstring for layout.r   r   )r   r   rangeappendr    r#   )r   r   r$   r   layer_shapesr   num_pseudo_layerslinear_weightsiw_kindlayer_shapebias_weightss               r   _get_params_shapes_in_lstmr/      s     ,%,111. >1W%."## ' 'a  ' 'F1j+}EEk+&&&&' 7#,"## ' 'a ' 'F1j+}EEk+&&&&' 
r   c                 \    t          | |||          }t          d |D                       }|S )zGet param count in LSTM.c              3   >   K   | ]}t          j        |          V  d S N)mathprod).0shapes     r   	<genexpr>z)get_num_params_in_lstm.<locals>.<genexpr>   s,      ??DIe$$??????r   )r/   sum)r   r   r$   r   r(   param_counts         r   get_num_params_in_lstmr:      s>     ,JZ,9; ;,??,?????+	r   rngc                     t          ||||          }t          j        d|z            }t          j                            | |ft          j        | |          S )zDRandom initialize LSTM weights from U(-k, k), k=sqrt(1/hidden_size).g      ?)r6   dtypeminvalmaxval)r:   npsqrtjaxrandomuniformjnpfloat32)r;   r   r   r$   r   r9   ks          r   init_lstm_weightrH      sc     'z;
'46 6+gcK  !				+s{A2a 
 
I 
I Ir   weightsc                    t          ||||          }d}d}|rdnd}||z  }	i }
i }t          |	          D ]R}|
|fD ]K}||         }|dz  }t          j        |          }| |||z                                |          ||<   ||z  }LSi }i }t          |	          D ]R}||fD ]K}||         }|dz  }t          j        |          }| |||z                                |          ||<   ||z  }LS|
|||fS )a  Unpack cudnn LSTM weights into individual weights.

  CUDNN LSTM weight layout: (num_layers, num_directions, W_ih, W_hh, b_ih, b_hh)
  Returns W_ih, W_hh, b_ih, b_hh. e.g. W_ih[2][1] is the concat weights of
  4 weights (W_ii, W_if, W_ig, W_io), each of shape (hidden_size, input_size)
  at 2nd layer for the reverse direction. See notations from
  https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM.
  r   r   r   )r/   r&   r3   r4   reshape)rI   r   r   r$   r   flat_shapesflat_shapes_offset	w_offsetsr   r)   W_ihW_hhlr,   r6   	num_elemsb_ihb_hhs                     r   unpack_lstm_weightsrU      su    +:{J+8: :+)%,111. >1$$"##  a,  ,-eA)E""i)I	$99:BB5IIfQi9ii $$"##  a,  ,-eA)E""i)I	$99:BB5IIfQi9ii 
tT4	r   	precisionc                    t          j        |           } | (t          | t                    rt	          |           dk    sdS t          t          t           j        t           j        f         |           \  } }| t           j        j        k    rdS | t           j        j        k    rdS | t           j        j	        k    rt          d          t          d|            )Nr   TFz)bfloat16 support not implemented for LSTMz%Unexpected precision specifier value )r   canonicalize_precision
isinstancetuplelenr   	PrecisionHIGHESTHIGHDEFAULTNotImplementedError
ValueError)rV   _s     r   _lstm_cudnn_allow_tf32rc      s     (33)z)U;;IRS@S@S4eCM3=899EE,)Q#-'''5CM&&&4CM)))
I
J
JJ
HYHH
I
IIr   )            	   
   )nondiff_argnumsxh_0c_0seq_lengthsdropoutc                 N    t          | |||||||||	|
          \  \  }}}}|||fS )a`  LSTM via CuDNN or HIPDNN (not-yet-supported).

  Assume batch-first inputs.

  Arguments:
    x: (batch_size, max_seq_length, input_size)
    h_0: (num_directions * num_layers, batch_size, hidden_size)
    c_0: (num_directions * num_layers, batch_size, hidden_size)
    weights: (num_params,) where num_params = get_num_params_in_lstm(...)
    seq_lengths: (batch_size,)
  Returns: (y, h_n, c_n, reserve_space).
    y: (batch_size, max_seq_length, hidden_size * num_directions)
    h_n: (num_directions * num_layers, batch_size, hidden_size)
    c_n: (num_directions * num_layers, batch_size, hidden_size)
  )r   r   r$   ro   r   rV   )lstm_fwd)rk   rl   rm   rI   rn   r   r   r$   ro   r   rV   yh_nc_nrb   s                  r   lstmru      sR    & 		!  -1c3 
Cr   )rg   rh   ri         )static_argnumsrO   rP   rS   rT   c           	         |j         t          j         d          k    rt          d          |dk    rt          d          d }d }|                     ddd	          }|sg }g }t	          |
          D ]}t          |||         ||         ||         ||         
          }t          ||          }t          j                            |||         ||         f|          }t          ||          \  \  }}}|
                    |           |
                    |           t          j        |          }t          j        |          }|                    ddd	          ||fS g }g }t	          |
d	z            D ]5}t          |||         ||         ||         ||         
          }t          ||          }|d	z  dk    rFt          j                            |||         ||         f|          }t          ||          \  \  }}}n}t          ||          }t          j                            |||         ||         f|          }t          ||          \  \  }}}t          ||          }t          j        ||gd          }|
                    |           |
                    |           7t          j        |          }t          j        |          }|                    ddd	          ||fS )zReference implementation of LSTM.

  See https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#lstm
  https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t
  int32 `seq_lengths` can only be int32.g        zWDropout not supported in LSTM reference because we cannot determine CUDNN dropout mask.c                   | \  }}t          j        |dd          \  }}	}
}t          j        |dd          \  }}}}t          j        |dd          \  }}}}t          j        |dd          \  }}}}t          ||j        z  |d          z   ||j        z  z   |d          z             }t          ||	j        z  |d          z   ||j        z  z   |d          z             }t	          ||
j        z  |d          z   ||j        z  z   |d          z             }t          ||j        z  |d          z   ||j        z  z   |d          z             }||z  ||z  z   }|t	          |          z  }||f|fS )Nr   r   axis)rE   splitsigmoidTtanh)carryrk   rO   rP   rS   rT   hcW_iiW_ifW_igW_ioW_hiW_hfW_hgW_hob_iib_ifb_igb_iob_hib_hfb_hgb_hor+   fgos                               r   	lstm_cellzlstm_ref.<locals>.lstm_cell0  su   DAq YtQQ777D$d YtQQ777D$d YtQQ777D$d YtQQ777D$dDF
T$Z'!df*4tDzABBADF
T$Z'!df*4tDzABBAQZ$t*$q46z1DJ>??ADF
T$Z'!df*4tDzABBA	AAA	DGGAq619r   c                 ,     | ||          \  }}|||ffS r2   r   )cellr   rk   rr   s       r   scan_fnzlstm_ref.<locals>.scan_fnA  s$    tE1~~HE15!*r   r   r   r   )rO   rP   rS   rT   r}   )r=   rE   r`   	transposer&   r   rB   r   scan_extract_outputr'   stack_flip_sequenceconcatenate)rk   rl   rm   rO   rP   rS   rT   rn   r   r   r$   ro   r   r   r   seq_first_yfinal_hfinal_crQ   r   cell_fnouth_tc_trs   rt   seq_first_y_fwdseq_first_y_reversedseq_first_y_bwds                                r   lstm_refr     s%    #)G,,,,
@
A
AA^^
a    "   Aq!$$+	 4GG:  
$q'Qd1gDGM M Md&&gGLL3q63q6"2-8: :c /S A AjsC+nnSnnS
)G

C
)G

C  Aq))333 ''a    aQd1gDG$q'K K KDgt$$G1uzzGLL3q63q6"21<> >c$3K$E$E!jsC// ,KEEGLL
CFCF#%9; ;c$3K$E$E!jsC/&DDoO_o$FRPPPkNN3NN3	'#	'#			q!Q	'	'c	11r   c                    |\  }\  \  }}}t          ||           }t          ||           }| d          t          j        |j        d         t          j                  d d d f         k    }t          j        |d         |d          }||f|fS )Nr   )r=   ).N)_select_last_carryrE   aranger6   rz   where)	rn   r   rb   hscsr   r   r   masks	            r   r   r   q  s    "!hr22{++#2{++# 
T	SZ(9!(<CINNNqqqRVwW	W$	
9o	 	+ s[	  r   	carry_seqc                 V    | |dz
  t          j        | j        d                   f         S )Nr   )rE   r   r6   )r   rn   s     r   r   r     s'    	;?CJyq/A$B$BB	CCr   	sequencesc                     | j         d         }||z
  } t          j        t          t          j        d          dd          | |          d d d         S )Nr   r}   )r   r   r   )in_axesout_axesr   )r6   rB   vmapr   rE   roll)r   rn   	max_stepsroll_amountss       r   r   r     sh    oa )[(,
'#(+++V
 
 
\
+ 
++/4R4
1 1r   wc                     |j         t          j         d          k    rt          d          t          |
          }t                              | |||||||||	|          \  }}}}|||f| ||||||ffS )Nrz   r{   r   r   r$   ro   r   cudnn_allow_tf32)r=   rE   r`   rc   	rnn_fwd_pbind)rk   rl   rm   r   rn   r   r   r$   ro   r   rV   r   rr   rs   rt   reserve_spaces                   r   rq   rq     s     #)G,,,,
@
A
AA+I66(~~		!'  .  )  )!S#} S#CaaG	GGr   r   c           
         | j         d         | j         d         }}|	rdnd}||||z  f}t          j        || j                  }t	          j        |||||||	|
          \  }}t          j        |ft          j                  }||||fS )Nr   r   r   )r6   r   ShapedArrayr=   r   )compute_rnn_workspace_reserve_space_sizesrE   rF   )x_avalh_0_avalc_0_avalw_avalseq_lengths_avalr   r   r$   ro   r   r   
batch_sizemax_seq_lengthr   output_shapeoutput_avalrb   reserve_space_sizereserve_space_avals                      r   rnn_abstract_evalr     s      &|AQn*%,111.nn{.JK, v|<<+7
k:z>
="24 4 ! ');(=s{KK	h*<	<<r   c                    ~ | |i |S r2   r   )fnr   argskws       r   _gpu_lowering_strip_tf32r     s    	TRr   rnn_fwdTcuda)platformc                     t          |          }|\  }	}
}}}}}|\  }}}t                              ||||	|
|||||| |||||          \  }}}}||||t          j        |          fS )Nr   )rc   	rnn_bwd_pr   rE   
zeros_like)r   r   r$   ro   r   rV   	residuals	gradientsr   rk   rl   rm   r   rn   rr   r   dydh_ndc_ndxdh_0dc_0dws                          r   lstm_bwdr     s     ,I662;/!S#q+q-."dD ~~

		!'! & ) )"dD"" dD"cn[99	::r   c                     ||||fS r2   r   )dy_avaldhn_avaldcn_avalr   h0_avalc0_avalr   y_avalr   r   r   r   r$   ro   r   r   s                   r   rnn_bwd_abstract_evalr     s    
 
'6	))r   rnn_bwdr2   )E__doc__	functoolsr   r3   typingr   r   rB   numpyr@   jax._srcr   jax.interpretersr   r   jax._src.custom_derivativesr	   jax._src.typingr
   r   jax._src.laxr   	jax.numpyrE   jax._src.libr   ImportErrorPRNGKeyArraynnr   r   intboolr   r   r    r#   listr/   r:   rH   rZ   dictrU   PrecisionLikerc   floatru   jitr   r   r   r   rq   r   r   	Primitiver   multiple_resultsdef_implapply_primitivedef_abstract_evalregister_loweringcudnn_rnn_loweringr   r   r   cudnn_rnn_bwd_loweringdefvjpr   r   r   <module>r     s  F FN                



           ! ! ! ! ! !             2 2 2 2 2 2 ( ( ( ( ( ( ( (            """""""   ''' 
&.
v{
;S 
;c 
; 
;
;$)
; 
; 
; 
;(S (c ( (($)( ( ( (S c  $)   S c  $)   3 S +..27;E{   *s  # *.36   I, IC Ic I!$I59I I I I' '  #' 25' CF' '  4U
T#u*-tCJ/?cFKGL BM M N'  '  '  ' TJc&7 JD J J J J0 	%8999 >B E  E E  '*8;FK),):FKESXZ_L_F`   :9B 	!3444W2 W2E W2 W2T#u*5E W2U
#W2+/U
+;W2U
#W227W2EHW2 W2 ,/W2 :?W2 !	W2 &+5%+>%?	W2 W2 W2 54W2r! !eE5L6I56P0Q ! ! ! !D% De D D D D1e 1% 1E 1 1 1 1H HE H H% He HH+.H<?HJOH H-0->H H H H*="%=47=EH=$=59= )-= = = =    DN9%%	!	  	  773.	:: ; ; ; 	  - . . .
 Q$G$>PPPP; ;3 ;C ;% ; ;-0->; ; ; ;4*8;*JM* '** 5:* KO* -1	* * * * DN9%%	!	  	  773.	:: ; ; ; 	  1 2 2 2
 B$/&B B B B Hh     s   
A AA