
    o i                      X    d dl Z d dlmZ d dlmZmZmZ d dlZ G d d          ZdZ	dZ
dS )    N)cached_property)ListOptionalTuplec            
          e Zd ZdZ	 	 ddej        dedee         dee         fdZ	e
defd	            Ze
defd
            Ze
defd            Ze
defd            Ze
defd            Ze
defd            Ze
defd            Ze
defd            Zedefd            Zedee         fd            Zdedee         fdZdee         defdZdee         defdZe
dee         fd            Zdee         deee         eee                  f         fdZdee         deee         eee                  f         fdZdee         deee         eee                  f         fdZdS )	Tokenizerz-Simple wrapper around a tokenizers.Tokenizer.N	tokenizermultilingualtasklanguagec                    || _         |r|t          vr.t          d|dd                    t                    d          |t          vr.t          d|dd                    t                    d          | j                             d|z            | _        | j                             d|z            | _        || _        d S d | _        d | _        d| _        d S )N'z'' is not a valid task (accepted tasks: z, )z9' is not a valid language code (accepted language codes: z<|%s|>en)	r	   _TASKS
ValueErrorjoin_LANGUAGE_CODEStoken_to_idr   r   language_code)selfr	   r
   r   r   s        X/var/www/html/auto_sub_bot/venv/lib/python3.11/site-packages/faster_whisper/tokenizer.py__init__zTokenizer.__init__   s     # 	&6!! jttTYYv....0  
 .. jxx?!;!;!;!;=  
 228d?CCDI N66x(7JKKDM!)DDI DM!%D    returnc                 6    | j                             d          S )Nz<|transcribe|>r	   r   r   s    r   
transcribezTokenizer.transcribe*   s    ~))*:;;;r   c                 6    | j                             d          S )Nz<|translate|>r   r   s    r   	translatezTokenizer.translate.       ~))/:::r   c                 6    | j                             d          S )Nz<|startoftranscript|>r   r   s    r   sotzTokenizer.sot2   s    ~))*ABBBr   c                 6    | j                             d          S )Nz<|startoflm|>r   r   s    r   sot_lmzTokenizer.sot_lm6   r"   r   c                 6    | j                             d          S )Nz<|startofprev|>r   r   s    r   sot_prevzTokenizer.sot_prev:   s    ~))*;<<<r   c                 6    | j                             d          S )Nz<|endoftext|>r   r   s    r   eotzTokenizer.eot>   r"   r   c                 6    | j                             d          S )Nz<|notimestamps|>r   r   s    r   no_timestampszTokenizer.no_timestampsB   s    ~))*<===r   c                 j    | j                             d          p| j                             d          S )Nz<|nospeech|>z<|nocaptions|>r   r   s    r   	no_speechzTokenizer.no_speechF   s6    ~)).99 
T^=W=W>
 >
 	
r   c                     | j         dz   S )N   )r,   r   s    r   timestamp_beginzTokenizer.timestamp_beginL   s    !A%%r   c                     | j         g}| j        |                    | j                   | j        |                    | j                   |S )N)r$   r   appendr   )r   sequences     r   sot_sequencezTokenizer.sot_sequenceP   sH    H:=$OODM***9 OODI&&&r   textc                 D    | j                             |d          j        S )NF)add_special_tokens)r	   encodeids)r   r6   s     r   r9   zTokenizer.encode\   s     ~$$Te$DDHHr   tokensc                 T      fd|D             } j                             |          S )Nc                 *    g | ]}|j         k     |S  )r*   ).0tokenr   s     r   
<listcomp>z$Tokenizer.decode.<locals>.<listcomp>`   s%    EEEEDH4D4Du4D4D4Dr   )r	   decode)r   r;   text_tokenss   `  r   rB   zTokenizer.decode_   s2    EEEE&EEE~$$[111r   c                     g g}|D ]e}| j         k    r=d| j         z
  dz  dd}|                    |           |                    g            J|d                             |           fd                     fd|D                       S )Nz<|g{Gz?z.2fz|> c                 r    g | ]3}t          |t                    r|nj                            |          4S r>   )
isinstancestrr	   rB   )r?   sr   s     r   rA   z4Tokenizer.decode_with_timestamps.<locals>.<listcomp>o   s<    TTTq*Q$$BQQ$.*?*?*B*BTTTr   )r1   r3   r   )r   r;   outputsr@   	timestamps   `    r   decode_with_timestampsz Tokenizer.decode_with_timestampsc   s    $ 	* 	*E,,,N%$*>">$!FNNNN	y)))r""""""5))))wwTTTTGTTT
 
 	
r   c                 4   t          d          }|d                                z  }t          d          }t          d |D                       sJ |                     d          d         |                     d          d         h}|t          |          z   D ]d}|                     |          |                     d|z             fD ]4}t          |          d	k    s||v r|                    |d                    5et          t          |                    S )
u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c              3   P   K   | ]!}d t          |          cxk    odk    nc V  "dS )i@&  i&  N)ord)r?   cs     r   	<genexpr>z.Tokenizer.non_speech_tokens.<locals>.<genexpr>   sD      EE!6SVV----v----EEEEEEr   z -r   z ' r0   )	listsplitsetallr9   lenaddtuplesorted)r   symbolsmiscellaneousresultsymbolr;   s         r   non_speech_tokenszTokenizer.non_speech_tokensr   s*    =>>Z``bb	
 344EE}EEEEEEEE ++d##A&D(9(9!(<=] 3 33 	* 	*FF##C&L)) * * v;;!##v'>'>JJvay)))* VF^^$$$r   c                 h    | j         dv r|                     |          S |                     |          S )N>   jalomythzhyue)r   split_tokens_on_unicodesplit_tokens_on_spaces)r   r;   s     r   split_to_word_tokenszTokenizer.split_to_word_tokens   s=     !FFF //777**6222r   c                    |                      |          }d}g }g }g }d}|D ]}|                    |           |                      |          }		 |	                    |          }
|
|z  }
n# t          $ r d }
Y nw xY w|
|
t	          |          k     rJ||
         |k    r>|                    |	           |                    |           g }|t	          |	          z  }||fS )Nu   �r   )rM   r3   indexr   rX   )r   r;   decoded_fullreplacement_charwordsword_tokenscurrent_tokensunicode_offsetr@   decodedreplacement_char_indexs              r   rh   z!Tokenizer.split_tokens_on_unicode   s(    226::# 	/ 	/E!!%(((11.AAG.)07G)H)H&&.8&& . . .)-&&&. &-&\):)::: !78<LLLW%%%"">222!##g,,.k!!s   A**A98A9c                    |                      |          \  }}g }g }t          ||          D ]\  }}|d         | j        k    }|                    d          }	|                                t
          j        v }
|s|	s|
st          |          dk    r+|                    |           |                    |           |d         |z   |d<   |d         	                    |           ||fS )Nr   rS   rE   )
rh   zipr*   
startswithstripstringpunctuationrX   r3   extend)r   r;   subwordssubword_tokens_listro   rp   subwordsubword_tokensspecial
with_spacerz   s              r   ri   z Tokenizer.split_tokens_on_spaces   s    )-(D(DV(L(L%%'*85H'I'I 		7 		7#G^$Q'483G ++C00J!--//V-??K 7* 7 7s5zzQW%%%"">2222!"I/b	B&&~6666k!!r   )NN) __name__
__module____qualname____doc__
tokenizersr   boolr   rI   r   r   intr   r!   r$   r&   r(   r*   r,   r.   propertyr1   r   r5   r9   rB   rM   r   r`   rj   rh   ri   r>   r   r   r   r   	   sZ       77 #"&& &'& & sm	&
 3-& & & &< <C < < < _< ;3 ; ; ; _; CS C C C _C ; ; ; ; _; =# = = = _= ;S ; ; ; _; >s > > > _> 
3 
 
 
 _

 & & & & X& 	d3i 	 	 	 X	I3 I49 I I I I2T#Y 23 2 2 2 2
T#Y 
3 
 
 
 
 !%5: !% !% !% _!%F	33i	3	tCy$tCy/)	*	3 	3 	3 	3"3i"	tCy$tCy/)	*" " " "@"3i"	tCy$tCy/)	*" " " " " "r   r   )r   r!   )dafamarasazbabebgbnbobrbscacscydadeelr   eseteufafifofrglguhahawhehihrhthuhyidisitrb   jwkakkkmknkolalblnrc   ltlvmgmimkmlmnmrmsmtrd   nenlnnnoocpaplpsptrorusasdsiskslsnsosqsrsusvswtatetgre   tktltrttukuruzviyiyorf   rg   )ry   	functoolsr   typingr   r   r   r   r   r   r   r>   r   r   <module>r      s     % % % % % % ( ( ( ( ( ( ( ( ( (    J" J" J" J" J" J" J" J"Z

er   