
    o i0                        d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	 d dl
Zd dlmZ e G d d                      Z	 	 ddej        d	ee         d
edee         fdZd ed          fdej        dee         d
edede	eej                 eeeef                  f         f
dZ G d d          Zej        d             Z G d d          ZdS )    N)	dataclass)DictListOptionalTuple)get_assets_pathc                   z    e Zd ZU dZdZeed<   dZeed<   dZe	ed<    ed          Z
eed	<   d
Ze	ed<   dZe	ed<   dS )
VadOptionsar  VAD options.

    Attributes:
      threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
      neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
        than neg_threshold, it is always considered silence. Values higher than neg_threshold
        are only considered speech if the previous sample was classified as speech; otherwise,
        they are treated as silence. This parameter helps refine the detection of speech
         transitions, ensuring smoother segment boundaries.
      min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
      max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
        split aggressively just before max_speech_duration_s.
      min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
        before separating it
      speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
    g      ?	thresholdNneg_thresholdr   min_speech_duration_msinfmax_speech_duration_si  min_silence_duration_msi  speech_pad_ms)__name__
__module____qualname____doc__r   float__annotations__r   r   intr   r   r        R/var/www/html/auto_sub_bot/venv/lib/python3.11/site-packages/faster_whisper/vad.pyr
   r
      s          * IuM5"#C####(5<<5///#'S'''M3r   r
   >  audiovad_optionssampling_ratereturnc           	         |t          di |}|j        }|j        }|j        }|j        }|j        }d}	|j        }
||z  dz  }||
z  dz  }||z  |	z
  d|z  z
  }||z  dz  }|dz  dz  }t          |           }t                      }t          j
        | d|	| j        d         |	z  z
  f          } ||          }d}g }i }|t          |dz
  d	          }d}dx}}t          |          D ]\  }}||k    r|rd}||k     r|	|z  }||k    r|sd
}|	|z  |d<   .|rm|	|z  |d         z
  |k    r[|r1||d<   |                    |           i }||k     rd}n||d<   dx}x}}n(|	|z  |d<   |                    |           i }dx}x}}d}||k     r_|r]|s|	|z  }|	|z  |z
  |k    r|}|	|z  |z
  |k     r||d<   |d         |d         z
  |k    r|                    |           i }dx}x}}d}|r)||d         z
  |k    r||d<   |                    |           t          |          D ]\\  }}|dk    r)t          t          d|d         |z
                      |d<   |t          |          dz
  k    r||dz            d         |d         z
  }|d|z  k     r_|dxx         t          |dz            z  cc<   t          t          d||dz            d         |dz  z
                      ||dz            d<   t          t!          ||d         |z                       |d<   t          t          d||dz            d         |z
                      ||dz            d<   3t          t!          ||d         |z                       |d<   ^|S )a  This method is used for splitting long audios into speech chunks using silero VAD.

    Args:
      audio: One dimensional float array.
      vad_options: Options for VAD processing.
      sampling rate: Sampling rate of the audio.
      kwargs: VAD options passed as keyword arguments for backward compatibility.

    Returns:
      List of dicts containing begin and end samples of each speech chunk.
    N   i     b   r   Fg333333?g{Gz?Tstartend   r   )r
   r   r   r   r   r   r   lenget_vad_modelnppadshapemax	enumerateappendr   min)r   r   r   kwargsr   r   r   r   r   window_size_samplesr   min_speech_samplesspeech_pad_samplesmax_speech_samplesmin_silence_samples!min_silence_samples_at_max_speechaudio_length_samplesmodelpadded_audiospeech_probs	triggeredspeechescurrent_speechtemp_endprev_end
next_startispeech_probspeechsilence_durations                                 r   get_speech_timestampsrF   -   s   "  **6**%I-M(?'=)A-M&)??$F&6=--
	
 
 	! 
 (*AADH(5(:T(A%u::OOE6&Q:M)MMN L 5&&LIHNI,d33 HHz#L11 2 2;9$$($HH$$014
9$$i$I&9A&=N7# 	$q(N7,CCFXXX (0u%///!#(( %II.8N7+3444:(;a(?u%///!#3444:!	-''Y' 3.2#a'836WWW##a'836III(0u%"5)N7,CC&' ' OON333!#3444:!	 	(!N7$;;?QQQ 4u'''x((  	666!#a;M)M"N"NOOF7OH!!!'Aw7&-G!&8"888u%5%:!;!;;+.8AE?736F!6KKLL, ,Q(( !$,fUm>P.PQQ! !u ,/8AE?736HHII, ,Q((  (&-:L*LMM F5MM Or   r   chunksmax_durationc                    |s*ddg d}t          j        g t           j                  g|gfS g }g }g }d}d}	t          j        g t           j                  }
|D ]}||d         z   |d         z
  ||z  k    re|                    |
           |	|z  ||z  |d}|	|z  }	|                    |           g }| |d         |d                  }
|d         |d         z
  }|                    |           t          j        |
| |d         |d                  f          }
||d         |d         z
  z  }|                    |
           |	|z  ||z  |d}|                    |           ||fS )zPThis function merges the chunks of audio into chunks of max_duration (s) length.r   )offsetdurationsegmentsdtyper&   r%   )r*   arrayfloat32r/   concatenate)r   rG   r   rH   chunk_metadataaudio_chunkschunks_metadatacurrent_segmentscurrent_durationtotal_durationcurrent_audiochunks               r   collect_chunksrZ      s     B
 

 2:.../.1AAALONHRrz222M > >uU|+eGn<]*+ + ...(=8,}<, N
 ..N"">222!!%.5<"?@M$U|eGn<##E***NeGnuU|&C DE M euW~ ==&&& !=0$}4$ N
 >***((r   c            	       t    e Zd ZdZddee         dedefdZ	 	 dd	ed
e	e         de
defdZdd	ede
defdZdS )SpeechTimestampsMapz3Helper class to restore original speech timestamps.r#   rG   r   time_precisionc                     || _         || _        g | _        g | _        d}d}|D ]X}||d         |z
  z  }|d         }| j                            |d         |z
             | j                            ||z             Yd S )Nr   r%   r&   )r   r]   chunk_end_sampletotal_silence_beforer/   )selfrG   r   r]   previous_endsilent_samplesrY   s          r   __init__zSpeechTimestampsMap.__init__   s    *, "$&! 	M 	MEeGn|;;N <L!((u)FGGG%,,^m-KLLLL	M 	Mr   NFtimechunk_indexis_endr    c                 |    ||                      ||          }| j        |         }t          ||z   | j                  S )N)get_chunk_indexr`   roundr]   )ra   re   rf   rg   r`   s        r   get_original_timez%SpeechTimestampsMap.get_original_time	  sF     ..tV<<K#8E)D0$2EFFFr   c                     t          || j        z            }|| j        v r|r| j                            |          S t	          t          j        | j        |          t          | j                  dz
            S )Nr'   )r   r   r_   indexr0   bisectr(   )ra   re   rg   samples       r   ri   z#SpeechTimestampsMap.get_chunk_index  sx    TD..//T***v*(..v666M$/88%&&*
 
 	
r   )r#   )NF)F)r   r   r   r   r   dictr   rd   r   r   boolrk   ri   r   r   r   r\   r\      s        ==M MtDz M# Ms M M M M& &*	
G 
G
G c]
G 	
G
 

G 
G 
G 
G
 
E 
4 
C 
 
 
 
 
 
r   r\   c                  x    t           j                            t                      d          } t	          |           S )zReturns the VAD model instance.zsilero_vad_v6.onnx)ospathjoinr   SileroVADModel)rt   s    r   r)   r)      s.     7<<))+?@@D$r   c                   6    e Zd Zd Z	 d	dej        dedefdZdS )
rv   c                     	 dd l }n"# t          $ r}t          d          |d }~ww xY w|                                }d|_        d|_        d|_        d|_        |                    |dg|          | _	        d S )Nr   z8Applying the VAD filter requires the onnxruntime packager'   F   CPUExecutionProvider)	providerssess_options)
onnxruntimeImportErrorRuntimeErrorSessionOptionsinter_op_num_threadsintra_op_num_threadsenable_cpu_mem_arenalog_severity_levelInferenceSessionsession)ra   rt   r}   eoptss        r   rd   zSileroVADModel.__init__(  s    	 	 	 	J 	
 ))++$%!$%!$)!"#"33-. 4 
 
s    
&!&r"   @   r   num_samplescontext_size_samplesc                    |j         dk    s
J d            |j        d         |z  dk    s
J d            t          j        dd          }t          j        dd          }t          j        d|fd          }|                    d|          }|d	| d f         }d|d<   t          j        |dd          }t          j        ||gd          }|                    d||z             }d
}|j        d         }	g }
t          d|	|          D ]E}| j        	                    d ||||z            ||d          \  }}}|

                    |           Ft          j        |
d          }|S )Nr'   zInput should be a 1D arrayr   z.Input size should be a multiple of num_samples)r'   r'      rP   rM   .i'  )inputhc)axis)ndimr,   r*   zerosreshaperollrQ   ranger   runr/   )ra   r   r   r   r   r   contextbatched_audioencoder_batch_sizenum_segmentsoutputsrB   outputouts                 r   __call__zSileroVADModel.__call__<  s    zQ <KN[(A---; .-- H[	222H[	222($%
 
 

 b+66&:%:%;%; ;<''1a(('?CC%--b+@T2TUU"$*1-q,(:;; 	# 	#A<++'A0B,B(BC!RSTT LFAq NN6""""nW1---
r   N)r"   r   )r   r   r   rd   r*   ndarrayr   r   r   r   r   rv   rv   '  s]        
 
 
* VX# #Z#.1#OR# # # # # #r   rv   )Nr   )rn   	functoolsrs   dataclassesr   typingr   r   r   r   numpyr*   faster_whisper.utilsr   r
   r   r   rp   rF   r   strrZ   r\   	lru_cacher)   rv   r   r   r   <module>r      s        				 ! ! ! ! ! ! . . . . . . . . . . . .     0 0 0 0 0 0        @ )-J J:J*%J J
 
$ZJ J J J` %,,	9) 9):9)J9) 9) 	9)
 4
T$sEz"23349) 9) 9) 9)x'
 '
 '
 '
 '
 '
 '
 '
T      8 8 8 8 8 8 8 8 8 8r   