
    ^hG                        d Z ddlZddlZddlZddlZddlmZ ddlZddlZddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ dd	l m!Z!  ej"        d
          Z# ej$        d          Z$dej%        vr e&e$          ej%        d<   ddl'Z'ddl(m)Z)m*Z*m+Z+ d Z,d Z-de.de.fdZ/d Z0d Z1d Z2e3dk    r e2             dS dS )a   Benchmarking the inference of pretrained transformer models.
PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
One difference is that random input_ids is generated in this benchmark.

For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.

Example commands:
    Export all models to ONNX, optimize and validate them:
        python benchmark.py -b 0 -o -v -i 1 2 3
    Run OnnxRuntime on GPU for all models:
        python benchmark.py -g
    Run OnnxRuntime on GPU for all models with fp32 optimization:
        python benchmark.py -g -o
    Run OnnxRuntime on GPU with fp16 optimization:
        python benchmark.py -g -o -p "fp16"
    Run TorchScript on GPU for all models:
        python benchmark.py -e torchscript -g
    Run TorchScript on GPU for all models with fp16:
        python benchmark.py -e torchscript -g -p "fp16"
    Run ONNXRuntime and TorchScript on CPU for all models with quantization:
        python benchmark.py -e torchscript onnxruntime -p "int8" -o
    Run OnnxRuntime with the ROCM provider and graph optimization script:
        python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
    Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
        python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

It is recommended to use run_benchmark.sh to launch benchmark.
    N)datetime)ConfigModifierOptimizerInfo	Precisioncreate_onnxruntime_sessionget_latency_resultinference_ortinference_ort_with_io_bindingoutput_detailsoutput_fusion_statisticsoutput_summarysetup_logger)FusionOptions)MODEL_CLASSESMODELS)create_onnxruntime_inputexport_onnx_model_from_ptexport_onnx_model_from_tfload_pretrained_model)version)QuantizeHelper F)logicalOMP_NUM_THREADS)
AutoConfigAutoTokenizerLxmertConfigc                 	   dd l }g }| rtd|                                vr^d|                                vrHd|                                vr2d|                                vrt                              d           |S d}|dk    r@t          j        }d}d	|                                vrt                              d
           |S |t          j        k    rt                              d| d           |D ]}t          |         d         }|
D ]}|t          |          k    r n|d |         }t          |         d         |_	        t          j        |          }d|v rt          j                    5  t          |t          |         d         t          |         d         t          |         d         |||||| |||||||          \  }} }!}"d d d            n# 1 swxY w Y   d|v rWt          |t          |         d         t          |         d         t          |         d         |||||| |||||||          \  }} }!}"| s4t!          || |d|||          }#|#Nd |#                                D             }$g }%| rdnd}&t%          j        ||          }'t)          j        t-          |          t-          |          t-          |!|'j                  g          }(t)          j        t-          |          |'j        g          })|D ]}*|*dk    r
|D ]}+|"|+|"k    rd|v rt(          j        nt(          j        },t5          |!|*|+||'|,          }-d|j        ||&||| ||||*|+|                                t;          t=          j                              d}.|'j	        dv r/t                               d| d|*d|'j!        |'j!        g            n"t                               d| d|*|+g            |rtE          |#|-|.|	|*|          }/n|##                    |$|-          }0|(g}1tI          t          |0                    D ]J}2|2dk    r-t          |         d         dk    r|1%                    |)           5|1%                    |(           Kd|v rt(          j&        nt(          j'        }3tQ          |#|-|.|	|$|0|%|1|*|&|3|          }/t                               |/           |%                    |/           ې|S )Nr   CUDAExecutionProviderMIGraphXExecutionProviderROCMExecutionProviderDmlExecutionProviderzPlease install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance.tensorrt   TensorrtExecutionProviderzhPlease install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance.zOptimizerInfo is set to zA, graph optimizations specified in FusionOptions are not applied.   pt      tfT)enable_all_optimizationnum_threadsverbose(enable_mlas_gemm_fastmath_arm64_bfloat16c                     g | ]	}|j         
S  )name).0node_args     b/var/www/html/auto_sub_bot/venv/lib/python3.11/site-packages/onnxruntime/transformers/benchmark.py
<listcomp>z#run_onnxruntime.<locals>.<listcomp>   s    XXX(XXX    cudacpu	cache_dironnxruntimeenginer   	providersdevice	optimizer	precision
io_binding
model_nameinputsthreads
batch_sizesequence_lengthcustom_layer_numr   vitswinzRun onnxruntime on  with input shape gpt))r;   get_available_providersloggererrorr   NOOPTwarningr   len
model_typer   parsetorchno_gradr   r   r   get_outputsr   from_pretrainednumpyprodmaxhidden_sizeint64int32r   __version__get_layer_numstrr   nowinfo
image_sizer	   runrangeappendlonglongintcr
   )4use_gpuprovidermodel_namesmodel_classconfig_modifierrA   r,   batch_sizessequence_lengthsrepeat_timesinput_countsoptimizer_infovalidate_onnxr:   onnx_dirr-   	overwritedisable_ort_io_bindinguse_raw_attention_maskmodel_fusion_statisticsmodel_source(enable_arm64_bfloat16_fastmath_mlas_gemmargsr;   resultswarm_up_repeatrC   all_input_names
num_inputsinput_namesfusion_optionsonnx_model_fileis_valid_onnx_model
vocab_sizemax_sequence_lengthort_sessionort_output_namesoutput_buffersr?   configmax_last_state_sizemax_pooler_sizerF   rG   input_value_type
ort_inputsresult_templateresultort_outputsoutput_buffer_max_sizesi	data_types4                                                       r4   run_onnxruntimer   Y   s   2 G
$K,O,O,Q,QQQ(0S0S0U0UUU$K,O,O,Q,QQQ#;+N+N+P+PPP ]	
 	
 	
 N:&,&k.Q.Q.S.SSSLLz   N,,,x~xxx	
 	
 	
 " e+ e+
 ,Q/& c	+ c	+JC0000)+:+6K$Z03DO*066N|##]__   2"z*1-z*1-z*1-#'! #!&%.!/&# '+"+              2 |## .:&q):&q):&q)#"!*+"# #''* ' 4(,'9a  K "XXk>U>U>W>WXXXN&1VVEF/
iPPPF"'*$$())
F$677# # $j#k*:*:F<N)OPPO) L+ L+
??'7 I+ I+O*6?M`;`;` 6:l6J6Ju{{PUP[$!9""'#(" "J #0#.#:%-"(%3%.*@&@&0",#.&0+:,;,I,I,K,K$'$7$7' 'O" (O;; H*  H  HQ[]^`f`qsy  tE  QF  H  H    $w*$w$wYcetXu$w$wxxx- "!.'&+(&*" " '2oo6F
&S&S3F2G/!&s;'7'7!8!8 T TA Avv&*<Q*?5*H*H 7 > > O O O O 7 > >?R S S S S6:l6J6JENNPUPZ	!>'&+(,'*3&"%*" " KK'''NN6****SI+L+\ Ns   3AGGGc                    g }| r:t           j                                        st                              d           |S t          j        d           |D ]}}t          j        ||	|          }|                    |           t          ||||          }|j
        dv r
|d         g}nt          j        ||          }|j        }t                              d|            t                              d	|                                            |t          j        k    r|                                 t          j        | rd
nd          }|                    |           |t          j        k    rt+          j        |          }|D ]F}|dk    r
|D ]8}|j
        dv rzt                              d| d|d|j        |j        g            t          j        |d|j        |j        f|t          j        k    rt           j        nt           j        |          nX|||k    rt                              d| d||g            t          j        d|j        dz
  ||ft           j        |          	 |	r t           j                             |          n|
rt          j!        |          n|            tE          j#        fd|d          }|	rdn|
rdndt           j$        d| rdndd|d|d||||%                                tM          tO          j(                              d}|)                    tU          ||                     t                              |           |+                    |           # tX          $ rC}t          -                    |           t           j        .                                 Y d }~2d }~ww xY wH|S )NzYPlease install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.F)torchscriptr:   )r   r:   custom_model_classrI   r   r9   zModel zNumber of parameters zcuda:0r8   zRun PyTorch on rL   r&   )sizedtyper?   r(   )lowhighr   r   r?   c                                  S Nr0   )	inference	input_idss   r4   <lambda>zrun_pytorch.<locals>.<lambda>  s    YYy5I5I r6   repeatnumberr   torch2rV   NAr7   r   r<   )/rV   r7   is_availablerO   rP   set_grad_enabledr   rY   modifyr   rT   r   model_max_lengthdebugnum_parametersr   FLOAT16halfr?   toINT8r   quantize_torch_modelrd   re   randnfloat16float32randintr   longjittracecompiletimeitr   r`   ra   rb   r   rc   updater   rh   RuntimeError	exceptionempty_cache)rk   rm   rn   ro   rA   r,   rp   rq   rr   r   r   r:   r-   r~   rC   r   model	tokenizermax_input_sizer?   rF   rG   runtimesr   er   r   s                            @@r4   run_pytorchr   :  sU    G uz..00 pqqq	5!!!! U- U-
+JK[deeev&&&%*	
 
 
 // 0 34%5jIVVVI&7N%e%%&&&EU-A-A-C-CEEFFF	)))JJLLL'<hhu==	&&"7>>E% 7	- 7	-JQ#3 3- 3-$77KK*UVX^Xikqk|H}   !&(!V->@QR/8I<M/M/MemmSXS`%! ! !II &1o6V6V KK o* o oQ[]lPm o oppp %#.2(/:#j%! ! !I-=Hw	y999flNwem\aNbNbNbrw  Ii(((%}-I-I-I-I-IR^ghiiiH 4?"c--PVDcHH\c#(#4%),3">&&%'%.&(&0"##.&0+:,;,I,I,K,K$'$7$7 F  MM"4Xz"J"JKKKKK'''NN6****# - - -$$Q'''J**,,,,,,,,-c3-	7	-r Ns   7D	N
O	8O
	
O	do_eager_modeuse_xlac                 2     ddl m dd l fd}|S )Nr   )wrapsc                                   fd            }                                            fd                        }du rdu s
J d            |S |S )Nc                       | i |S r   r0   r}   kwargsfuncs     r4   run_in_eager_modezFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_eager_mode  s    4((((r6   )experimental_compilec                       | i |S r   r0   r   s     r4   run_in_graph_modezFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_graph_mode  s     4((((r6   TFzcCannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`.)function)r   r   r   r   r*   r   r   s   `  r4   run_funcz+run_with_tf_optimizations.<locals>.run_func  s    	t	) 	) 	) 	) 
	) 
t	'	2	2	) 	) 	) 	) 
3	2 
	) D  e###u $## %$$$r6   )	functoolsr   
tensorflow)r   r   r   r*   r   s   `` @@r4   run_with_tf_optimizationsr     sS    % % % % % % % %$ Or6   c                    ! g }dd l !!j        j                            |           | s!j                            g d           | r5!j                                        st                              d           |S | r!j        	                    d          }	 !j                            |d         d           !j        j
                            |d         d           !j                            d           n1# t          $ r$}t                              |           Y d }~nd }~ww xY w|t           j        k    s|t           j        k    rt'          d          |D ]}t)          j        ||	          |                               t/          ||	|d	          t1          j        ||	          }|j        }|D ]/}|dk    r
|D ]!}|||k    rt                              d
| d||g            dd l}|                                  fdt;          ||z            D             }!                    |||f!j                  	 tA          dd          fd            }tA          dd          fd            }tA          dd          !fd            }|j!        r|ntE          tF                    r|              tI          j%        fd|d          }d!j&        d| rdndd|d|d||||'                                tQ          tS          j*                              d}|+                    tY          ||                     t                              |           |-                    |           # t          $ rS}t                              |           ddl.m/} |0                                }|1                                 Y d }~d }~ww xY w1|S )Nr   GPUzVPlease install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.Tz/gpu:0)r?   z+Mixed precision is currently not supported.r9   )r   r:   r   is_tf_modelzRun Tensorflow on rL   c                 L    g | ] }                     d j        dz
            !S )r   r(   )r   r   )r2   r   r   rngs     r4   r5   z"run_tensorflow.<locals>.<listcomp>  s/    mmmA#++a):Q)>??mmmr6   )shaper   F)r   r   c                        d          S )NF)trainingr0   r   r   s   r4   encoder_forwardz'run_tensorflow.<locals>.encoder_forward  s    $uY????r6   c                          d          S )NF)decoder_input_idsr   r0   r   s   r4   encoder_decoder_forwardz/run_tensorflow.<locals>.encoder_decoder_forward  s    $uY)V[\\\\r6   c                      j                             ddj        g          } j                             ddj        g          } | |d          S )Nr(   F)visual_feats
visual_posr   )randomnormalvisual_feat_dimvisual_pos_dim)featsposr   r   r   r*   s     r4   lxmert_forwardz&run_tensorflow.<locals>.lxmert_forward  sf     "	 0 0!Q8N1O P P i..1f6K/LMM$u%).'*%*	      r6   c                                    S r   r0   )r   s   r4   r   z run_tensorflow.<locals>.<lambda>'  s    YY[[ r6   r(   r   r   r   r7   r8   r   r<   )r7   )2r   r   	threading set_intra_op_parallelism_threadsset_visible_devicestestis_built_with_cudarO   rP   list_physical_devicesexperimentalset_memory_growth
distributeOneDeviceStrategyr   r   r   r   r   NotImplementedErrorr   rY   r   r   r   r   rd   r   Randomrg   constantr_   r   is_encoder_decoder
isinstancer   r   r   r`   ra   rb   r   rc   r   r   rh   numbar7   get_current_devicereset)"rk   rm   rn   ro   rA   r,   rp   rq   rr   r:   r-   r~   physical_devicesr   rC   r   r   rF   rG   r   valuesr   r   r   r   r   r7   r?   r   r   r   r   r   r*   s"                               @@@@@@r4   run_tensorflowr    s     GI88EEE 1
	%%b%000 rw1133 mnnn  9::5AA	 I))*:1*=uEEEI"445Ea5H$OOOM++8+<<<< 	  	  	 Q	  I%%%in)D)D!"OPPP! W# W#
+J)LLLv&&&%*
 
 
 "1*	RRR	"3% G	# G	#JQ#3 C# C#!-/N2R2RnnnPZ\kOlnnooommoommmmmz\kOkIlIlmmmKKz?6S[][cKdd	7#.UERRR@ @ @ @ @ SR@ /UERRR] ] ] ] ] SR] /UERRR       SR !0I0 3$;		#FL99 3$2	IKKK%}-@-@-@-@^_```H #/#%>%),3">&&%'%.&(&0"##.&0+:,;,I,I,K,K$'$7$7 F  MM"4Xz"J"JKKKKK'''NN6****# # # #$$Q'''******!4466FLLNNNNNNNN#}C#	G	#R Ns3   A"D   
D.
D))D.D<N
O/	AO*	*O/	c                  N   t          j                    } |                     ddddt          g dt	          t          j                              dd                    t          j                              z              |                     d	dd
t          dddgd           |                     ddt          d t	          t                    dd                    t                    z              |                     ddddt          dgg dd           |                     dddt          t          j
                            dd          d           |                     ddt          t          j
                            dd          d           |                     dd dd!d"#           |                     d$dt          d d%           |                     d&d't          t          j        t	          t                    d()           |                     d*dd!d+#           |                     d,dd!d-#           |                     d.d/t          t          j        t	          t                    d0)           |                     d1d2dd!d3#           |                     d4d5dd d67           |                     d8d9dd d:7           |                     d;d<dd d=7           |                     d>d?ddd
gt          g d@dAB           |                     dCdDddEt          dFG           |                     dHdIdt          d
gJ           |                     dKdLdt          g dMJ           |                     dNdd!dO#           |                     dP           |                     dQdRddt          dSgdTU           |                     dVdt          d dW           |                     dXdd!dY#           |                     dZ           t#          j        |            |                                 }|S )[Nz-mz--modelsF+)zbert-base-casedzroberta-basegpt2z Pre-trained models in the list: z, )requirednargstypedefaultchoiceshelpz--model_sourcer(   r'   r*   zExport onnx from pt or tfz--model_classz!Model type selected in the list: )r  r
  r  r  r  z-ez	--enginesr;   )r;   rV   r   r   r   zEngines to benchmarkz-cz--cache_dir.cache_modelsz%Directory to cache pre-trained models)r  r
  r  r  z
--onnx_dironnx_modelszDirectory to store onnx modelsz-gz	--use_gpu
store_truezRun on gpu device)r  actionr  z
--providerzExecution provider to usez-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r
  r  r  r  z	--verbosezPrint more informationz--overwritezOverwrite existing modelsz-oz--optimizer_infozjOptimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_optz-vz--validate_onnxzValidate ONNX modelz-fz--fusion_csvz:CSV file for saving summary results of graph optimization.)r  r  r  z-dz--detail_csvz#CSV file for saving detail results.z-rz--result_csvz$CSV file for saving summary results.z-iz--input_counts)r(   r)   r&   zXNumber of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.)r  r	  r  r
  r  r  z-tz--test_timesd   z8Number of repeat times to get average inference latency.)r  r  r
  r  z-bz--batch_sizes)r	  r
  r  z-sz--sequence_lengths)             @         z--disable_ort_io_bindingz=Disable running ONNX Runtime with binded inputs and outputs. )rx   z-nz--num_threadsr   zThreads to use)r  r	  r
  r  r  z--force_num_layersz%Manually set the model's layer numberz*--enable_arm64_bfloat16_fastmath_mlas_gemmzHEnable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP )r|   )argparseArgumentParseradd_argumentrb   listr   keysjoinr   ospathr   FLOAT32r   BYSCRIPTintset_defaultsr   add_arguments
parse_args)parserr}   s     r4   parse_argumentsr*  F  sI   $&&F
;;;V[]]##/$))FKMM2J2JJ  	 	 	 t(     ]##0499]3K3KK     OOO#  	 	 	 S.114     S-00-     kE,Uhiii
(     !Yu     eLOghhh
(	     &]##y     "     I     2     3     		g  	 	 	 G     oSsQCPPP
,,,     "L	     u555
     4     4W	     GGG'''DKr6   c                  
   t                      } t          | j                   | j        t          j        k    r#| j        st                              d           d S | j        t          j	        k    r,| j        r%| j
        dvrt                              d           d S t          | j                  dk    r(t          | j        d                  d         dv rdg| _        t          d	 | j        D                       | _        t                              d
|             t$          j                            | j                  sK	 t%          j        | j                   n0# t.          $ r# t                              d| j                   Y nw xY wd| j        v }d| j        v }d| j        v }d| j        v }d| j        v }|r]t3          j        t6          j                  t3          j        d          k     r)t                              dt6          j                    d S t;          | j                  }g }| j        D ]h}t7          j        |           t                               t6          j!        "                                           |s|s|r| j#        dgk    rt          $                    d           |rK|tK          | j        | j        | j&        || j        || j'        | j        | j(        dd| j        | j                  z  }|rK|tK          | j        | j        | j&        || j        || j'        | j        | j(        dd| j        | j                  z  }|rK|tK          | j        | j        | j&        || j        || j'        | j        | j(        dd| j        | j                  z  }|rI|tS          | j        | j        | j&        || j        || j'        | j        | j(        | j        | j                  z  }i }	|r	 | j*         }
|tW          | j        | j
        | j        | j&        || j        || j'        | j        | j(        | j#        | j,        | j-        | j        | j.        | j        | j/        | j0        |
|	| j1        | j2        |           z  }=# tf          $ r t          4                    d           Y dw xY wjtk          j6                    7                    d          }|	r| j8        pd| d}ts          |	|           t          |          dk    r(| j'        dgk    rt          $                    d           d S | j:        pd| d}tw          ||           | j<        pd| d}t{          |||            d S )Nzfp16 is for GPU only)migraphxrocmzint8 is for CPU onlyr(   r   r&   )rJ   swimr   c                 ,    h | ]}|d k    rt           n|S )r   )	cpu_count)r2   xs     r4   	<setcomp>zmain.<locals>.<setcomp>  s$    TTTaAFFyyTTTr6   zArguments: z#Creation of the directory %s failedrV   r   r   r;   r   z2.0.0z2PyTorch version must be >=2.0.0 and you are using zB--input_counts is not implemented for torch or torchscript engine.TF	Exceptionz%Y%m%d-%H%M%Sbenchmark_fusion_z.csvzNo any result available.benchmark_detail_benchmark_summary_)>r*  r   r-   rA   r   r   rk   rO   rP   r   rl   rS   modelsr   rq   sortedr,   rd   r!  r"  existsr:   mkdirOSErrorenginesr   rU   rV   r`   r   force_num_layersset_num_threadsr   
__config__parallel_infors   rR   r   rn   rp   
test_timesr  use_mask_indexr   rt   ru   rv   rw   rx   r{   r|   r3  r   r   rc   strftime
fusion_csvr   
detail_csvr   
result_csvr   )r}   enable_torchenable_torch2enable_torchscriptenable_onnxruntimeenable_tensorflowro   r~   r,   rz   ry   
time_stampcsv_filenames                r4   mainrN    s   D~***4<*+,,,~''DL'T]Rf=f=f+,,,
4;1A!7!:o!M!M!#TT4CSTTTUUD
KK$d$$%%%7>>$.)) P	PHT^$$$$ 	P 	P 	PLL>OOOOO	P dl*L,M&$,6&$,6$4 u'899GM'<R<RRR]%J[]]^^^$T%:;;OG' g. g.k***U%3355666 5	= 5	,> 5	 QC''cddd! ;LK$#N$)ONL     ;LK$#N$)ONL     ;LK$#N$)ONL     	~  %  G #% 	..-1-@)@&?LMK$#N$)O%'&NMLN/*+%A/  2  . . .  -----.9	.> ((99J HN*Nj*N*N*N !8,GGG
7||qs""NN5666?J&J*&J&J&JL7L)))?K&K:&K&K&KL7L$/////s%   E *FF%B
Q11$RR__main__)4__doc__r  loggingr!  r   r   rZ   psutilbenchmark_helperr   r   r   r   r   r	   r
   r   r   r   r   r   r   huggingface_modelsr   r   onnx_exporterr   r   r   r   	packagingr   quantize_helperr   	getLoggerrO   r0  environrb   rV   transformersr   r   r   r   r   boolr   r  r*  rN  __name__r0   r6   r4   <module>r]     s    :   				                                   ) ( ( ( ( ( 4 4 4 4 4 4 4 4                  * * * * * *		2		FU+++	 BJ&&$'C	NNBJ !  @ @ @ @ @ @ @ @ @ @^ ^ ^Bm m m`T D    4  DE E EP_0 _0 _0D zDFFFFF r6   