
    /j5                       S SK Jr  S SKrS SKrS SKJr  S SKrS SKJ	r	  SSK
Jr  SSKJr  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJrJrJrJrJr   " S S\5      r " S S\5      r\ " S S5      5       r " S S5      r g)    )annotationsN)	dataclass)rtc   )utils)LanguageCode)logger)DEFAULT_API_CONNECT_OPTIONS	NOT_GIVENAPIConnectOptions
NotGivenOr)AudioByteStream   )STTRecognizeStream
SpeechDataSpeechEventSpeechEventTypec                     ^  \ rS rSrSS\SSS.           SU 4S jjjr\\S.       SS jjr\\S.     SS	 jjrS
r	U =r
$ )MultiSpeakerAdapter   TF{text}detect_primary_speakersuppress_background_speakerprimary_detection_optionsprimary_formatbackground_formatc                  > UR                   R                  (       d  [        S5      e[        TU ]  UR                   S9  Xl        X l        X0l        U=(       d
    [        5       U l	        XPl
        X`l        g)a  MultiSpeakerAdapter is an adapter that allows to detect and suppress background speakers.
It needs STT with diarization capability and works for a single audio track.

Args:
    stt (STT): STT instance to wrap
    detect_primary_speaker (bool, optional): Whether to detect primary speaker. Defaults to True.
    suppress_background_speaker (bool, optional): Whether to suppress background speaker. Defaults to False.
    primary_detection_options (NotGivenOr[PrimarySpeakerDetectionOptions], optional): Primary speaker detection options.
        If not provided, the default options will be used.
    primary_format (str, optional): Format for primary speaker.
        Supports {text} and {speaker_id} placeholders. Defaults to "{text}".
    background_format (str, optional): Format for background speaker.
        Supports {text} and {speaker_id} placeholders. Defaults to "{text}".

Raises:
    ValueError: If the STT does not support diarization.
z9MultiSpeakerAdapter needs STT with diarization capability)capabilitiesN)r    diarization
ValueErrorsuper__init___stt_detect_primary_suppress_backgroundPrimarySpeakerDetectionOptions_opt_primary_format_background_format)selfsttr   r   r   r   r   	__class__s          Y/app/agent/.venv/lib/python3.13/site-packages/livekit/agents/stt/multi_speaker_adapter.pyr$   MultiSpeakerAdapter.__init__   sc    6 ++XYYc&6&67	5$?!-Q1O1Q	-"3    languageconn_optionsc               R   #    U R                   R                  XUS9I S h  vN $  N7f)Nr2   )r%   	recognize)r,   bufferr3   r4   s       r/   _recognize_impl#MultiSpeakerAdapter._recognize_impl;   s(      YY((Q](^^^^s   '%'c               *    [        X R                  XS9$ )N)r-   wrapped_sttr3   r4   )MultiSpeakerAdapterWrapperr%   )r,   r3   r4   s      r/   streamMultiSpeakerAdapter.streamD   s     *))h
 	
r1   )r+   r&   r)   r*   r%   r'   )r-   r   r   boolr   r?   r   *NotGivenOr[PrimarySpeakerDetectionOptions]r   strr   rA   )r7   zutils.AudioBufferr3   NotGivenOr[str]r4   r   returnr   )r3   rB   r4   r   rC   r   )__name__
__module____qualname____firstlineno__r   r$   r
   r8   r=   __static_attributes____classcell__r.   s   @r/   r   r      s    
 (,,1PY&!)%4 %4 !%	%4
 &*%4 $N%4 %4 %4 %4V %.*E_!_ "	_
 (_ 
_ %.*E	
 "
 (	

 

 
r1   r   c                  F   ^  \ rS rSr        SU 4S jjrSS jrSrU =r$ )r<   O   c                  > [         TU ]  XS9  X l        X0l        [	        UR
                  UR                  UR                  UR                  UR                  S9U l
        g )N)r-   r4   r   )r#   r$   _wrapped_stt	_language_PrimarySpeakerDetectorr&   r'   r)   r*   r+   	_detector)r,   r-   r;   r3   r4   r.   s        r/   r$   #MultiSpeakerAdapterWrapper.__init__P   sW     	S<'!0#&#6#6(+(@(@&)hh..!44
r1   c                <  ^ #    SU 4S jjnSU 4S jjnT R                   R                  T R                  T R                  S9n[        R
                  " U" U5      SS9[        R
                  " U" U5      SS9/n [        R                  " U6 I S h  vN   [        R                  R                  " U6 I S h  vN   UR                  5       I S h  vN   g  NB N N	! [        R                  R                  " U6 I S h  vN    UR                  5       I S h  vN    f = f7f)Nc                  >#    TR                     S h  vN n[        U[        R                  5      (       a.  U R	                  U5        TR
                  R                  U5        MV  [        UTR                  5      (       d  Ms  U R                  5         M   N
 [        R                  " [        5         U R                  5         S S S 5        g ! , (       d  f       g = f7f)N)	_input_ch
isinstancer   
AudioFrame
push_framerQ   
push_audio_FlushSentinelflush
contextlibsuppressRuntimeError	end_input)r=   framer,   s     r/   _forward_input7MultiSpeakerAdapterWrapper._run.<locals>._forward_inpute   s     #~~ #eeS^^44%%e,NN--e4t':':;;LLN#~ $$\2  " 322sD   CBBBA'CCBC2C	C
CCc                j  >#    U   S h  vN nTR                   R                  U5      nUb  TR                  R                  U5        MD  UR                  [
        R                  :X  d  Md  TR                  R                  [        [
        R                  [        [        S5      SS9/S95        M   N
 g 7f)N )r3   text)typealternatives)
rQ   on_stt_event	_event_chsend_nowaitrf   r   FINAL_TRANSCRIPTr   r   r   )r=   ev
updated_evr,   s      r/   _forward_output8MultiSpeakerAdapterWrapper._run.<locals>._forward_outputp   s     " b!^^88<
)NN..z:WW @ @@NN..#!0!A!A*4l2>NUW*X)YFs-   B3B1B/B1AB3*AB3/B11B3r2   z'DiarizationAdapterWrapper.forward_input)namez(DiarizationAdapterWrapper.forward_output)r=   r   rC   None)rN   r=   rO   _conn_optionsasynciocreate_taskgatherr   aiocancel_and_waitaclose)r,   ra   rn   r=   taskss   `    r/   _runMultiSpeakerAdapterWrapper._rund   s     		#	 ""))4>>PTPbPb)cv&-V '.X	
	"..%((())++U333--/!! )3! ))++U333--/!!sl   A5D9C CC  D4C5DCDC DD!D9C<:DDDD)rQ   rO   rN   )r-   r   r;   r   r3   rB   r4   r   )rC   rq   )rD   rE   rF   rG   r$   rz   rH   rI   rJ   s   @r/   r<   r<   O   s8    
 
 	

 "
 (
((" ("r1   r<   c                      \ rS rSr% SrSrS\S'    SrS\S'    S	rS\S
'    Sr	S\S'    Sr
S\S'    SrS\S'    SrS\S'   Srg)r(      z+Configuration for primary speaker detectiond   intframe_size_msg      ^@floatrms_buffer_duration   min_rms_samplesg      ?rms_smoothing_factorg?threshold_multiplier<   decay_to_equal_timethreshold_min_multiplier N)rD   rE   rF   rG   __doc__r   __annotations__r   r   r   r   r   r   rH   r   r1   r/   r(   r(      sg    5M3(!&&#OS7"%%%] #&%%Z!##N&)e)Pr1   r(   c                      \ rS rSr\ " S S5      5       rSS\SSS.         SS jjrSS	 jrSS
 jr	SS jr
SS jrSS jrSrg)rP      c                  <    \ rS rSr% S\S'   SrS\S'   SrS\S'   Srg	)
#_PrimarySpeakerDetector.SpeakerData   rA   
speaker_id        r   last_activity_timermsr   N)rD   rE   rF   rG   r   r   r   rH   r   r1   r/   SpeakerDatar      s    $'E'Ur1   r   TFr   r   c                  X@l         XPl        Xl        X l        U=(       d
    [	        5       U l        U R                  (       a.  U R                  (       d  [        R                  " S5        SU l        SU l        SU l	        0 U l
        SU l        / U l        U R
                  R                  S-  U l        [        U R
                  R                   U R                  -  5      U l        g)a  Primary speaker detector. It detects the primary speaker based on RMS,
formats the primary and background speakers separately, or suppresses the background speaker.

Args:
    detect_primary_speaker (bool, optional): Whether to detect primary speaker. Defaults to True.
    suppress_background_speaker (bool, optional): Whether to suppress background speaker. Defaults to False.
    primary_detection_options (PrimaryDetectionOptions, optional): Primary speaker detection options.
    primary_format (str, optional): Format for primary speaker.
        Supports {text} and {speaker_id} placeholders. Defaults to "{text}".
    background_format (str, optional): Format for background speaker.
        Supports {text} and {speaker_id} placeholders. Defaults to "{text}".
zVSuppressing background speaker is not supported when `detect_primary_speaker` is FalseFr   Ni  )r*   r+   r&   r'   r(   r)   r	   warning_pushed_duration_primary_speaker_speaker_data_bstream_rms_bufferr   _frame_sizer   r   _max_rms_size)r,   r   r   r   r   r   s         r/   r$    _PrimarySpeakerDetector.__init__   s    *  ."35$?!-Q1O1Q	$$T-A-ANNh ).D%'*,0MO04(*9922T9 !>!>AQAQ!QRr1   c                   U R                   (       d   U =R                  UR                  -  sl        g U R                  (       dY  [	        UR
                  U R                  -  5      n[        UR
                  UR                  US9U l        X!R
                  -  U l        U R                  R                  UR                  5       HN  nU R                  U5      nU R                  R                  U5        U =R                  UR                  -  sl        MP     [        U R                  5      U R                  :  a   U R                  U R                  * S  U l        g g )N)sample_ratenum_channelssamples_per_channel)r&   r   durationr   r   r   r   r   r   pushdata_compute_rmsr   appendlenr   )r,   r`   sample_per_channelfr   s        r/   rY   "_PrimarySpeakerDetector.push_audio   s   ##!!U^^3!}}!$U%6%69I9I%I!J+!--"//$6DM
  24E4EED##EJJ/A##A&C##C(!!QZZ/! 0
 t 4#5#55#//1C1C0C0EFD 6r1   c                2   UR                   (       d  U$ UR                   S   nUR                  [        R                  :X  a  U R	                  U5        UR
                  b  U R                  c  U$ UR
                  U R                  :H  Ul        UR                  (       a5  U R                  R                  UR                  UR
                  S9Ul
        U$ U R                  (       a  g U R                  R                  UR                  UR
                  S9Ul
        U$ )Nr   )re   r   )rg   rf   r   rk   _update_primary_speakerr   r   is_primary_speakerr*   formatre   r'   r+   )r,   rl   sds      r/   rh   $_PrimarySpeakerDetector.on_stt_event   s    I__Q77o666((,== D$9$9$AI "1F1F F   **11rww2==1YBG 		 ((--44"''bmm4\BG	r1   c                &   [         R                  " UR                  [         R                  S9n[	        U5      S:X  a  g[         R
                  " [         R                  " UR                  [         R                  5      S-  5      5      n[        U5      $ )N)dtyper   r   r   )
np
frombufferr   int16r   sqrtmeanastypefloat32r   )r,   r`   
audio_datar   s       r/   r   $_PrimarySpeakerDetector._compute_rms  s^    ]]5::RXX>
z?aggbggj//

;q@ABSzr1   c                   U R                   (       d  g [        U R                  U-
  U R                  -  5      n[        U R                  U-
  U R                  -  5      n[	        U R                   5      U-
  S-
  n[	        U R                   5      U-
  nUS:  d  U[	        U R                   5      :  a  g [        US5      nXC-
  U R                  R                  :  a  g [        [        R                  " U R                   X4 5      5      $ )Nr   r   )r   r   r   r   r   maxr)   r   r   r   median)r,   
start_timeend_timestartends        r/   _get_rms_for_timerange._PrimarySpeakerDetector._get_rms_for_timerange  s    T**Z74;K;KKL4((83t7G7GGHD$$%-1$""#c)7es4#3#344E1;222RYYt//:;<<r1   c                   UR                   b  U R                  (       d  S U l        g U R                  UR                  UR
                  5      nUc  g UR                   nU R                  R                  U5      =n(       aW  UR
                  Ul        UR                  U R                  R                  -  USU R                  R                  -
  -  -   Ul	        O,[        R                  UUR
                  US9U R                  U'   U R                  U:X  a  g U R                  b(  U R                  R                  U R                  5      =nc  X0l        [        R                  " SX2S.S9  g U R                   UR                  -
  nU R                  R"                  S:  a1  U R                  R"                  S-
  U R                  R$                  -  nOSn['        U R                  R"                  Xv-  -
  U R                  R(                  5      nUR                  U-  n	UUU	UUS.n
X):  a  X0l        [        R                  " S	U
S9  g [        R                  " S
U
S9  g )Nr   )r   r   r   zset first primary speaker)r   r   )extrag      ?r   )r   r   rms_thresholdsilence_duration
multiplierzprimary speaker switchedzprimary speaker unchanged)r   r&   r   r   r   r   r   getr   r   r)   r   rP   r   r	   debugr   r   r   r   r   )r,   r   r   r   r   primaryr   
decay_rater   r   r   s              r/   r   /_PrimarySpeakerDetector._update_primary_speaker  s   == (<(<$(D!))"--E; ]]
%%))*5545&(kkD#xx$))"@"@@3DII222D DH .E-P-P%#%;; .Q .Dz*   J. !!)--11$2G2GHHQ$.!LL4:<bc0073M3MM 99))C/))883>$))B_B__JJII**j.KLII..

  j0$* 0$
 $.!LL35ALL4EBr1   )r+   r   r&   r   r   r)   r*   r   r   r   r   r'   N)
r   r?   r   r?   r   r@   r   rA   r   rA   )r`   rtc.AudioFramerC   rq   )rl   r   rC   zSpeechEvent | None)r`   r   rC   r   )r   r   r   r   rC   zfloat | None)r   r   rC   rq   )rD   rE   rF   rG   r   r   r   r$   rY   rh   r   r   r   rH   r   r1   r/   rP   rP      s       (,,1PY&!)(S !%(S &*	(S
 $N(S (S (STG,.=$>Cr1   rP   )!
__future__r   rs   r\   dataclassesr   numpyr   livekitr   rd   r   r3   r   logr	   typesr
   r   r   r   utils.audior   r-   r   r   r   r   r   r   r<   r(   rP   r   r1   r/   <module>r      ss    "   !    #  Y Y ) O O9
# 9
x=" ="@ Q Q Q*vC vCr1   