
    ripO              	          d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z#  edd      \  Z$Z% ee$e%d      \  Z$Z% e       jM                  e$      Z$g dZ'dh ejP                         D  ch c]
  \  } }|d    c}} z  Z)dRdZ*ejV                  jY                  de      d        Z-d Z.ejV                  jY                  dg e#e"      d        Z/d Z0ejV                  jY                  de'      ejV                  jY                  d e      d!               Z1d" Z2ejV                  jY                  d#d$      d%        Z3d& Z4d' Z5d( Z6d) Z7ejV                  jY                  d*d+d,g      d-        Z8ejV                  jY                  d.e#      d/        Z9ejV                  jY                  d0e'      d1        Z:d2 Z;d3 Z<ejV                  jY                  d4d d5i ejz                  d6ej|                  gej|                  d6gg      fd d5id6d7gd7d6ggfi d6d7gd8d9ggfg      d:        Z?ejV                  jY                  d.e#      d;        Z@ejV                  jY                  d.e#      d<        ZAd= ZBd> ZCd? ZDejV                  jY                  d@dAdBg      ejV                  jY                  dCddDg      dE               ZEdF ZFejV                  jY                  dGdHdIg      dJ        ZGejV                  jY                  dKdLdMg      dN        ZHejV                  jY                  dOd+d,g      dP        ZIdQ ZJyc c}} w )SzF
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
    N)stats)distance)HDBSCAN)CONDENSED_dtype_condense_tree_do_labelling)_OUTLIER_ENCODING)
make_blobs)fowlkes_mallows_score)_VALID_METRICSeuclidean_distances)BallTreeKDTree)StandardScaler)shuffle)assert_allcloseassert_array_equal)CSC_CONTAINERSCSR_CONTAINERS   
   )	n_samplesrandom_state   )r   )kd_tree	ball_treebruteautolabelc                 r    t        t        |       t        z
        }|dk(  sJ t        | t              |kD  sJ y )N   )lensetOUTLIER_SETr   y)labels	threshold
n_clusterss      `/var/www/auto_recruiter/arenv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hdbscan.pycheck_label_qualityr+   )   s6    S[;./J?? +i777    outlier_typec                    t         j                  t         j                  d|    }d d d|    }t        |    d   }t        |    d   }t        j                         }|dg|d<   ||g|d<   t        d	
      j                  |      }|j                  |k(  j                         \  }t        |ddg        ||j                  |      j                         \  }t        |ddg       t        t        dd            t        t        dd            z   }	t        d	
      j                  ||	         }
t        |
j                  |j                  |	          y)O
    Tests if np.inf and np.nan data are each treated as special outliers.
    )infinitemissingc                     | |k(  S N xr&   s     r*   <lambda>z#test_outlier_data.<locals>.<lambda>9   s
    a r,   c                 ,    t        j                  |       S r3   )npisnanr5   s     r*   r7   z#test_outlier_data.<locals>.<lambda>:   s     r,   r    prob   r      Fcopy   r   N)r9   infnanr	   Xr?   r   fitlabels_nonzeror   probabilities_listrange)r-   outlier
prob_checkr    r;   	X_outliermodelmissing_labels_idxmissing_probs_idxclean_indicesclean_models              r*   test_outlier_datarR   /   sB    FF66 G
 (+ J l+G4E\*62DIQ<IaLW%IaL##I.E"]]e3<<>)Aq62&u';';TBKKM(1a&1q!%U1c](;;Mu%)))M*BCK{**EMM-,HIr,   c                     t        t              } | j                         }t        dd      j	                  |       }t        | |       t        |       d}t        j                  t        |      5  t        dd      j	                  t               ddd       d}d| d	<   d
| d<   t        j                  t        |      5  t        dd      j	                  |        ddd       y# 1 sw Y   VxY w# 1 sw Y   yxY w)zy
    Tests that HDBSCAN works with precomputed distance matrices, and throws the
    appropriate errors when needed.
    precomputedTmetricr?   z*The precomputed distance matrix.*has shapematchNz'The precomputed distance matrix.*valuesr   )r   r<   r<   )r<   r   F)
r   rC   r?   r   fit_predictr   r+   pytestraises
ValueError)D
D_originalr'   msgs       r*   test_hdbscan_distance_matrixr`   O   s    
 	AAJM5AA!DFAz"
7C	z	- @}40<<Q?@ 5CAdGAdG	z	- A}51==a@A A@ @A As   0!C& C2&C/2C;sparse_constructorc                 b   t        j                  t        j                  t                    }|t	        j
                  |      z  }t        j                  |j                         d      }d|||k\  <    | |      }|j                          t        dd      j                  |      }t        |       y)zA
    Tests that HDBSCAN works with sparse distance matrices.
    2           rT   FrU   N)r   
squareformpdistrC   r9   maxr   scoreatpercentileflatteneliminate_zerosr   rY   r+   )ra   r]   r(   r'   s       r*   #test_hdbscan_sparse_distance_matrixrk   g   s    
 	HNN1-.ANA''		R8IAa9n1AM6BB1EFr,   c                  X    t        d      j                  t              } t        |        y)z
    Tests that HDBSCAN works with feature array, including an arbitrary
    goodness of fit check. Note that the check is a simple heuristic.
    Fr>   N)r   rY   rC   r+   r'   s    r*   test_hdbscan_feature_arrayrn   y   s#    
 % ,,Q/F r,   algorV   c                    t        | d      j                  t              }t        |       | dv ryt        t
        d}dt        j                  t        j                  d         idt        j                  t        j                  d         idd	id	t        j                  t        j                  d         d
dj                  |d      }t        | ||d      }|||    j                  vr8t        j                  t              5  |j                  t               ddd       y|dk(  r8t        j                   t"              5  |j                  t               ddd       y|j                  t               y# 1 sw Y   yxY w# 1 sw Y   yxY w)z
    Tests that HDBSCAN works with the expected combinations of algorithms and
    metrics, or raises the expected errors.
    F)	algorithmr?   )r   r   N)r   r   Vr<   p   )rs   w)mahalanobis
seuclidean	minkowski
wminkowski)rq   rV   metric_paramsr?   ry   )r   rY   rC   r+   r   r   r9   eyeshapeonesgetvalid_metricsrZ   r[   r\   rD   warnsFutureWarning)ro   rV   r'   ALGOS_TREESrz   hdbs         r*   test_hdbscan_algorithmsr      sP    t%0<<Q?F    K
 RVVAGGAJ/0BGGAGGAJ/01XBGGAGGAJ$78	
 
c&$  #	C [&444]]:& 	GGAJ	 		<	\\-( 	GGAJ	 	 	
	 		 	s   E5 F5E>F
c                  ~    t        d      j                  t              } | j                  d      }t	        |d       y)z
    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
    This test is more of a sanity check than a rigorous evaluation.
    Fr>   333333?gq=
ףp?)r(   N)r   rD   rC   dbscan_clusteringr+   )	clustererr'   s     r*   test_dbscan_clusteringr      s5    
 U#''*I((-F $/r,   cut_distance)皙?      ?r<   c                    t         d   d   }t         d   d   }t        j                         }t        j                  dg|d<   dt        j
                  g|d<   t        j                  t        j
                  g|d<   t        d	      j                  |      }|j                  | 
      }t        j                  ||k(        }t        |ddg       t        j                  ||k(        }t        |dg       t        t        t        d            t        ||z         z
        }t        d	      j                  ||         }	|	j                  | 
      }
t        |
||          y)r/   r1   r    r0   r<   r   rt   r=   Fr>   )r   r   N)r	   rC   r?   r9   rA   rB   r   rD   r   flatnonzeror   rH   r$   rI   )r   missing_labelinfinite_labelrL   rM   r'   rN   infinite_labels_idx	clean_idxrQ   clean_labelss              r*   #test_dbscan_clustering_outlier_datar      s0   
 &i09M&z27;NIFFA;IaLrvv;IaLFFBFF#IaL##I.E$$,$?F-(?@)Aq62..>)AB*QC0Ss_s+=@S+S'TTUIu%)))I*>?K00l0KL|VI%67r,   c                      t        ddt        j                  t        j                  d         id      j                  t              } t        |        y)z4
    Tests that HDBSCAN using `BallTree` works.
    rw   rr   r<   F)rV   rz   r?   N)r   r9   r}   rC   r|   rY   r+   rm   s    r*   !test_hdbscan_best_balltree_metricr      sA     C1D+EEk!n  r,   c                      t        t        t              dz
  d      j                  t              } t	        |       j                  t              sJ y)z
    Tests that HDBSCAN correctly does not generate a valid cluster when the
    `min_cluster_size` is too large for the data.
    r<   Fmin_cluster_sizer?   N)r   r#   rC   rY   r$   issubsetr%   rm   s    r*   test_hdbscan_no_clustersr      s;    
 c!fqju=II!LFv;,,,r,   c                  .   t        dt        t              d      D ]s  } t        | d      j	                  t              }|D cg c]
  }|dk7  s	| }}t        |      dk7  sGt        j                  t        j                  |            | k\  rsJ  yc c}w )zb
    Test that the smallest non-noise cluster has at least `min_cluster_size`
    many points
    rt   r<   Fr   r   r   N)rI   r#   rC   r   rY   r9   minbincount)r   r'   r    true_labelss       r*   test_hdbscan_min_cluster_sizer      s    
 "!SVQ/ H*:GSSTUV*0@ERKu@@{q 66"++k237GGGG	H@s   
BBc                  z    t         j                  } t        | d      j                  t              }t        |       y)zA
    Tests that HDBSCAN works when passed a callable metric.
    FrU   N)r   	euclideanr   rY   rC   r+   )rV   r'   s     r*   test_hdbscan_callable_metricr      s.     FF/;;A>Fr,   treer   r   c                     t        d| d      }d}t        j                  t        |      5  |j	                  t
               ddd       y# 1 sw Y   yxY w)z
    Tests that HDBSCAN correctly raises an error when passing precomputed data
    while requesting a tree-based algorithm.
    rT   FrV   rq   r?   z%precomputed is not a valid metric forrW   N)r   rZ   r[   r\   rD   rC   )r   r   r_   s      r*   "test_hdbscan_precomputed_non_bruter      sE     $U
CC
1C	z	- 
  s   AAcsr_containerc                 B   t        d      j                  t              j                  }t	        |        | t              }|j                         }t        d      j                  |      j                  }t        ||       t        j                  dft        j                  dffD ]  \  }}t        j                         }||d<   t        d      j                  |      j                  }t	        |       |d   t        |   d   k(  sJ |j                         }||d<   t        d      j                  |      j                  }t        ||        d}t        j                  t        |	      5  t        d
dd      j                  |       ddd       y# 1 sw Y   yxY w)z
    Tests that HDBSCAN works correctly when passing sparse feature data.
    Evaluates correctness by comparing against the same data passed as a dense
    array.
    Fr>   r0   r1   r   r   r   r    z4Sparse data matrices only support algorithm `brute`.rW   r   r   r   N)r   rD   rC   rE   r+   r?   r   r9   rA   rB   r	   rZ   r[   r\   )	r   dense_labels	_X_sparseX_sparsesparse_labelsoutlier_valr-   X_denser_   s	            r*   test_hdbscan_sparser     sc    &**1-55L%a I~~H'++H5==M|]3 (*vvz&:RVVY<O%P 
8!\&&(#E*..w7??L)A"3L"A'"JJJJ>>#$U+//9AA<7
8 AC	z	- U{kFJJ8TU U Us   .FFrq   c                    ddg}t        dd|d      \  }}t        dd	      j                  |      }t        ||j                  |j
                        D ]$  \  }}}t        ||d
d       t        ||d
d       & t        | dt        j                  d   d      j                  t              }|j                  j                  d   dk(  sJ |j
                  j                  d   dk(  sJ y)zj
    Tests that HDBSCAN centers are calculated and stored properly, and are
    accurate to the data.
    )rd   rd   )      @r   i  r   r   )r   r   centerscluster_stdbothF)store_centersr?   r<   g?)rtolatol)rq   r   r   r?   N)	r
   r   rD   zip
centroids_medoids_r   rC   r|   )rq   r   H_r   centercentroidmedoids           r*   test_hdbscan_centersr   .  s     :&G1gSVWDAq
U
3
7
7
:C$'$N ; &qt<QT:;
 	
 
c!f  >>"a'''<<a A%%%r,   c                     t         j                  j                  d      } | j                  dd      }t	        ddddd	      j                  |      }t        j                  |d
      \  }}t        |      dk(  sJ ||dk(     dkD  sJ t	        dddddd      j                  |      }t        j                  |d
      \  }}t        |      dk(  sJ ||dk(     dk(  sJ y)zS
    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
    r      rt   r=   rd   eomTF)r   cluster_selection_epsiloncluster_selection_methodallow_single_clusterr?   )return_countsr      g
ףp=
?r   )r   r   r   r   rq   r?   N)r9   randomRandomStaterandr   rY   uniquer#   )rngno_structurer'   unique_labelscountss        r*   .test_hdbscan_allow_single_cluster_with_epsilonr   G  s    ))


"C88C#L"%!&! k,  IIfDAM6}""" -2%&+++ "&!&! k,  IIfDAM6}"""-2%&!+++r,   c                     ddgddgddgddgg} t        d| g dd      \  }}t        d	
      j                  |      j                  }t	        t        |            t        d|v       z
  }|dk(  sJ t        ||      dkD   y)z
    Validate that HDBSCAN can properly cluster this difficult synthetic
    dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
    example)
    g333333g333333?r"   i  )皙?gffffff?皙?r   r   )r   r   r   r   Fr>   r      Gz?N)r
   r   rD   rE   r#   r$   intr   )r   rC   r&   r'   r)   s        r*   test_hdbscan_better_than_dbscanr   j  s     u~t}q!fq"g>G+	DAq % $$Q'//FS[!Cf$55J??&!$t+r,   z	kwargs, XrT   r<   rt   r"   r   c                 >    t        dddd|j                  |        y)zo
    Tests that HDBSCAN works correctly for array-likes and precomputed inputs
    with non-finite points.
    r<   Fmin_samplesr?   Nr4   )r   rD   )rC   kwargss     r*   test_hdbscan_usable_inputsr   ~  s!     00044Q7r,   c                      | t        j                  d            }d}t        j                  t        |      5  t        dd      j                  |       ddd       y# 1 sw Y   yxY w)zd
    Tests that HDBSCAN raises the correct error when there are too few
    non-zero distances.
    )r   r   z#There exists points with fewer thanrW   rT   FrU   N)r9   zerosrZ   r[   r\   r   rD   r   rC   r_   s      r*   -test_hdbscan_sparse_distances_too_few_nonzeror     sT     	bhhx()A
/C	z	- 9}5155a89 9 9s   AA(c                 $   t        j                  d      }d|ddddf<   d|ddddf<   ||j                  z   } | |      }d}t        j                  t
        |      5  t        dd	
      j                  |       ddd       y# 1 sw Y   yxY w)zu
    Tests that HDBSCAN raises the correct error when the distance matrix
    has multiple connected components.
    )   r   r<   Nr=      z3HDBSCAN cannot be performed on a disconnected graphrW   rT   FrU   )r9   r   TrZ   r[   r\   r   rD   r   s      r*   0test_hdbscan_sparse_distances_disconnected_graphr     s     	AAbqb"1"fIAab"#gJ	ACCAaA
?C	z	- 9}5155a89 9 9s    BBc                     d } d}t        j                  t        |      5  t        d| d      j	                  t
               ddd       t        j                  t        |      5  t        d| d      j	                  t
               ddd       t        t        t        j                        t        t        j                        z
        }t        |      d	kD  rIt        j                  t        |      5  t        d|d	   d      j	                  t
               ddd       yy# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   yxY w)
zR
    Tests that HDBSCAN correctly raises an error for invalid metric choices.
    c                     | S r3   r4   )r6   s    r*   r7   z2test_hdbscan_tree_invalid_metric.<locals>.<lambda>  s     r,   zV.* is not a valid metric for a .*-based algorithm\. Please select a different metric\.rW   r   F)rq   rV   r?   Nr   r   )rZ   r[   r\   r   rD   rC   rH   r$   r   r   r   r#   )metric_callabler_   metrics_not_kds      r*    test_hdbscan_tree_invalid_metricr     s    "O	  
z	- P)O%HLLQOP	z	- R+oEJNNqQR
 #h445F<P<P8QQRN
>Q]]:S1 	Viq0ANRRSTU	V 	V P PR R	V 	Vs#   "D$&"D05%D<$D-0D9<Ec                      t        t        t              dz   d      } d}t        j                  t
        |      5  | j                  t               ddd       y# 1 sw Y   yxY w)zx
    Tests that HDBSCAN correctly raises an error when setting `min_samples`
    larger than the number of samples.
    r<   Fr   z min_samples (.*) must be at mostrW   N)r   r#   rC   rZ   r[   r\   rD   )r   r_   s     r*   !test_hdbscan_too_many_min_samplesr     sK    
 c!fqju
5C
-C	z	- 
  s   AA#c                      t         j                         } t        j                  | d<   d}t	        dd      }t        j                  t        |      5  |j                  |        ddd       y# 1 sw Y   yxY w)zu
    Tests that HDBSCAN correctly raises an error when providing precomputed
    distances with `np.nan` values.
    r   z(np.nan values found in precomputed-denserT   FrU   rW   N)	rC   r?   r9   rB   r   rZ   r[   r\   rD   )X_nanr_   r   s      r*   "test_hdbscan_precomputed_dense_nanr     s[    
 FFHE&&E$K
4C
U
3C	z	-   s   A--A6r   TFepsilonr   c                 T   d}t        || ddgddgddgg      \  }}t        d      j                  |      }t        |j                  |j
                        }|dz   |d	z   |d
z   h}|dz   d|d	z   d|d
z   di}	t        |||	||      }
t        t        |            D ci c]!  }|t        j                  ||k(        d   d   # }}t        t        |            D ci c]  }||
||       }} t        j                  |j                        |      }t        |
|       yc c}w c c}w )zR
    Tests that the `_do_labelling` helper function correctly assigns labels.
    0   r   r   )r   r   Fr>   r   rt   r"   r   r<   condensed_treeclusterscluster_label_mapr   r   N)r
   r   rD   r   _single_linkage_tree_r   r   rH   r$   r9   where	vectorizer~   r   )global_random_seedr   r   r   rC   r&   estr   r   r   r'   _yfirst_with_labely_to_labelsaligned_targets                  r*   test_labelling_distinctr    sO    I' FGG
		DAq u

!
!!
$C#!!C4H4HN Ay1}i!m<H"Q9q=!Y]AN%+1")F ?C3q6lKBHHQ"W-a033KK>B3q6lK2v.r233KKK2R\\+//215Nv~. LKs   &D D%c                  L   d} d}t        j                  dd|dfddd|dfddgt        	      }t        || h| d| dz   did
d      }|d   dk  }t	        |      t	        |dk(        k(  sJ t        || h| d| dz   did
d      }|d   |k  }t	        |      t	        |dk(        k(  sJ y)z
    Tests that the `_do_labelling` helper function correctly thresholds the
    incoming lambda values given various `cluster_selection_epsilon` values.
    r=   g      ?rt   r<   )r=   r<   r   r<   r   )r=   r"   r   r<   )r=   r   r   r<   )dtypeTr   valuer   N)r9   arrayr   r   sum)r   
MAX_LAMBDAr   r'   	num_noises        r*   test_labelling_thresholdingr    s    
 IJXX:q!:q!	
 	N %$aQ:!"#F w'!+Iy>S2....%$aQ:!"#F w'*4Iy>S2....r,   r   r   r   c                    t         j                  j                  d      }|j                  d      }t        |      }d}t	        j
                  t        |      5  t        d| d      j                  |       ddd       y# 1 sw Y   yxY w)	zCheck that we raise an error if the centers are requested together with
    a precomputed input matrix.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27893
    r   d   rt   z>Cannot store centers when using a precomputed distance matrix.rW   rT   F)rV   r   r?   N)	r9   r   r   r   rZ   r[   r\   r   rD   )r   r   rC   X_disterr_msgs        r*   0test_hdbscan_error_precomputed_and_store_centersr  +  st     ))


"C

8A #FNG	z	1  '	
 #f+  s   B  B	
valid_algor   r   c                 F    t        d| d      j                  t               y)zTest that HDBSCAN works with the "cosine" metric when the algorithm is set
    to "brute" or "auto".

    Non-regression test for issue #28631
    cosineFr   N)r   rY   rC   )r  s    r*   *test_hdbscan_cosine_metric_valid_algorithmr  ?  s     8z>JJ1Mr,   invalid_algoc                     t        d| d      }t        j                  t        d      5  |j	                  t
               ddd       y# 1 sw Y   yxY w)zTest that HDBSCAN raises an informative error is raised when an unsupported
    algorithm is used with the "cosine" metric.
    r  Fr   zcosine is not a valid metricrW   N)r   rZ   r[   r\   rY   rC   )r  hdbscans     r*   ,test_hdbscan_cosine_metric_invalid_algorithmr  I  sD    
 XEJG	z)G	H A  s   A		Ac                      t         j                  j                  d      j                  d      } d}t        j                  t
        |      5  t        d      }|j                  |        ddd       y# 1 sw Y   yxY w)z\
    Test that HDBSCAN raises a FutureWarning when the `copy`
    parameter is not set.
    r   r  zCThe default value of `copy` will change from False to True in 1.10.rW   r   r   N)r9   r   r   rZ   r   r   r   rD   )rC   r_   r   s      r*   !test_hdbscan_default_copy_warningr  T  s`    
 			a ''1A
PC	m3	/ r*
  s   A33A<)r   )K__doc__numpyr9   rZ   scipyr   scipy.spatialr   sklearn.clusterr   sklearn.cluster._hdbscan._treer   r   r    sklearn.cluster._hdbscan.hdbscanr	   sklearn.datasetsr
   sklearn.metricsr   sklearn.metrics.pairwiser   r   sklearn.neighborsr   r   sklearn.preprocessingr   sklearn.utilsr   sklearn.utils._testingr   r   sklearn.utils.fixesr   r   rC   r&   fit_transform
ALGORITHMSitemsr%   r+   markparametrizerR   r`   rk   rn   r   r   r   r   r   r   r   r   r   r   r   r   r	  rA   r   r   r   r   r   r   r  r  r  r  r  r  )r   outs   00r*   <module>r3     s  
    " # 
 ? ' 1 H . 0 ! F >Cb11q!!$1""1%
 d1H1B1H1H1JKvq#c'lKK8 ):;J <J>A0 -/Q/Q./QR  S "	  ,>2% 3 -%P
0 78 884 -	H  )[!9: ; .9U :UD j1& 2&0 ,F,( 
M	"HBHHq"&&kBFFA;-G$HI
M	"aVaV$45	q!fq!f88 .9	9 :	9 .99 :9 V0
 /$?QH-!/ . @!/H&/R :x*@A B& '89N :N )[)AB C	] Ls   0M!