U
    /j                     @  s  U d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZ edZe  Z!e!"e#d g Z$de%d< g Z&de%d< eD ]HZ'z"ee're$(e' n
e&(e' W q e)k
r"   e&(e' Y qX qe$e& Z*de%d< d,ddddddddddddddZ+d-d ddddddddddd!d"d#Z,d.d$ddddddddddd%d&d'Z-d/d(ddddddddddd)d*d+Z.dS )0    )annotationsN)PathLike)BinaryIO   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDIANA_SUPPORTED_SIMILARTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encodingcut_sequence_chunks	iana_nameidentify_sig_or_bomis_multi_byte_encodingshould_strip_sig_or_bomZcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)sz	list[str]_mb_supported_sb_supportedIANA_SUPPORTED_MB_FIRST      皙?TF皙?zbytes | bytearrayintfloatzlist[str] | Noneboolr   )	sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainlanguage_thresholdenable_fallbackreturnc
           @      C  s  t | ttfs tdt| |r>tj}
tt	 t
t t| }|dkrtd |rptt	 t
|
 tt| dddg dgS |dk	rttd	d
| dd |D }ng }|dk	rttdd
| dd |D }ng }||| krttd||| d}|}|dkr4|| |k r4t|| }t| tk }t| tk}|rfttd| n|r~ttd| g }|rt| nd}|dk	r|| ttd| t }g }g }t }t }i }d}t }d}d}d}d}d}d}t }t }t| \} }!| dk	r6||  ttdt|!|  |d d|krT|d |t D 
]}"|rv|"|krvq\|r|"|krq\|"|krq\||" d}#| |"k}$|$ot|"}%|"dkr|$sttd|" q\|"dkr |$s ttd|" q\|"|krttd|" q\|"|kr8ttd|" q\zt|"}&W n. ttfk
rr   ttd|" Y q\Y nX |r|&stt |"}'ntt!|"}'|'"|sttd|"|'| q\|r|&s||krttd|"|| q\|r|&sttd |" q\z|rP|&dkrPt#|%dkr4| dtd! n| t|!td! |"d" nd|"d#kr|$rt#| |"d"}#|#r|#d d$kr|#dd }#n&t#|%dkr| n| t|!d |"d"}#W n\ t$t%fk
r }( z8t |(t%sttd%|"t#|( ||" W Y q\W 5 d}(~(X Y nX t&|$s dnt|!|t|| })|&oP|#dk	oPt|#|k }*|*rfttd&|" |#dk	rP|&sPt'|#}+|(|+},|,dk	rP|,\}-}.}/|/rt| |"|-|$|.|dks|"|ddfkr|#nd|d'}0||0 ||" ttd(|"t)|-d) d*d+ |"|ddfkrj|-d,k rj|-dkr`td-|0j* |rRtt	 t
|
 t|0g  S ||0 t|r\|dks||kr\d|kr\d|kr\|+ }1td-|1j* |rtt	 t
|
 t|1g  S q\np||" ttd.|" |	r\|"dd|d/d0fkr\t| |"||$g |#|d'}2|"|kr8|2}n|"dkrH|2}n|2}q\tt|)d1 }3t,|3d2}3d}4d}5g }6g }7zt-| |"|)||$|%|!|&|#	D ]|}8|6|8 |7t.|8||d3kodt|  kod2kn   |7d4 |kr|4d7 }4|4|3ks|$r|%dkr qqW n@ t$k
rT }( z ttd5|"t#|( |3}4d3}5W 5 d}(~(X Y nX |5s|r|&sz| td6d j/|"d7d8 W nL t$k
r }( z,ttd9|"t#|( ||" W Y q\W 5 d}(~(X Y nX |7rt0|7t|7 nd}9|9|k	s|4|3k	r||" |"t1k	r$|2t1|"  |#dk		rJ|&	sJ|3t'|#|9g df ttd:|"|4t)|9d) d*d+ |	r\|"dd|d/d0fkr\|5s\t| |"||$g |#|d'}2|"|k	r|2}n|"dk	r|2}n|2}q\ttd;|"t)|9d) d*d+ |&	st |"}:nt!|"}:|:
rttd<|"t#|: g };|"dk
rb|6D ],}8t4|8||:
rDd=|:nd}<|;|< 
q*t5|;}=nt5|;}=|=
rttd>|=|" t| |"|9|$|=|dk
s|"|ddfk
r|#nd|d'}>||> |#dk	
r|&
s|3t'|#|9|=d3f |r|&s|9d?k r|d7 }|"|ddfkrj|9d,k rj|9dkr`td-|>j* |rRtt	 t
|
 t|>g  S ||> t|r|dks||krd|krd|kr|+ }1td-|1j* |rtt	 t
|
 t|1g  S |sD|&sD|=rt,d@dA |=D ddBnd}?|?dCkrDd|krDd|krDd3}|2|: ttdD|"|9|? |s|&r|*r|#dk	rt|#|dE k r|"dFkrd|krd|krd3}ttdG|"|9t|#|t|#| d)  |"| kr\tdH|" |rtt	 t
|
 t||" g  S q\t|dkr|s |s |r,ttdI |rLtdJ|j* || nd|r\|dks|rv|rv|j6|j6ks|dk	rtdK || n|rtdL || |rtdM|+ j*t|d  n
tdN |rtt	 t
|
 |S )Oaf  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z3Expected object of type bytes or bytearray, got: {}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 S  s   g | ]}t |d qS Fr   .0cp r4   :/tmp/pip-unpacked-wheel-s9saxz3h/charset_normalizer/api.py
<listcomp>s   s     zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 S  s   g | ]}t |d qS r/   r0   r1   r4   r4   r5   r6   ~   s     z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.   zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_32utf_16z\Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.>   utf_7zREncoding %s won't be tested as-is because detection is unreliable without BOM/SIG.zY%s is deemed too similar to a code page that was already considered unsuited. Continuing!zESkipping %s: already fast-tracked from a similar successful encoding.z2Encoding %s does not provide an IncrementalDecoderzbSkipping %s: definitive match already found, this encoding targets different languages (%s vs %s).zXSkipping %s: already accumulated %d same-family results after definitive match (cap=%d).zCSkipping single-byte %s: multi-byte definitive match already found.g    A)encodingr;   u   ﻿z9Code page %s does not fit given bytes sequence at ALL. %szpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.)Zpreemptive_declarationzM%s fast-tracked (identical decoded payload to a prior encoding, chaos=%f %%).d      )ndigitsr   z.Encoding detection: %s is most likely the one.zZ%s fast-skipped (identical decoded payload to a prior encoding that failed chaos probing).r:   r9         TzaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sg     j@strict)errorsz^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.z=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {},z We detected language {} using {}g{Gz?c                 s  s   | ]\}}|V  qd S )Nr4   )r2   _vr4   r4   r5   	<genexpr>  s     zfrom_bytes.<locals>.<genexpr>)defaultg      ?zyDefinitive match found: %s (chaos=%.3f, coherence=%.2f). Encodings targeting different language families will be skipped.g\(\?>	   	utf_8_sig	utf_16_ler9   	utf_16_be	utf_32_ler-   r:   r;   	utf_32_bezjMulti-byte definitive match: %s (chaos=%.3f, decoded=%d/%d=%.1f%%). Single-byte encodings will be skipped.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)7
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerr   r   logjoinr   r   r   r   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrorr   r   intersectionstrUnicodeDecodeErrorLookupErrorrangehashgetroundr<   bestmaxr   r   decodesumr   update
setdefaultr   r	   fingerprint)@r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   Zprevious_logger_levellengthZis_too_small_sequenceZis_too_large_sequenceZprioritized_encodingsZspecified_encodingZtestedZtested_but_hard_failureZtested_but_soft_failureZsoft_failure_skipZsuccess_fast_trackedZpayload_result_cacheZdefinitive_match_foundZdefinitive_target_languagesZ post_definitive_sb_success_countZPOST_DEFINITIVE_SB_CAPZmb_definitive_match_foundZfallback_asciiZfallback_u8Zfallback_specifiedresultsZearly_stop_resultsZsig_encodingZsig_payloadZencoding_ianaZdecoded_payloadZbom_or_sig_availableZstrip_sig_or_bomZis_multi_byte_decoderZenc_languageseZr_Zmulti_byte_bonusZpayload_hashcachedZcached_messZ	cached_cdZcached_passedZ
fast_matchZprobable_resultZfallback_entryZmax_chunk_gave_upZearly_stop_countZlazy_str_hard_failureZ	md_chunksZ	md_ratioschunkZmean_mess_ratioZtarget_languagesZ	cd_ratiosZchunk_languagesZcd_ratios_mergedZcurrent_matchZbest_coherencer4   r4   r5   
from_bytes9   s   






	

























	



&



 
	


 

 






	





	






rx   r   )fpr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   c
           
      C  s   t |  |||||||||	
S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )rx   read)
ry   r#   r$   r%   r&   r'   r(   r)   r*   r+   r4   r4   r5   from_fp`  s    r{   zstr | bytes | PathLike)pathr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   c
                 C  s<   t | d(}
t|
|||||||||	
W  5 Q R  S Q R X dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr{   )r|   r#   r$   r%   r&   r'   r(   r)   r*   r+   ry   r4   r4   r5   	from_path~  s    r   z!PathLike | str | BinaryIO | bytes)fp_or_path_or_payloadr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   c
                 C  sz   t | ttfr,t| |||||||||	d
}
nHt | ttfrXt| |||||||||	d
}
nt| |||||||||	d
}
|
 S )a)  
    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
    )	r#   r$   r%   r&   r'   r(   r)   r*   r+   )rO   re   r   r   rQ   rP   rx   r{   )r   r#   r$   r%   r&   r'   r(   r)   r*   r+   Zguessesr4   r4   r5   	is_binary  sX    r   )	r   r   r   NNTFr   T)	r   r   r   NNTFr   T)	r   r   r   NNTFr   T)	r   r   r   NNTFr   F)/
__future__r   loggingosr   typingr   Zcdr   r   r   r	   Zconstantr
   r   r   r   r   Zmdr   modelsr   r   utilsr   r   r   r   r   r   	getLoggerrU   StreamHandlerrX   setFormatter	Formatterr   __annotations__r   Z_supported_encr_   rc   r   rx   r{   r   r   r4   r4   r4   r5   <module>   s     	
	         $      /         $          $!         