U
    /j/                     @  s  U d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d
dlmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ eeddddddZ%eeddddddZ&eeddddddZ'e(dd e) D Z*de+d< dd e*D Z,de+d< eeddddd d!Z-eeddddd"d#Z.eeddddd$d%Z/eeddddd&d'Z0eeddddd(d)Z1eeddddd*d+Z2eeddddd,d-Z3eeddddd.d/Z4eeddddd0d1Z5eeddddd2d3Z6eeddddd4d5Z7eeddddd6d7Z8eeddddd8d9Z9eeddddd:d;Z:eeddddd<d=Z;ee<edddd>d?d@Z=eedddddAdBZ>didDdddEdFdGZ?edHddddIdJdKZ@dDdLdMdNdOZAdddPdQdRZBdjddddTdUdVZCdddWdXdYdZZDddddXd[d\ZEd]ejFd^fdddd_d`dadbZGdkdDddcddddddddedf
dgdhZHdS )l    )annotationsN)bisect_right)IncrementalDecoder)aliases)	lru_cache)findall)	Generator)MultibyteIncrementalDecoder   )ENCODING_MARKSIANA_SUPPORTED_SIMILARRE_POSSIBLE_ENCODING_INDICATIONUNICODE_RANGES_COMBINEDUNICODE_SECONDARY_RANGE_KEYWORDUTF8_MAXIMAL_ALLOCATIONCOMMON_CJK_CHARACTERS_LATIN_CJK_HANGUL	_KATAKANA	_HIRAGANA_THAI_ARABIC_ARABIC_ISOLATED_FORM_ACCENT_KEYWORDS_ACCENTUATED)maxsizestrint)	characterreturnc                 C  s   zt | }W n tk
r$   Y dS X d}d|kr:|tO }d|krJ|tO }d|krZ|tO }d|krj|tO }d|krz|tO }d|kr|tO }d|kr|t	O }d	|kr|t
O }tD ]}||kr|tO } qq|S )
zRCompute all name-based classification flags with a single unicodedata.name() call.r   ZLATINCJKZHANGULZKATAKANAZHIRAGANAZTHAIZARABICzISOLATED FORM)unicodedataname
ValueErrorr   r   r   r   r   r   r   r   r   r   )r   descflagskw r(   </tmp/pip-unpacked-wheel-s9saxz3h/charset_normalizer/utils.py_character_flags&   s4    r*   boolc                 C  s   t t| t@ S N)r+   r*   r   r   r(   r(   r)   is_accentuatedI   s    r.   c                 C  s.   t | }|s| S |d}tt|d dS )N r      )r"   decompositionsplitchrr   )r   Z
decomposedcodesr(   r(   r)   remove_accentN   s
    

r5   c                 c  s    | ]\}}|j |j|fV  qd S r,   )startstop).0r#   Z	ord_ranger(   r(   r)   	<genexpr>[   s   r9   zlist[tuple[int, int, str]]_UNICODE_RANGES_SORTEDc                 C  s   g | ]}|d  qS )r   r(   )r8   er(   r(   r)   
<listcomp>_   s     r<   z	list[int]_UNICODE_RANGE_STARTSz
str | Nonec                 C  s<   t | }tt|d }|dkr8t| \}}}||k r8|S dS )zK
    Retrieve the Unicode range official name from a single character.
    r
   r   N)ordr   r=   r:   )r   Zcharacter_ordidxr6   r7   r#   r(   r(   r)   unicode_rangeb   s    r@   c                 C  s   t t| t@ S r,   )r+   r*   r   r-   r(   r(   r)   is_latins   s    rA   c                 C  s2   t | }d|krdS t| }|d kr*dS d|kS )NPTFZPunctuationr"   categoryr@   r   character_categorycharacter_ranger(   r(   r)   is_punctuationx   s    
rH   c                 C  sB   t | }d|ksd|krdS t| }|d kr2dS d|ko@|dkS )NSNTFZFormsZLorC   rE   r(   r(   r)   	is_symbol   s    
rK   c                 C  s$   t | }|d krdS d|kp"d|kS )NFZ	EmoticonsZPictographs)r@   )r   rG   r(   r(   r)   is_emoticon   s    rL   c                 C  s.   |   s| dkrdS t| }d|kp,|dkS )N>   +   ｜<>TZ>   PoPdPc)isspacer"   rD   )r   rF   r(   r(   r)   is_separator   s    
rV   c                 C  s   |   |  kS r,   )islowerisupperr-   r(   r(   r)   is_case_variable   s    rY   c                 C  s   t t| t@ S r,   )r+   r*   r   r-   r(   r(   r)   is_cjk   s    rZ   c                 C  s   t t| t@ S r,   )r+   r*   r   r-   r(   r(   r)   is_hiragana   s    r[   c                 C  s   t t| t@ S r,   )r+   r*   r   r-   r(   r(   r)   is_katakana   s    r\   c                 C  s   t t| t@ S r,   )r+   r*   r   r-   r(   r(   r)   	is_hangul   s    r]   c                 C  s   t t| t@ S r,   )r+   r*   r   r-   r(   r(   r)   is_thai   s    r^   c                 C  s   t t| t@ S r,   )r+   r*   r   r-   r(   r(   r)   	is_arabic   s    r_   c                 C  s   t t| t@ S r,   )r+   r*   r   r-   r(   r(   r)   is_arabic_isolated_form   s    r`   c                 C  s   | t kS r,   )r   r-   r(   r(   r)   is_cjk_uncommon   s    ra   )
range_namer    c                   s   t  fddtD S )Nc                 3  s   | ]}| kV  qd S r,   r(   )r8   keywordrb   r(   r)   r9      s     z-is_unicode_range_secondary.<locals>.<genexpr>)anyr   rd   r(   rd   r)   is_unicode_range_secondary   s    rf   c                 C  s(   |   dko&|  dko&| dko&| dkS )NFu   ﻿)rU   isprintabler-   r(   r(   r)   is_unprintable   s    
ri       zbytes | bytearray)sequencesearch_zoner    c                 C  s   t | ttfstt| }tt| dt|| jddd}t|dkrLdS |D ]N}|	 
dd}t D ]0\}}||kr|    S ||krl|    S qlqPdS )zW
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    Nasciiignoreerrorsr   -_)
isinstancebytes	bytearray	TypeErrorlenr   r   mindecodelowerreplacer   items)rk   rl   Zseq_lenresultsZspecified_encodingencoding_aliasencoding_ianar(   r(   r)   any_specified_encoding   s"    r      )r#   r    c                 C  s    | dkpt td|  jtS )zQ
    Verify is a specific encoding is a multi byte one based on it IANA name
    >	   	utf_8_sig	utf_16_leutf_32	utf_16_be	utf_32_leutf_8utf_16utf_7	utf_32_be
encodings.)
issubclass	importlibimport_moduler   r	   )r#   r(   r(   r)   is_multi_byte_encoding	  s    
r   ztuple[str | None, bytes])rk   r    c                 C  sJ   t D ]@}t | }t|tr |g}|D ]}| |r$||f    S q$qdS )z9
    Identify and extract SIG/BOM in given sequence.
    )N    )r   rs   rt   
startswith)rk   iana_encodingZmarksmarkr(   r(   r)   identify_sig_or_bom  s    

r   )r   r    c                 C  s   | dkS )N>   r   r   r(   )r   r(   r(   r)   should_strip_sig_or_bom0  s    r   T)cp_namestrictr    c                 C  sN   |   dd} t D ]\}}| ||fkr|  S q|rJtd|  d| S )zIReturns the Python normalized encoding name (Not the IANA official name).rq   rr   zUnable to retrieve IANA for '')rz   r{   r   r|   r$   )r   r   r~   r   r(   r(   r)   	iana_name4  s    
r   float)iana_name_aiana_name_br    c           	      C  s   t | st |rdS td|  j}td| j}|dd}|dd}d}tdD ]*}t|g}||||krX|d7 }qX|d S )Ng        r   rn   ro   r      r
   )r   r   r   r   rangert   ry   )	r   r   Z	decoder_aZ	decoder_bZid_aZid_bZcharacter_match_countiZto_be_decodedr(   r(   r)   cp_similarityE  s    



r   c                 C  s   | t ko|t |  kS )z
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    )r   )r   r   r(   r(   r)   is_cp_similarY  s    
r   Zcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)sNone)r#   levelformat_stringr    c                 C  s:   t | }|| t  }|t | || d S r,   )logging	getLoggersetLevelStreamHandlersetFormatter	Formatter
addHandler)r#   r   r   loggerhandlerr(   r(   r)   set_logging_handlerd  s
    

r   r   rt   zGenerator[str, None, None])
	sequencesr   offsets
chunk_sizebom_or_sig_availablestrip_sig_or_bomsig_payloadis_multi_byte_decoderdecoded_payloadr    c	                 c  s*  |r6|dkr6|D ]"}	||	|	|  }
|
s, q4|
V  qn|D ]}	|	| }|t | d krXq:| |	|	|  }|r||dkr||| }|j||rdndd}
|r|	dkrt|d}|r|
d | |krt|	|	d d	D ]H}| || }|r|dkr|| }|j|dd}
|
d | |kr qq|
V  q:d S )
NF   rn   r   ro   r   r0      )rw   ry   rx   r   )r   r   r   r   r   r   r   r   r   r   chunkZ	chunk_endZcut_sequenceZchunk_partial_size_chkjr(   r(   r)   cut_sequence_chunksq  s>    


r   )rj   )T)N)I
__future__r   r   r   r"   bisectr   codecsr   Zencodings.aliasesr   	functoolsr   rer   typingr   Z_multibytecodecr	   Zconstantr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   r.   r5   sortedr|   r:   __annotations__r=   r@   rA   rH   rK   rL   rV   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rw   rf   ri   r   r   r   r   r   r   r   INFOr   r   r(   r(   r(   r)   <module>   s    L"		 " 