U
    ×Ä/jév  ã                   @  s  U d dl mZ d dlZd dlmZ d dlmZ ejdkrFd dlm	Z	 n.zd dl
m	Z	 W n ek
rr   dd„ Z	Y nX d	d
lmZmZmZmZmZmZmZmZmZmZmZmZmZ d	dlmZmZmZmZmZm Z m!Z! eeB eB eB eB Z"de#d< e	G dd„ dƒƒZ$G dd„ dƒZ%e	G dd„ de%ƒƒZ&e	G dd„ de%ƒƒZ'e	G dd„ de%ƒƒZ(e	G dd„ de%ƒƒZ)e	G dd„ de%ƒƒZ*e	G dd„ de%ƒƒZ+e	G dd„ de%ƒƒZ,e	G d d!„ d!e%ƒƒZ-e	G d"d#„ d#e%ƒƒZ.ed$d%d&d&d'd(œd)d*„ƒZ/ed+d%d3d.d/d'd/d0œd1d2„ƒZ0dS )4é    )ÚannotationsN)Ú	lru_cache)Ú	getLogger)é   é   )Úfinalc                 C  s   | S )N© )Úclsr   r   ú9/tmp/pip-unpacked-wheel-s9saxz3h/charset_normalizer/md.pyr      s    r   é   )ÚCOMMON_CJK_CHARACTERSÚCOMMON_SAFE_ASCII_CHARACTERSÚTRACEÚUNICODE_SECONDARY_RANGE_KEYWORDÚ_ACCENTUATEDÚ_ARABICÚ_ARABIC_ISOLATED_FORMÚ_CJKÚ_HANGULÚ	_HIRAGANAÚ	_KATAKANAÚ_LATINÚ_THAI)Ú_character_flagsÚis_emoticonÚis_punctuationÚis_separatorÚ	is_symbolÚremove_accentÚunicode_rangeÚintÚ_GLYPH_MASKc                   @  s2   e Zd ZdZdZddœdd„Zdddœd	d
„ZdS )ÚCharInfou{  Pre-computed character properties shared across all detectors.

    Instantiated once and reused via :meth:`update` on every character
    in the hot loop so that redundant calls to str methods
    (``isalpha``, ``isupper``, â€¦) and cached utility functions
    (``_character_flags``, ``is_punctuation``, â€¦) are avoided when
    several plugins need the same information.
    ©Ú	characterÚ	printableÚalphaÚupperÚlowerÚspaceÚdigitÚis_asciiÚcase_variableÚflagsÚaccentuatedÚlatinÚis_cjkÚ	is_arabicÚis_glyphÚpunctÚsymÚNone©Úreturnc                 C  sj   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d| _	d| _
d| _d| _d| _d| _d| _d| _d S )NÚ Fr   r#   ©Úselfr   r   r
   Ú__init__N   s"    zCharInfo.__init__Ústr)r$   r7   c                 C  s
  || _ t|ƒ}|dk r(d| _d| _d| _d| _d| _d|  krJdkr”n nFd| _d| _d| _	d| _
d| _d| _d| _t| _d| _d| _d| _qd|  kr¨dkròn nFd| _d| _d| _	d| _
d| _d| _d| _t| _d| _d| _d| _qd|  kr
d	krRn nDd| _d| _d| _	d| _
d| _d| _d| _d
| _d| _d| _d| _nÔ|dksxd|  krtdkrÀn nHd| _d| _d| _	d| _
d| _|dk| _d| _d
| _d| _d| _d| _nf| ¡ | _d| _d| _d| _	d| _
d| _d| _d
| _d| _| jr
t|ƒnd| _| jr t|ƒnd| _nÞd| _| ¡ | _| ¡ | _| ¡ | _| ¡ | _	| ¡ | _
| ¡ | _| j	| jk| _| jrŠt|ƒ}nd
}|| _t|t@ ƒ| _t|t@ ƒ| _t|t@ ƒ| _t|t@ ƒ| _t|t @ ƒ| _| jrêt|ƒnd| _| jr t|ƒnd| _dS )zBUpdate all properties for *character* (called once per character).é€   TFéA   éZ   éa   éz   é0   é9   r   é    é	   é   N)!r$   Úordr+   r.   r0   r1   r2   r&   r'   r(   r)   r*   r%   r,   r   r-   r/   r3   r4   Úisprintabler   r   ÚisalphaÚisupperÚislowerÚisspaceÚisdigitr   Úboolr   r   r   r!   )r:   r$   Úor-   r   r   r
   Úupdatea   s¬    


&








zCharInfo.updateN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú	__slots__r;   rP   r   r   r   r
   r"   /   s   	r"   c                   @  sF   e Zd ZdZdZddddœdd„Zdd	œd
d„Zedd	œdd„ƒZdS )ÚMessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    r   r<   r"   r5   ©r$   Úinfor7   c                 C  s   t ‚dS )z‰
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        N©ÚNotImplementedError©r:   r$   rX   r   r   r
   Ú	feed_infoÖ   s    zMessDetectorPlugin.feed_infor6   c                 C  s   t ‚dS )zB
        Permit to reset the plugin to the initial state.
        NrY   r9   r   r   r
   ÚresetÝ   s    zMessDetectorPlugin.resetÚfloatc                 C  s   t ‚dS )z…
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        NrY   r9   r   r   r
   Úratioã   s    zMessDetectorPlugin.ratioN)	rQ   rR   rS   rT   rU   r\   r]   Úpropertyr_   r   r   r   r
   rV   Î   s   rV   c                   @  sP   e Zd ZdZddœdd„Zddddœd	d
„Zddœdd„Zeddœdd„ƒZdS )Ú TooManySymbolOrPunctuationPlugin©Ú_punctuation_countÚ_symbol_countÚ_character_countÚ_last_printable_charZ_frenzy_symbol_in_wordr5   r6   c                 C  s"   d| _ d| _d| _d | _d| _d S ©Nr   Frb   r9   r   r   r
   r;   ö   s
    z)TooManySymbolOrPunctuationPlugin.__init__r<   r"   rW   c                 C  sb   |  j d7  _ || jkrX|tkrX|jr6|  jd7  _n"|jsX|jrXt|ƒsX|  jd7  _|| _dS )ú1Optimized feed using pre-computed character info.r   é   N)	re   rf   r   r3   rc   r*   r4   r   rd   r[   r   r   r
   r\   þ   s    ÿþz*TooManySymbolOrPunctuationPlugin.feed_infoc                 C  s   d| _ d| _d| _d S ©Nr   )rc   re   rd   r9   r   r   r
   r]     s    z&TooManySymbolOrPunctuationPlugin.resetr^   c                 C  s0   | j dkrdS | j| j | j  }|dkr,|S dS )Nr   ç        ç333333Ó?)re   rc   rd   )r:   Zratio_of_punctuationr   r   r
   r_     s    

þz&TooManySymbolOrPunctuationPlugin.ratioN©	rQ   rR   rS   rU   r;   r\   r]   r`   r_   r   r   r   r
   ra   ì   s   ra   c                   @  sP   e Zd ZdZddœdd„Zddddœd	d
„Zddœdd„Zeddœdd„ƒZdS )ÚTooManyAccentuatedPlugin©re   Ú_accentuated_countr5   r6   c                 C  s   d| _ d| _d S rj   ro   r9   r   r   r
   r;   "  s    z!TooManyAccentuatedPlugin.__init__r<   r"   rW   c                 C  s&   |  j d7  _ |jr"|  jd7  _dS ©rh   r   N)re   r.   rp   r[   r   r   r
   r\   &  s    z"TooManyAccentuatedPlugin.feed_infoc                 C  s   d| _ d| _d S rj   ro   r9   r   r   r
   r]   -  s    zTooManyAccentuatedPlugin.resetr^   c                 C  s*   | j dk rdS | j| j  }|dkr&|S dS )Nr   rk   gffffffÖ?ro   )r:   Zratio_of_accentuationr   r   r
   r_   1  s    
zTooManyAccentuatedPlugin.ratioNrm   r   r   r   r
   rn     s   rn   c                   @  sP   e Zd ZdZddœdd„Zddddœd	d
„Zddœdd„Zeddœdd„ƒZdS )ÚUnprintablePlugin©Ú_unprintable_countre   r5   r6   c                 C  s   d| _ d| _d S rj   rs   r9   r   r   r
   r;   >  s    zUnprintablePlugin.__init__r<   r"   rW   c                 C  s<   |j s*|js*|dkr*|dkr*|  jd7  _|  jd7  _dS )rh   úu   ï»¿r   N)r)   r%   rt   re   r[   r   r   r
   r\   B  s    ÿþýüzUnprintablePlugin.feed_infoc                 C  s
   d| _ d S rj   )rt   r9   r   r   r
   r]   M  s    zUnprintablePlugin.resetr^   c                 C  s   | j dkrdS | jd | j  S )Nr   rk   r   )re   rt   r9   r   r   r
   r_   P  s    
zUnprintablePlugin.ratioNrm   r   r   r   r
   rr   :  s   rr   c                   @  sP   e Zd ZdZddœdd„Zddddœd	d
„Zddœdd„Zeddœdd„ƒZdS )ÚSuspiciousDuplicateAccentPlugin©Ú_successive_countre   Ú_last_latin_characterÚ_last_was_accentuatedr5   r6   c                 C  s   d| _ d| _d | _d| _d S rg   rw   r9   r   r   r
   r;   a  s    z(SuspiciousDuplicateAccentPlugin.__init__r<   r"   rW   c                 C  st   |  j d7  _ | jdk	rb|jrb| jrb|jrB| j ¡ rB|  jd7  _t|ƒt| jƒkrb|  jd7  _|| _|j| _dS rq   )re   ry   r.   rz   r'   rJ   rx   r   r[   r   r   r
   r\   h  s    ÿþýz)SuspiciousDuplicateAccentPlugin.feed_infoc                 C  s   d| _ d| _d | _d| _d S rg   rw   r9   r   r   r
   r]   w  s    z%SuspiciousDuplicateAccentPlugin.resetr^   c                 C  s   | j dkrdS | jd | j  S )Nr   rk   ri   )re   rx   r9   r   r   r
   r_   }  s    
z%SuspiciousDuplicateAccentPlugin.ratioNrm   r   r   r   r
   rv   X  s   rv   c                   @  sP   e Zd ZdZddœdd„Zddddœd	d
„Zddœdd„Zeddœdd„ƒZdS )ÚSuspiciousRange©Ú"_suspicious_successive_range_countre   Ú_last_printable_seenÚ_last_printable_ranger5   r6   c                 C  s   d| _ d| _d | _d | _d S rj   r|   r9   r   r   r
   r;   Ž  s    zSuspiciousRange.__init__r<   r"   rW   c                 C  s†   |  j d7  _ |js"|js"|tkr2d| _d| _dS | jdkrP|| _t|ƒ| _dS | j}t|ƒ}t||ƒrv|  jd7  _|| _|| _dS rq   )	re   r)   r3   r   r~   r   r   Ú is_suspiciously_successive_ranger}   )r:   r$   rX   Úunicode_range_aÚunicode_range_br   r   r
   r\   ”  s    


zSuspiciousRange.feed_infoc                 C  s   d| _ d| _d | _d | _d S rj   )re   r}   r~   r   r9   r   r   r
   r]   «  s    zSuspiciousRange.resetr^   c                 C  s"   | j dkrdS | jd | j  }|S )NrF   rk   ri   )re   r}   )r:   Zratio_of_suspicious_range_usager   r   r
   r_   ±  s    
þzSuspiciousRange.ratioNrm   r   r   r   r
   r{   …  s   r{   c                   @  sP   e Zd ZdZddœdd„Zddddœd	d
„Zddœdd„Zeddœdd„ƒZdS )ÚSuperWeirdWordPlugin©Ú_word_countÚ_bad_word_countÚ_foreign_long_countÚ_is_current_word_badÚ_foreign_long_watchre   Ú_bad_character_countÚ_buffer_lengthÚ_buffer_last_charÚ_buffer_last_char_accentuatedÚ_buffer_accent_countÚ_buffer_glyph_countÚ_buffer_upper_countr5   r6   c                 C  sR   d| _ d| _d| _d| _d| _d| _d| _d| _d | _d| _	d| _
d| _d| _d S rg   r„   r9   r   r   r
   r;   Ï  s    zSuperWeirdWordPlugin.__init__r<   r"   rW   c                 C  s  |j r€|  jd7  _|| _|jr.|  jd7  _|j| _|jrJ|  jd7  _| jsh|j	r\|jrh|j
shd| _|j
r||  jd7  _dS | jsŠdS |js |js t|ƒrÔ|  jd7  _| j}|  j|7  _|dkr8| j| dkrâd| _nV| jr| j ¡ r| j|kr|  jd7  _d| _n | jdkr8d| _|  jd7  _|dkr~| jr~| jdkob| j| dk}|s~|  jd7  _d| _| jr¨|  jd7  _|  j|7  _d	| _d	| _d| _d| _d	| _d| _d| _d| _n:|d
kr|js|jrd| _|  jd7  _|| _d	| _dS )rh   r   TNé   ç      à?é   r   rl   F>   ú>ú<Ú_ú=ú-ú~ú|)r&   r‹   rŒ   r'   r   r.   r   rŽ   r‰   r/   r2   r   r)   r3   r   r…   re   rˆ   rJ   r‡   r†   rŠ   r*   r4   )r:   r$   rX   Zbuffer_lengthZprobable_camel_casedr   r   r
   r\   á  sˆ    ÿþþý
ÿþýþÿþýzSuperWeirdWordPlugin.feed_infoc                 C  sR   d| _ d | _d| _d| _d| _d| _d| _d| _d| _d| _	d| _
d| _d| _d S rg   )r‹   rŒ   r   rˆ   r‰   r†   r…   re   rŠ   r‡   rŽ   r   r   r9   r   r   r
   r]   ,  s    zSuperWeirdWordPlugin.resetr^   c                 C  s$   | j dkr| jdkrdS | j| j S )Né
   r   rk   )r…   r‡   rŠ   re   r9   r   r   r
   r_   ;  s    zSuperWeirdWordPlugin.ratioNrm   r   r   r   r
   rƒ   ½  s   Krƒ   c                   @  sT   e Zd ZdZdZddœdd„Zdddd	œd
d„Zddœdd„Zeddœdd„ƒZ	dS )ÚCjkUncommonPluginz<
    Detect messy CJK text that probably means nothing.
    ©re   Ú_uncommon_countr5   r6   c                 C  s   d| _ d| _d S rj   r   r9   r   r   r
   r;   K  s    zCjkUncommonPlugin.__init__r<   r"   rW   c                 C  s(   |  j d7  _ |tkr$|  jd7  _dS rq   )re   r   rž   r[   r   r   r
   r\   O  s    zCjkUncommonPlugin.feed_infoc                 C  s   d| _ d| _d S rj   r   r9   r   r   r
   r]   V  s    zCjkUncommonPlugin.resetr^   c                 C  s.   | j dk rdS | j| j  }|dkr*|d S dS )Nr   rk   r’   r›   r   )r:   Zuncommon_form_usager   r   r
   r_   Z  s    
zCjkUncommonPlugin.ratioN)
rQ   rR   rS   rT   rU   r;   r\   r]   r`   r_   r   r   r   r
   rœ   C  s   rœ   c                   @  sP   e Zd ZdZddœdd„Zddddœd	d
„Zddœdd„Zeddœdd„ƒZdS )ÚArchaicUpperLowerPlugin©	Ú_bufÚ_character_count_since_last_sepÚ_successive_upper_lower_countÚ#_successive_upper_lower_count_finalre   Ú_last_alpha_seenÚ_last_alpha_seen_upperÚ_last_alpha_seen_lowerÚ_current_ascii_onlyr5   r6   c                 C  s:   d| _ d| _d| _d| _d| _d | _d| _d| _d| _d S )NFr   Tr    r9   r   r   r
   r;   t  s    z ArchaicUpperLowerPlugin.__init__r<   r"   rW   c                 C  s
  |j o
|j}| }|rv| jdkrv| jdkrF|jsF| jsF|  j| j7  _d| _d| _d| _d| _|  j	d7  _	d| _dS | jrˆ|j
sˆd| _| jdk	rÔ|jrž| jsª|jrÎ| jrÎ| jrÆ|  jd7  _d| _qÔd| _nd| _|  j	d7  _	|  jd7  _|| _|j| _|j| _dS )rh   r   é@   NFr   Tri   )r&   r,   r¢   r*   r¨   r¤   r£   r¥   r¡   re   r+   r'   r§   r(   r¦   )r:   r$   rX   Zis_concernedZ	chunk_sepr   r   r
   r\   ƒ  sJ    ÿþýÿ
ÿÿz!ArchaicUpperLowerPlugin.feed_infoc                 C  s:   d| _ d| _d| _d| _d | _d| _d| _d| _d| _d S )Nr   FT)	re   r¢   r£   r¤   r¥   r¦   r§   r¡   r¨   r9   r   r   r
   r]   °  s    zArchaicUpperLowerPlugin.resetr^   c                 C  s   | j dkrdS | j| j  S )Nr   rk   )re   r¤   r9   r   r   r
   r_   »  s    
zArchaicUpperLowerPlugin.ratioNrm   r   r   r   r
   rŸ   f  s   -rŸ   c                   @  sP   e Zd ZdZddœdd„Zddœdd„Zdd	dd
œdd„Zeddœdd„ƒZdS )ÚArabicIsolatedFormPlugin©re   Ú_isolated_form_countr5   r6   c                 C  s   d| _ d| _d S rj   r«   r9   r   r   r
   r;   Ç  s    z!ArabicIsolatedFormPlugin.__init__c                 C  s   d| _ d| _d S rj   r«   r9   r   r   r
   r]   Ë  s    zArabicIsolatedFormPlugin.resetr<   r"   rW   c                 C  s*   |  j d7  _ |jt@ r&|  jd7  _dS rq   )re   r-   r   r¬   r[   r   r   r
   r\   Ï  s    
z"ArabicIsolatedFormPlugin.feed_infor^   c                 C  s   | j dk rdS | j| j  }|S )Nr   rk   r«   )r:   Zisolated_form_usager   r   r
   r_   Ö  s    
zArabicIsolatedFormPlugin.ratioN)	rQ   rR   rS   rU   r;   r]   r\   r`   r_   r   r   r   r
   rª   Ã  s   rª   é   )Úmaxsizez
str | NonerN   )r   r‚   r7   c                 C  s”  | dks|dkrdS | |kr dS d| kr4d|kr4dS d| ksDd|krHdS d| ksXd|krld| kshd|krldS |   d¡|  d¡ }}|D ]}|tkr”q†||kr† dS q†| dk|dk }}|s¾|rÒd	| ksÎd	|krÒdS |rÞ|rÞdS d
| ksðd
|kr d	| ksd	|krdS | dks|dkr dS d	| ksHd	|ksH| dkr|dkrd| ks\d|kr`dS d| kstd|krxdS | dksŒ|dkrdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combiningú )ÚHiraganaÚKatakanaÚCJKZHangulzBasic Latin)r±   r°   ZPunctuationZForms)Úsplitr   )r   r‚   Zkeywords_range_aZkeywords_range_bÚelZrange_a_jp_charsZrange_b_jp_charsr   r   r
   r€   à  sh    ÿÿþÿúÿÿÿþr€   i   çš™™™™™É?Fr<   r^   )Údecoded_sequenceÚmaximum_thresholdÚdebugr7   c              	   C  sš  t | ƒ}|dk rd}n|dk r$d}nd}tƒ }tƒ }tƒ }tƒ }tƒ }	tƒ }
tƒ }tƒ }t	ƒ }|j
}|j
}|j
}|j
}|	j
}|
j
}|j
}|j
}|j
}tƒ }|j}td||ƒD ]ä}| ||| … D ]Š}||ƒ |||ƒ |||ƒ |||ƒ |jr|||ƒ |||ƒ |jrÀ|||ƒ |jr(|||ƒ |jr:|||ƒ |jrÀ|||ƒ qÀ|j|j |j |j |	j |
j |j |j |j }||kr¬ qîq¬|dƒ |d|ƒ |d|ƒ |d|ƒ |j|j |j |j |	j |
j |j |j |j }|rtdƒ}| td	|› d
|› d|› ¡ |dkrX| td| dd… › ¡ | td| dd… › ¡ |||||	|
|||f	D ] }| t|j› d|j› ¡ qnt|dƒS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    iÿ  rD   r­   r©   r=   r   Ú
Zcharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=é   zStarting with: NzEnding with: iðÿÿÿz: r   )Úlenra   rn   rr   rv   r{   rƒ   rœ   rŸ   rª   r\   r"   rP   Úranger%   r&   r/   r0   r1   r_   r   Úlogr   Ú	__class__Úround)r¶   r·   r¸   Zseq_lenÚstepZd_spZd_taZd_upZd_sdaZd_srZd_swZd_cuZd_auZd_aiZ	d_sp_feedZ	d_ta_feedZ	d_up_feedZ
d_sda_feedZ	d_sr_feedZ	d_sw_feedZ	d_cu_feedZ	d_au_feedZ	d_ai_feedrX   Zinfo_updateZblock_startr$   Zmean_mess_ratioÚloggerÚdtr   r   r
   Ú
mess_ratio*  sÂ    







ÿþýüûúùøÿ


ÿþýüûúùøÿþ
rÃ   )rµ   F)1Ú
__future__r   ÚsysÚ	functoolsr   Úloggingr   Úversion_infoÚtypingr   Ztyping_extensionsÚImportErrorZconstantr   r   r   r   r   r   r   r   r   r   r   r   r   Úutilsr   r   r   r   r   r   r   r!   Ú__annotations__r"   rV   ra   rn   rr   rv   r{   rƒ   rœ   rŸ   rª   r€   rÃ   r   r   r   r
   Ú<module>   sT    
<$ 1,7 "\I   ÿ