U
    /j2                     @  s   d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZmZ G d	d
 d
ZG dd dZeeef Zee ZG dd dZdS )    )annotations)aliases)dumps)sub)AnyIteratorListTuple   )RE_POSSIBLE_ENCODING_INDICATIONTOO_BIG_SEQUENCE)	iana_nameis_multi_byte_encodingunicode_rangec                	   @  s  e Zd ZdCddddddddd	d
ZdddddZdddddZeddddZddddZddddZ	d ddddZ
eddddZeddddZeddd d!Zeddd"d#Zeddd$d%Zeddd&d'Zeddd(d)Zeddd*d+Zeddd,d-Zeddd.d/Zeddd0d1Zed2dd3d4Zeddd5d6Zeddd7d8Zeddd9d:ZdDdd<d=d>d?Zed@ddAdBZdS )ECharsetMatchNzbytes | bytearraystrfloatboolCoherenceMatches
str | None)payloadguessed_encodingmean_mess_ratiohas_sig_or_bom	languagesdecoded_payloadpreemptive_declarationc                 C  sL   || _ || _|| _|| _|| _d | _g | _d| _d | _d | _	|| _
|| _d S )N        )_payload	_encoding_mean_mess_ratio
_languages_has_sig_or_bom_unicode_ranges_leavesZ_mean_coherence_ratio_output_payload_output_encoding_string_preemptive_declaration)selfr   r   r   r   r   r   r    r*   =/tmp/pip-unpacked-wheel-s9saxz3h/charset_normalizer/models.py__init__   s    
zCharsetMatch.__init__object)otherreturnc                 C  s>   t |ts&t |tr"t|| jkS dS | j|jko<| j|jkS )NF)
isinstancer   r   r   encodingfingerprintr)   r.   r*   r*   r+   __eq__)   s
    

zCharsetMatch.__eq__c                 C  s   t |tstt| j|j }t| j|j }|dk rJ|dkrJ| j|jkS |dk r|dkrt| jtkrt| j|jk S | j	|j	kS | j|jk S )zQ
        Implemented to make sorted available upon CharsetMatches items.
        g{Gzt?g{Gz?)
r0   r   
ValueErrorabschaos	coherencelenr   r   multi_byte_usage)r)   r.   Zchaos_differenceZcoherence_differencer*   r*   r+   __lt__0   s    
zCharsetMatch.__lt__r/   c                 C  s   dt t| t | j  S )Ng      ?)r9   r   rawr)   r*   r*   r+   r:   F   s    zCharsetMatch.multi_byte_usagec                 C  sV   | j d krPt| j| jd| _ | jrP| jdkrP| j rP| j d dkrP| j dd  | _ | j S )Nstrictutf_7r   u   ﻿r
   )r'   r   r   r   r"   r>   r*   r*   r+   __str__J   s    
zCharsetMatch.__str__c                 C  s   d| j  d| j dS )Nz<CharsetMatch 'z' fp(z)>)r1   r2   r>   r*   r*   r+   __repr__Z   s    zCharsetMatch.__repr__Nonec                 C  s8   t |tr|| kr"td|jd |_| j| d S )Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r0   r   r5   format	__class__r'   r$   appendr3   r*   r*   r+   add_submatch]   s    zCharsetMatch.add_submatchc                 C  s   | j S N)r   r>   r*   r*   r+   r1   h   s    zCharsetMatch.encoding	list[str]c                 C  sD   g }t  D ]2\}}| j|kr*|| q| j|kr|| q|S )z
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        )r   itemsr1   rF   )r)   Zalso_known_asupr*   r*   r+   encoding_aliasesl   s    

zCharsetMatch.encoding_aliasesc                 C  s   | j S rH   r"   r>   r*   r*   r+   bomy   s    zCharsetMatch.bomc                 C  s   | j S rH   rN   r>   r*   r*   r+   byte_order_mark}   s    zCharsetMatch.byte_order_markc                 C  s   dd | j D S )z
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        c                 S  s   g | ]}|d  qS )r   r*   ).0er*   r*   r+   
<listcomp>   s     z*CharsetMatch.languages.<locals>.<listcomp>r!   r>   r*   r*   r+   r      s    zCharsetMatch.languagesc                 C  sp   | j sbd| jkrdS ddlm}m} t| jr8|| jn|| j}t|dksVd|krZdS |d S | j d d S )z
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        asciiZEnglishr   )encoding_languagesmb_encoding_languageszLatin BasedUnknown)r!   could_be_from_charsetZcharset_normalizer.cdrV   rW   r   r1   r9   )r)   rV   rW   r   r*   r*   r+   language   s    
zCharsetMatch.languagec                 C  s   | j S rH   )r    r>   r*   r*   r+   r7      s    zCharsetMatch.chaosc                 C  s   | j s
dS | j d d S )Nr   r   r
   rT   r>   r*   r*   r+   r8      s    zCharsetMatch.coherencec                 C  s   t | jd ddS Nd      )ndigits)roundr7   r>   r*   r*   r+   percent_chaos   s    zCharsetMatch.percent_chaosc                 C  s   t | jd ddS r[   )r_   r8   r>   r*   r*   r+   percent_coherence   s    zCharsetMatch.percent_coherencec                 C  s   | j S )z+
        Original untouched bytes.
        )r   r>   r*   r*   r+   r=      s    zCharsetMatch.rawzlist[CharsetMatch]c                 C  s   | j S rH   )r$   r>   r*   r*   r+   submatch   s    zCharsetMatch.submatchc                 C  s   t | jdkS Nr   )r9   r$   r>   r*   r*   r+   has_submatch   s    zCharsetMatch.has_submatchc                 C  s@   | j d k	r| j S dd t| D }ttdd |D | _ | j S )Nc                 S  s   g | ]}t |qS r*   )r   )rQ   charr*   r*   r+   rS      s     z*CharsetMatch.alphabets.<locals>.<listcomp>c                 S  s   h | ]}|r|qS r*   r*   )rQ   rr*   r*   r+   	<setcomp>   s      z)CharsetMatch.alphabets.<locals>.<setcomp>)r#   r   sortedlist)r)   Zdetected_rangesr*   r*   r+   	alphabets   s
    
zCharsetMatch.alphabetsc                 C  s   | j gdd | jD  S )z
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        c                 S  s   g | ]
}|j qS r*   )r1   )rQ   mr*   r*   r+   rS      s     z6CharsetMatch.could_be_from_charset.<locals>.<listcomp>)r   r$   r>   r*   r*   r+   rY      s    z"CharsetMatch.could_be_from_charsetutf_8bytes)r1   r/   c                   s~    j dks j |krx| _ t } jdk	rj j dkrjtt fdd|dd dd}||dd  }||d _ jS )	z
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        N)zutf-8utf8rl   c                   s<   | j |  d |  d  |  d t jddS )Nr   r
   _-)stringspanreplacegroupsr   r&   )rk   r>   r*   r+   <lambda>   s   
z%CharsetMatch.output.<locals>.<lambda>i    r
   )countrs   )r&   r   r(   lowerr   r   encoder%   )r)   r1   decoded_stringZpatched_headerr*   r>   r+   output   s$    


zCharsetMatch.outputintc                 C  s   t t| S )z]
        Retrieve a hash fingerprint of the decoded payload, used for deduplication.
        )hashr   r>   r*   r*   r+   r2      s    zCharsetMatch.fingerprint)NN)rl   )__name__
__module____qualname__r,   r4   r;   propertyr:   rA   rB   rG   r1   rM   rO   rP   r   rZ   r7   r8   r`   ra   r=   rb   rd   rj   rY   rz   r2   r*   r*   r*   r+   r      sV     	r   c                   @  s   e Zd ZdZdddddZddd	d
ZdddddZddddZddddZdddddZ	ddddZ
ddddZdS )CharsetMatchesz
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    Nzlist[CharsetMatch] | None)resultsc                 C  s   |rt |ng | _d S rH   )rh   _results)r)   r   r*   r*   r+   r,     s    zCharsetMatches.__init__zIterator[CharsetMatch]r<   c                 c  s   | j E d H  d S rH   r   r>   r*   r*   r+   __iter__  s    zCharsetMatches.__iter__z	int | strr   )itemr/   c                 C  sN   t |tr| j| S t |trFt|d}| jD ]}||jkr.|  S q.tdS )z
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        FN)r0   r{   r   r   r   rY   KeyError)r)   r   resultr*   r*   r+   __getitem__
  s    






zCharsetMatches.__getitem__r{   c                 C  s
   t | jS rH   r9   r   r>   r*   r*   r+   __len__  s    zCharsetMatches.__len__r   c                 C  s   t | jdkS rc   r   r>   r*   r*   r+   __bool__  s    zCharsetMatches.__bool__rC   c                 C  s|   t |tstdt|jt|jtk r`| j	D ],}|j
|j
kr2|j|jkr2||  dS q2| j	| t| j	| _	dS )z~
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        z-Cannot append instance '{}' to CharsetMatchesN)r0   r   r5   rD   r   rE   r9   r=   r   r   r2   r7   rG   rF   rh   )r)   r   matchr*   r*   r+   rF     s    


zCharsetMatches.appendzCharsetMatch | Nonec                 C  s   | j s
dS | j d S )zQ
        Simply return the first match. Strict equivalent to matches[0].
        Nr   r   r>   r*   r*   r+   best2  s    zCharsetMatches.bestc                 C  s   |   S )zP
        Redundant method, call the method best(). Kept for BC reasons.
        )r   r>   r*   r*   r+   first:  s    zCharsetMatches.first)N)r}   r~   r   __doc__r,   r   r   r   r   rF   r   r   r*   r*   r*   r+   r      s   r   c                   @  sN   e Zd ZddddddddddddddZed	d
ddZdd
ddZdS )CliDetectionResultr   r   rI   r   r   pathr1   rM   alternative_encodingsrZ   rj   r   r7   r8   unicode_pathis_preferredc                 C  sF   || _ |
| _|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
d S rH   )r   r   r1   rM   r   rZ   rj   r   r7   r8   r   )r)   r   r1   rM   r   rZ   rj   r   r7   r8   r   r   r*   r*   r+   r,   F  s    zCliDetectionResult.__init__zdict[str, Any]r<   c                 C  s2   | j | j| j| j| j| j| j| j| j| j	| j
dS )Nr   r   r>   r*   r*   r+   __dict__`  s    zCliDetectionResult.__dict__c                 C  s   t | jdddS )NT   )ensure_asciiindent)r   r   r>   r*   r*   r+   to_jsonp  s    zCliDetectionResult.to_jsonN)r}   r~   r   r,   r   r   r   r*   r*   r*   r+   r   E  s   "r   N)
__future__r   Zencodings.aliasesr   jsonr   rer   typingr   r   r   r	   Zconstantr   r   utilsr   r   r   r   r   r   r   ZCoherenceMatchr   r   r*   r*   r*   r+   <module>   s    sC