zmc
2023-12-22 9fdbf60165db0400c2e8e6be2dc6e88138ac719a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
U
M±dIã@sZddlmZddlmZddlmZmZddlmZm    Z    m
Z
ddl m Z m Z mZmZmZmZmZmZmZmZmZmZmZmZmZmZGdd„dƒZGd    d
„d
eƒZGd d „d eƒZGd d„deƒZGdd„deƒZ Gdd„deƒZ!Gdd„deƒZ"Gdd„deƒZ#Gdd„deƒZ$eddee%ee%e&dœdd„ƒZ'eddd%e%e(e&e(d!œd"d#„ƒZ)d$S)&é)Ú    lru_cache)Ú    getLogger)ÚListÚOptionalé)ÚCOMMON_SAFE_ASCII_CHARACTERSÚTRACEÚUNICODE_SECONDARY_RANGE_KEYWORD)Úis_accentuatedÚis_asciiÚis_case_variableÚis_cjkÚ is_emoticonÚ    is_hangulÚ is_hiraganaÚ is_katakanaÚis_latinÚis_punctuationÚ is_separatorÚ    is_symbolÚis_thaiÚis_unprintableÚ remove_accentÚ unicode_rangec@sPeZdZdZeedœdd„Zeddœdd„Zddœd    d
„Ze    e
dœd d „ƒZ dS) ÚMessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    ©Ú    characterÚreturncCst‚dS)z@
        Determine if given character should be fed in.
        N©ÚNotImplementedError©Úselfr©r"úLd:\z\workplace\vscode\pyvenv\venv\Lib\site-packages\charset_normalizer/md.pyÚeligible$szMessDetectorPlugin.eligibleNcCst‚dS)z‰
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        Nrr r"r"r#Úfeed*szMessDetectorPlugin.feed©rcCst‚dS)zB
        Permit to reset the plugin to the initial state.
        Nr©r!r"r"r#Úreset1szMessDetectorPlugin.resetcCst‚dS)z…
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        Nrr'r"r"r#Úratio7szMessDetectorPlugin.ratio) Ú__name__Ú
__module__Ú __qualname__Ú__doc__ÚstrÚboolr$r%r(ÚpropertyÚfloatr)r"r"r"r#rs rc@sZeZdZddœdd„Zeedœdd„Zeddœdd    „Zddœd
d „Ze    e
dœd d „ƒZ dS)Ú TooManySymbolOrPunctuationPluginNr&cCs"d|_d|_d|_d|_d|_dS)NrF)Ú_punctuation_countÚ _symbol_countÚ_character_countÚ_last_printable_charZ_frenzy_symbol_in_wordr'r"r"r#Ú__init__As
z)TooManySymbolOrPunctuationPlugin.__init__rcCs| ¡S©N©Ú isprintabler r"r"r#r$Isz)TooManySymbolOrPunctuationPlugin.eligiblecCsp|jd7_||jkrf|tkrft|ƒr8|jd7_n.| ¡dkrft|ƒrft|ƒdkrf|jd7_||_dS)NrFé)    r5r6rrr3Úisdigitrrr4r r"r"r#r%Lsÿþ
ÿþ
ýz%TooManySymbolOrPunctuationPlugin.feedcCsd|_d|_d|_dS©Nr)r3r5r4r'r"r"r#r(^sz&TooManySymbolOrPunctuationPlugin.resetcCs0|jdkrdS|j|j|j}|dkr,|SdS)Nrçg333333Ó?)r5r3r4)r!Zratio_of_punctuationr"r"r#r)cs 
 
þz&TooManySymbolOrPunctuationPlugin.ratio© r*r+r,r7r.r/r$r%r(r0r1r)r"r"r"r#r2@s r2c@sZeZdZddœdd„Zeedœdd„Zeddœdd    „Zddœd
d „Ze    e
dœd d „ƒZ dS)ÚTooManyAccentuatedPluginNr&cCsd|_d|_dSr=©r5Ú_accentuated_countr'r"r"r#r7psz!TooManyAccentuatedPlugin.__init__rcCs| ¡Sr8)Úisalphar r"r"r#r$tsz!TooManyAccentuatedPlugin.eligiblecCs(|jd7_t|ƒr$|jd7_dS©Nr)r5r
rBr r"r"r#r%wszTooManyAccentuatedPlugin.feedcCsd|_d|_dSr=rAr'r"r"r#r(}szTooManyAccentuatedPlugin.resetcCs4|jdks|jdkrdS|j|j}|dkr0|SdS)Nrér>gffffffÖ?rA)r!Zratio_of_accentuationr"r"r#r)s zTooManyAccentuatedPlugin.ratior?r"r"r"r#r@os r@c@sZeZdZddœdd„Zeedœdd„Zeddœdd    „Zddœd
d „Ze    e
dœd d „ƒZ dS)ÚUnprintablePluginNr&cCsd|_d|_dSr=)Ú_unprintable_countr5r'r"r"r#r7ŠszUnprintablePlugin.__init__rcCsdS©NTr"r r"r"r#r$ŽszUnprintablePlugin.eligiblecCs(t|ƒr|jd7_|jd7_dSrD)rrGr5r r"r"r#r%‘szUnprintablePlugin.feedcCs
d|_dSr=)rGr'r"r"r#r(–szUnprintablePlugin.resetcCs|jdkrdS|jd|jS)Nrr>rE)r5rGr'r"r"r#r)™s
zUnprintablePlugin.ratior?r"r"r"r#rF‰s rFc@sZeZdZddœdd„Zeedœdd„Zeddœdd    „Zddœd
d „Ze    e
dœd d „ƒZ dS)ÚSuspiciousDuplicateAccentPluginNr&cCsd|_d|_d|_dSr=©Ú_successive_countr5Ú_last_latin_characterr'r"r"r#r7¢sz(SuspiciousDuplicateAccentPlugin.__init__rcCs| ¡ot|ƒSr8)rCrr r"r"r#r$¨sz(SuspiciousDuplicateAccentPlugin.eligiblecCst|jd7_|jdk    rjt|ƒrjt|jƒrj| ¡rJ|j ¡rJ|jd7_t|ƒt|jƒkrj|jd7_||_dSrD)r5rLr
ÚisupperrKrr r"r"r#r%«sÿþýz$SuspiciousDuplicateAccentPlugin.feedcCsd|_d|_d|_dSr=rJr'r"r"r#r(¹sz%SuspiciousDuplicateAccentPlugin.resetcCs|jdkrdS|jd|jS)Nrr>r;)r5rKr'r"r"r#r)¾s
z%SuspiciousDuplicateAccentPlugin.ratior?r"r"r"r#rI¡s rIc@sZeZdZddœdd„Zeedœdd„Zeddœdd    „Zddœd
d „Ze    e
dœd d „ƒZ dS)ÚSuspiciousRangeNr&cCsd|_d|_d|_dSr=)Ú"_suspicious_successive_range_countr5Ú_last_printable_seenr'r"r"r#r7ÇszSuspiciousRange.__init__rcCs| ¡Sr8r9r r"r"r#r$ÌszSuspiciousRange.eligiblecCsx|jd7_| ¡s&t|ƒs&|tkr0d|_dS|jdkrD||_dSt|jƒ}t|ƒ}t||ƒrn|jd7_||_dSrD)r5ÚisspacerrrPrÚ is_suspiciously_successive_rangerO)r!rÚunicode_range_aÚunicode_range_br"r"r#r%Ïs"ÿþý
 
 
zSuspiciousRange.feedcCsd|_d|_d|_dSr=)r5rOrPr'r"r"r#r(æszSuspiciousRange.resetcCs.|jdkrdS|jd|j}|dkr*dS|S)Nrr>r;gš™™™™™¹?)r5rO)r!Zratio_of_suspicious_range_usager"r"r#r)ës
þzSuspiciousRange.ratior?r"r"r"r#rNÆs rNc@sZeZdZddœdd„Zeedœdd„Zeddœdd    „Zddœd
d „Ze    e
dœd d „ƒZ dS)ÚSuperWeirdWordPluginNr&cCs:d|_d|_d|_d|_d|_d|_d|_d|_d|_dS)NrFÚ)    Ú _word_countÚ_bad_word_countÚ_foreign_long_countÚ_is_current_word_badÚ_foreign_long_watchr5Ú_bad_character_countÚ_bufferÚ_buffer_accent_countr'r"r"r#r7ûszSuperWeirdWordPlugin.__init__rcCsdSrHr"r r"r"r#r$    szSuperWeirdWordPlugin.eligiblecCsÚ| ¡r|j|7_t|ƒr,|jd7_|jdkrŒt|ƒdksJt|ƒrŒt|ƒdkrŒt|ƒdkrŒt|ƒdkrŒt    |ƒdkrŒt
|ƒdkrŒd|_dS|jsšdS|  ¡s´t |ƒs´t |ƒr |jr |jd7_t|jƒ}|j|7_|dkr6|j|dkrd|_t|jdƒr6|jd ¡r6|jd7_d|_|dkr\|jr\|jd7_d|_|jrŒ|jd7_|jt|jƒ7_d|_d|_d|_d    |_n6|d
krÖ| ¡dkrÖt|ƒrÖd|_|j|7_dS) NrFTégÃõ(\ÂÕ?éÿÿÿÿérVr>ú|ú<ú>ú-ú~Ú_ú=)rCr]r
r^r[rr rrrrrQrrrWÚlenr5rZrMrYrXr\r<r)r!rZ buffer_lengthr"r"r#r% svÿ
þþ
ý
ü
û
ú
ù    ÿÿÿþ
 
 ÿ
þýzSuperWeirdWordPlugin.feedcCs4d|_d|_d|_d|_d|_d|_d|_d|_dS)NrVFr)r]rZr[rXrWr5r\rYr'r"r"r#r(BszSuperWeirdWordPlugin.resetcCs$|jdkr|jdkrdS|j|jS)Né
rr>)rWrYr\r5r'r"r"r#r)LszSuperWeirdWordPlugin.ratior?r"r"r"r#rUús 6
rUc@s^eZdZdZddœdd„Zeedœdd„Zeddœd    d
„Zddœd d „Z    e
e dœd d„ƒZ dS)ÚCjkInvalidStopPluginu²
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    Nr&cCsd|_d|_dSr=©Ú_wrong_stop_countÚ_cjk_character_countr'r"r"r#r7ZszCjkInvalidStopPlugin.__init__rcCsdSrHr"r r"r"r#r$^szCjkInvalidStopPlugin.eligiblecCs4|dkr|jd7_dSt|ƒr0|jd7_dS)N>õ丄õ丅r)rmr rnr r"r"r#r%as
zCjkInvalidStopPlugin.feedcCsd|_d|_dSr=rlr'r"r"r#r(hszCjkInvalidStopPlugin.resetcCs|jdkrdS|j|jS)Nér>)rnrmr'r"r"r#r)ls
zCjkInvalidStopPlugin.ratio) r*r+r,r-r7r.r/r$r%r(r0r1r)r"r"r"r#rkTsrkc@sZeZdZddœdd„Zeedœdd„Zeddœdd    „Zddœd
d „Ze    e
dœd d „ƒZ dS)ÚArchaicUpperLowerPluginNr&cCs.d|_d|_d|_d|_d|_d|_d|_dS)NFrT)Ú_bufÚ_character_count_since_last_sepÚ_successive_upper_lower_countÚ#_successive_upper_lower_count_finalr5Ú_last_alpha_seenÚ_current_ascii_onlyr'r"r"r#r7tsz ArchaicUpperLowerPlugin.__init__rcCsdSrHr"r r"r"r#r$sz ArchaicUpperLowerPlugin.eligiblecCs$| ¡ot|ƒ}|dk}|r†|jdkr†|jdkrV| ¡dkrV|jdkrV|j|j7_d|_d|_d|_d|_|j    d7_    d|_dS|jdkr¢t
|ƒdkr¢d|_|jdk    rþ|  ¡r¾|j  ¡sÐ|  ¡rø|j  ¡rø|jdkrð|jd7_d|_qþd|_nd|_|j    d7_    |jd7_||_dS)NFré@rTr;) rCr rtr<rxrvrurwrsr5r rMÚislower)r!rZ is_concernedZ    chunk_sepr"r"r#r%„sFÿ
þýÿ
ÿÿ
zArchaicUpperLowerPlugin.feedcCs.d|_d|_d|_d|_d|_d|_d|_dS)NrFT)r5rtrurvrwrsrxr'r"r"r#r(®szArchaicUpperLowerPlugin.resetcCs|jdkrdS|j|jS)Nrr>)r5rvr'r"r"r#r)·s
zArchaicUpperLowerPlugin.ratior?r"r"r"r#rrss  *    rré)Úmaxsize)rSrTrcCs||dks|dkrdS||kr dSd|kr4d|kr4dSd|ksDd|krHdSd|ksXd|krld|kshd|krldS| d¡| d¡}}|D]}|tkr”q†||kr†dSq†|dk|dk}}|s¾|rÒd    |ksÎd    |krÒdS|rÞ|rÞdSd
|ksðd
|kr d    |ksd    |krdS|d ks|d kr dSd    |ksHd    |ksH|d krx|d krxd |ks\d |kr`dSd|kstd|krxdSdS)za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ    EmoticonsZ    Combiningú )ÚHiraganaÚKatakanaÚCJKZHangulz Basic Latin)rr~Z PunctuationZForms)Úsplitr    )rSrTZkeywords_range_aZkeywords_range_bÚelZrange_a_jp_charsZrange_b_jp_charsr"r"r#rR¿shÿÿÿþÿúÿÿÿþrRi皙™™™™É?F)Údecoded_sequenceÚmaximum_thresholdÚdebugrc     CsXdd„t ¡Dƒ}t|ƒd}d}|dkr0d}n|dkr>d}nd    }t|d
t|ƒƒD]d\}}|D]}    |     |¡r`|     |¡q`|d krŽ||d ksš||dkrTtd d „|Dƒƒ}||krTqºqT|rNtdƒ}
|
     t
d|›d|›d|›¡t|ƒdkr(|
     t
d|dd…›¡|
     t
d|dd…›¡|D] } |
     t
| j ›d| j ›¡q,t |dƒS)zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    cSsg|]
}|ƒ‘qSr"r")Ú.0Zmd_classr"r"r#Ú
<listcomp>szmess_ratio.<locals>.<listcomp>rr>ié r{ryé€Ú
rcss|] }|jVqdSr8)r))r‡Údtr"r"r#Ú    <genexpr>%szmess_ratio.<locals>.<genexpr>Zcharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=rqzStarting with: Nz Ending with: iðÿÿÿz: é)rÚ__subclasses__riÚzipÚranger$r%ÚsumrÚlogrÚ    __class__r)Úround) r„r…r†Z    detectorsÚlengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcrÚindexÚdetectorÚloggerrŒr"r"r#Ú
mess_ratiosFÿ 
 ÿ
ÿ
þþršN)rƒF)*Ú    functoolsrÚloggingrÚtypingrrZconstantrrr    Úutilsr
r r r rrrrrrrrrrrrrr2r@rFrIrNrUrkrrr.r/rRr1ršr"r"r"r#Ú<module>s6  H"/%4ZLþFÿþ