zmc
2023-08-08 e792e9a60d958b93aef96050644f369feb25d61b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
U
­ý°dg'ã @s*dZddlmZmZddlZddlZddlZddlZddl    Z    ddl
m Z m Z ddl mZe    j d¡Ze    j d¡Zdd„Zed    d
„ƒZee    j d d d g¡e    j ddddg¡dd„ƒƒƒZdd„Zdd„Ze    j ddie ddgiƒfdddie ddgiƒfdd dgie dd!d"giƒfd#dgd$d%œe ddgiƒfd#dgd&d%œe dejdgiƒfg¡d'd(„ƒZd)d*„Ze    j d+d,d-d.g¡d/d0„ƒZee    j d1d$d&g¡d2d3„ƒƒZed4d5„ƒZe    j dd6dd7d8d9g¡d:d;„ƒZ ee    j dd6ddd<d=g¡d>d?„ƒƒZ!ed@dA„ƒZ"edBdC„ƒZ#e    j d¡e    j dDdEdFg¡dGdH„ƒƒZ$dS)IzZ
Tests encoding functionality during parsing
for all of the parsers defined in parsers.py
é)ÚBytesIOÚ TextIOWrapperN)Ú    DataFrameÚread_csvZ pyarrow_skipZ pyarrow_xfailcCsLd}|}td |¡ƒ}|j|d|d}tddggddgd    }t ||¡dS)
NÚcp1255uשלום:1234
562:123ú:)ÚsepÚencodingi2é{uשלוםZ1234©Úcolumns©rÚencoderrÚtmÚassert_frame_equal)Ú all_parsersr    ÚparserÚdataÚresultÚexpected©rú[d:\z\workplace\vscode\pyvenv\venv\Lib\site-packages\pandas/tests/io/parser/test_encoding.pyÚtest_bytes_io_inputs rcCs@|}td ¡ƒ}|j|dddd}tddggƒ}t ||¡dS)Nu Łaski, Jan;1ú;úutf-8)rr    Úheaderu Łaski, Janér )rrrrrrrrÚtest_read_csv_unicode%s
 rrú,ú    r    úutf-16zutf-16lezutf-16bec
CsÌ|}d d|¡}dt ¡›d}|ddœ}d}t |¡Š}| |¡}t|dƒ}    |     |¡W5QRXtt    | |¡ƒ|d    2}
|j
|fd
|i|—Ž} |j
|
fd
|i|—Ž} W5QRXt  | | ¡W5QRXdS) Nz)skip this
skip this too
A,B,C
1,2,3
4,5,6rÚ__z__.csvé)rZskiprowsrÚwb©r    r    ) ÚreplaceÚuuidÚuuid4rÚ ensure_cleanrÚopenÚwriterrrr) rrr    rrÚpathÚkwargsÚutf8Ú
bytes_dataÚfZ bytes_bufferrrrrrÚtest_utf16_bom_skiprows/s û
 
  r0cCs6tj |d¡}|}|j|ddd}t|ƒdks2t‚dS)Nz utf16_ex.txtr r)r    ré2)Úosr+ÚjoinrÚlenÚAssertionError)rÚ csv_dir_pathr+rrrrrÚtest_utf16_exampleLsr7cCsLtj |d¡}|}|j|ddd}| d¡}|dd}d}||ksHt‚dS)Núunicode_series.csvúlatin-1)rr    rri`u$Á köldum klaka (Cold Fever) (1994))r2r+r3rZ    set_indexr5)rr6r+rrÚgotrrrrÚtest_unicode_encodingSs
 r;zdata,kwargs,expectedza
1Úarz"a"
1Ú    quotecharú"zb
1ÚnamesÚbÚ1ú
1T)r?Úskip_blank_linesFcsx|}d‰d‰‡‡fdd„}|jdkrN|dkrN| dd¡rN|j tjjd    d
¡|j||ƒfd ˆi|—Ž}t     ||¡dS) Nurcsˆ| ˆ¡}t|ƒS)N)rr)Ú_dataZbom_data©Úbomr-rrÚ_encode_data_with_bomxsz,test_utf8_bom.<locals>._encode_data_with_bomZpyarrowrBrCTzPyarrow can't read blank lines)Úreasonr    )
ÚengineÚgetÚnodeZ
add_markerÚpytestÚmarkZxfailrrr)rrr,rÚrequestrrGrrrErÚ test_utf8_bom_sÿþ
ý ÿrOcCsLtdgdgdœƒ}|}| |¡}d |¡}|jt|ƒ|d}t ||¡dS)Ng333333@Útest)Zmb_numZ    multibytezmb_num,multibyte
4.8,testr$)rÚformatrrrrr)rÚ    utf_valueÚ encoding_fmtrrr    rrrrrÚtest_read_csv_utf_aliasesŠs 
 
rTzfile_path,encoding))ÚiorÚcsvz    test1.csvr))rUrrr8r9))rUrrzsauron.SHIFT_JIS.csvÚshiftjisc
    CsÐ|}||Ž}|j||d}t||d}| |¡}|jr<t‚W5QRXt ||¡t|dd}    |j|    |d}|    jrxt‚W5QRXt ||¡t|ddd}    |j|    |d}|    jr¶t‚W5QRXt ||¡dS)Nr$Úrb©Úmoder)rZÚ    buffering)rr)Úclosedr5rr)
rÚ    file_pathr    ÚdatapathrZfpathrÚfarZfbrrrÚtest_binary_mode_file_buffers–s 
  r`Ú pass_encodingc        Csr|}| |¡}tddgiƒ}tjd|dd<}| d¡| d¡|j||rP|ndd}t ||¡W5QRXdS)    NZfooÚbarzw+T)rZr    Zreturn_filelikezfoo
barrr$)rQrrr(r*Úseekrr)    rrRrSrarr    rr/rrrrÚtest_encoding_temp_file¶s
 
 
rdc    Cs~|}d}d}d}t||giƒ}t ¡N}| |›d|› |¡¡| d¡|j||d}t ||¡|j    rpt
‚W5QRXdS)Nz    shift-jisu    ã¦ã™ã¨uこむÚ
rr$) rÚtempfileÚNamedTemporaryFiler*rrcrrrr\r5)rrr    Útitlerrr/rrrrÚtest_encoding_named_temp_fileÇs
 
 rirz    utf-16-bez    utf-16-lezutf-32cCsRd}t| |¡ƒ}t|d|d}tddgddgdd    ggd
d gd }t ||¡dS) Nua    b
:foo    0
bar    1
baz    2r)Ú    delimiterr    u:foorrbrZbazr"r<r@)rr r )r    rZ encoded_datarrrrrÚ%test_parse_encoded_special_charactersÜs
"rkrr9c    Csp|}tddddgddddgd    d
d d gd œƒ}t ¡&}|j|d|d|j||dd}W5QRXt ||¡dS)NZRaphaelZ    Donatelloz Miguel AngelZLeonardoÚredZpurpleZorangeÚblueZsaizbo staffZnunchunkZkatana)ÚnameÚmaskZweaponF)Úindexr    T)r    Ú
memory_map)rrr(Úto_csvrr)rr    rrÚfileÚdfrrrÚtest_encoding_memory_mapês
 
 
ýÿ
ruc    Csh|}tdgdd}d|jd<t d¡*}|j|dddd    |j|d
d d d }W5QRXt ||¡d
S)zO
    Chunk splits a multibyte character with memory_map=True
 
    GH 43540
    Zaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaai)ruaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaąiÿzbug-gh43540.csvFr©rprr    NTÚc)rrqrI)rZilocrr(rrrr)rrrtÚfnameÚdfrrrrÚ test_chunk_splits_multibyte_charüs
 rzc     CsÖg}d}d}d}tt|ƒt|ƒ|ƒD]X}d dd„t||dƒDƒ¡d}z| d¡Wntk
rpYq$YnX| |¡q$|}t|ƒ}t d    ¡,}    |j    |    d
d
dd |j
|    d d ddd}
W5QRXt  ||
¡d S)zg
    GH 43787
 
    Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
    é€ú u𐂀ÚcSsg|] }t|ƒ‘qSr)Úchr)Ú.0rwrrrÚ
<listcomp>sz,test_readcsv_memmap_utf8.<locals>.<listcomp>rerz utf8test.csvFrvNTrw)rrqrIr    ) ÚrangeÚordr3rÚUnicodeEncodeErrorÚappendrrr(rrrr) rÚlinesZ line_lengthZ
start_charZend_charÚlnumÚlinerrtrxryrrrÚtest_readcsv_memmap_utf8s."
  ÿrˆrZzw+bzw+tc    Csh|}d}d|krd}tj|d$}| |¡| d¡| |¡}W5QRXtgdgd}t ||¡dS)NsabcdÚtZabcdrYrr )rfÚSpooledTemporaryFiler*rcrrrr)rrZrÚcontentÚhandlertrrrrÚtest_not_readable.s
 
r)%Ú__doc__rUrrr2rfr&ÚnumpyÚnprLZpandasrrZpandas._testingZ_testingrrMZ usefixturesZ skip_pyarrowZ xfail_pyarrowrrZ parametrizer0r7r;ÚnanrOrTr`rdrirkrurzrˆrrrrrÚ<module>sz    
     
ýöþ
 ýþ
 
 ÿ