-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcodecs_01.py
More file actions
193 lines (175 loc) · 8.58 KB
/
codecs_01.py
File metadata and controls
193 lines (175 loc) · 8.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# -*- coding: utf-8 -*-
# __author__ = 'k22li'
# Commonly used codecs, their aliases and the languages they cover are listed below.
"""
Codec Aliases Languages
ascii 646,us-ascii English
big5 big5-tw,csbig5 Traditional Chinese
big5hkscs big5-hkscs,hkscs Traditional Chinese
cp037 IBM037,IBM039 English
cp424 EBCDIC-CP-HE,IBM424 Hebrew
cp437 437,IBM437 English
cp500 EBCDIC-CP-BE,EBCDIC-CP-CH,IBM500 Western Europe
cp720 Arabic
cp737 Greek
cp775 IBM775 Baltic languages
cp850 850,IBM850 Western Europe
cp852 852,IBM852 Central and Eastern Europe
cp855 855,IBM855 Bulgarian,Byelorussian,Macedonian,Russian,Serbian
cp856 Hebrew
cp857 857,IBM857 Turkish
cp858 858,IBM858 Western Europe
cp860 860,IBM860 Portuguese
cp861 861,CP-IS,IBM861 Icelandic
cp862 862,IBM862 Hebrew
cp863 863,IBM863 Canadian
cp864 IBM864 Arabic
cp865 865,IBM865 Danish,Norwegian
cp866 866,IBM866 Russian
cp869 869,CP-GR,IBM869 Greek
cp874 Thai
cp875 Greek
cp932 932,ms932,mskanji,ms-kanji Japanese
cp949 949,ms949,uhc Korean
cp950 950,ms950 Traditional Chinese
cp1006 Urdu
cp1026 ibm1026 Turkish
cp1140 ibm1140 Western Europe
cp1250 windows-1250 Central and Eastern Europe
cp1251 windows-1251 Bulgarian,Byelorussian,Macedonian,Russian,Serbian
cp1252 windows-1252 Western Europe
cp1253 windows-1253 Greek
cp1254 windows-1254 Turkish
cp1255 windows-1255 Hebrew
cp1256 windows-1256 Arabic
cp1257 windows-1257 Baltic languages
cp1258 windows-1258 Vietnamese
euc_jp eucjp,ujis,u-jis Japanese
euc_jis_2004 jisx0213,eucjis2004 Japanese
euc_jisx0213 eucjisx0213 Japanese
euc_kr euckr,korean,ksc5601,ks_c-5601,ks_c-5601-1987,ksx1001,ks_x-1001 Korean
gb2312 chinese,csiso58gb231280,euc-cn,euccn,eucgb2312-cn,gb2312-1980,gb2312-80,iso-ir-58 Simplified Chinese
gbk 936,cp936,ms936 Unified Chinese
gb18030 gb18030-2000 Unified Chinese
hz hzgb,hz-gb,hz-gb-2312 Simplified Chinese
iso2022_jp csiso2022jp,iso2022jp,iso-2022-jp Japanese
iso2022_jp_1 iso2022jp-1,iso-2022-jp-1 Japanese
iso2022_jp_2 iso2022jp-2,iso-2022-jp-2 Japanese,Korean,Simplified Chinese,Western Europe,Greek
iso2022_jp_2004 iso2022jp-2004,iso-2022-jp-2004 Japanese
iso2022_jp_3 iso2022jp-3,iso-2022-jp-3 Japanese
iso2022_jp_ext iso2022jp-ext,iso-2022-jp-ext Japanese
iso2022_kr csiso2022kr,iso2022kr,iso-2022-kr Korean
latin_1 iso-8859-1,iso8859-1,8859,cp819,latin,latin1,L1 West Europe
iso8859_2 iso-8859-2,latin2,L2 Central and Eastern Europe
iso8859_3 iso-8859-3,latin3,L3 Esperanto,Maltese
iso8859_4 iso-8859-4,latin4,L4 Baltic languages
iso8859_5 iso-8859-5,cyrillic Bulgarian,Byelorussian,Macedonian,Russian,Serbian
iso8859_6 iso-8859-6,arabic Arabic
iso8859_7 iso-8859-7,greek,greek8 Greek
iso8859_8 iso-8859-8,hebrew Hebrew
iso8859_9 iso-8859-9,latin5,L5 Turkish
iso8859_10 iso-8859-10,latin6,L6 Nordic languages
iso8859_13 iso-8859-13,latin7,L7 Baltic languages
iso8859_14 iso-8859-14,latin8,L8 Celtic languages
iso8859_15 iso-8859-15,latin9,L9 Western Europe
iso8859_16 iso-8859-16,latin10,L10 South-Eastern Europe
johab cp1361,ms1361 Korean
koi8_r Russian
koi8_u Ukrainian
mac_cyrillic maccyrillic Bulgarian,Byelorussian,Macedonian,Russian,Serbian
mac_greek macgreek Greek
mac_iceland maciceland Icelandic
mac_latin2 maclatin2,maccentraleurope Central and Eastern Europe
mac_roman macroman Western Europe
mac_turkish macturkish Turkish
ptcp154 csptcp154,pt154,cp154,cyrillic-asian Kazakh
shift_jis csshiftjis,shiftjis,sjis,s_jis Japanese
shift_jis_2004 shiftjis2004,sjis_2004,sjis2004 Japanese
shift_jisx0213 shiftjisx0213,sjisx0213,s_jisx0213 Japanese
utf_32 U32,utf32 all languages
utf_32_be UTF-32BE all languages
utf_32_le UTF-32LE all languages
utf_16 U16,utf16 all languages
utf_16_be UTF-16BE all languages (BMP only)
utf_16_le UTF-16LE all languages (BMP only)
utf_7 U7,unicode-1-1-utf-7 all languages
utf_8 U8,UTF,utf8 all languages
utf_8_sig all languages
"""
import sys
# Text type of the running interpreter: ``unicode`` on Python 2, ``str`` on
# Python 3 (where the name ``unicode`` does not exist).
try:
    _text_type = unicode
except NameError:
    _text_type = str


class decoder():
    """Convert between UTF-8 byte strings, text and raw_unicode_escape bytes.

    The string passed to the constructor is remembered and used by every
    method whenever that method is called without an explicit ``string``.
    """

    def __init__(self, string):
        """Remember ``string`` as the default input for the translate methods.

        The value used to be assigned to a local variable named ``str`` and was
        therefore lost as soon as ``__init__`` returned.
        """
        self._string = string

    def _resolve(self, string):
        """Return ``string``, or the constructor's string when it is None."""
        return self._string if string is None else string

    def translateToUnicode(self, string=None):
        """Return ``string`` as text.

        Text input is returned unchanged after printing a notice; any other
        input is decoded as UTF-8.

        :param string: text or UTF-8 encoded bytes; defaults to the string
            given to the constructor.
        :raises UnicodeDecodeError: if the bytes are not valid UTF-8.
        """
        string = self._resolve(string)
        if isinstance(string, _text_type):
            print('Already the Unicode itself!')
            return string
        # Decode once; the result of a successful decode is always text.
        return string.decode('utf-8')

    def translateIntoRawUnicodeEscape(self, string=None):
        """Return ``string`` encoded with the ``raw_unicode_escape`` codec.

        Non-text input is first converted with :meth:`translateToUnicode`.

        :param string: text or UTF-8 encoded bytes; defaults to the string
            given to the constructor.
        """
        string = self._resolve(string)
        if not isinstance(string, _text_type):
            string = self.translateToUnicode(string=string)
        return string.encode('raw_unicode_escape')

    def translateIntoUtf8Code(self, string=None):
        """Decode ``string`` with the ``raw_unicode_escape`` codec.

        Despite the method name, the result is text, not UTF-8 bytes.

        :param string: raw_unicode_escape bytes, or ASCII-only text holding
            such escapes; defaults to the string given to the constructor.
        :raises UnicodeEncodeError: if text input contains non-ASCII
            characters.
        """
        string = self._resolve(string)
        if isinstance(string, _text_type):
            # Python 2 implicitly ASCII-encodes text before ``.decode``; doing
            # it explicitly keeps that behaviour and also works on Python 3,
            # where text has no ``decode`` method.
            string = string.encode('ascii')
        return string.decode('raw_unicode_escape')
if __name__ == '__main__':
    # Interactive demo: read one string and show its two converted forms.
    # str.title() upper-cases the first letter of every word in the prompt.
    prompt = 'pls. provide the raw string you want to translate:'.title()
    raw_text = raw_input('%s' % prompt)
    converter = decoder(raw_text)
    # Compute both conversions before printing either one.
    as_unicode = converter.translateToUnicode(raw_text)
    as_escaped = converter.translateIntoRawUnicodeEscape(raw_text)
    print(as_unicode)
    print(as_escaped)
"""
为什么会报错“UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)”?本文就来研究一下这个问题。
字符串在Python内部的表示是unicode编码,因此,在做编码转换时,通常需要以unicode作为中间编码,即先将其他编码的字符串解码(decode)成unicode,再从unicode编码(encode)成另一种编码。
decode的作用是将其他编码的字符串转换成unicode编码,如str1.decode('gb2312'),表示将gb2312编码的字符串str1转换成unicode编码。
encode的作用是将unicode编码转换成其他编码的字符串,如str2.encode('gb2312'),表示将unicode编码的字符串str2转换成gb2312编码。
因此,转码的时候一定要先搞明白,字符串str是什么编码,然后decode成unicode,然后再encode成其他编码
"""
"""
代码中字符串的默认编码与代码文件本身的编码一致。
如:s='中文'
如果是在utf8的文件中,该字符串就是utf8编码,如果是在gb2312的文件中,则其编码为gb2312。这种情况下,要进行编码转换,都需要先用decode方法将其转换成unicode编码,再使用encode方法将其转换成其他编码。通常,在没有指定特定的编码方式时,都是使用的系统默认编码创建的代码文件。
如果字符串是这样定义:s=u'中文'
则该字符串的编码就被指定为unicode了,即python的内部编码,而与代码文件本身的编码无关。因此,对于这种情况做编码转换,只需要直接使用encode方法将其转换成指定编码即可。
如果一个字符串已经是unicode了,再进行解码则将出错,因此通常要对其编码方式是否为unicode进行判断:
isinstance(s, unicode) #用来判断是否为unicode
用非unicode编码形式的str来encode会报错
"""
"""
如何获得系统的默认编码?
#!/usr/bin/env python
#coding=utf-8
import sys
print sys.getdefaultencoding()
该段程序在英文WindowsXP上输出为:ascii
在某些IDE中,字符串的输出总是出现乱码,甚至错误,其实是由于IDE的结果输出控制台自身不能显示字符串的编码,而不是程序本身的问题。
如在UliPad中运行如下代码:
s=u"中文"
print s
会提示:UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)。这是因为UliPad在英文WindowsXP上的控制台信息输出窗口是按照ascii编码输出的(英文系统的默认编码是ascii),而上面代码中的字符串是Unicode编码的,所以输出时产生了错误。
将最后一句改为:print s.encode('gb2312')
则能正确输出“中文”两个字。
若最后一句改为:print s.encode('utf8')
则输出:\xe4\xb8\xad\xe6\x96\x87,这是控制台信息输出窗口按照ascii编码输出utf8编码的字符串的结果。
unicode(str,'gb2312')与str.decode('gb2312')是一样的,都是将gb2312编码的str转为unicode编码
使用str.__class__可以查看str的类型(是str还是unicode),但无法由此得知str具体采用的编码
"""
"""
原理说了半天,最后来个包治百病的吧:)
复制代码 代码如下:
#!/usr/bin/env python
#coding=utf-8
s="中文"
if isinstance(s, unicode):
#s=u"中文"
print s.encode('gb2312')
else:
#s="中文"
print s.decode('utf-8').encode('gb2312')
"""