不過如果檔案不是 ascii 就....
所以還是要花點功
首先是自己土法煉鋼靠 BOM 識別法
def test_file_encoding(file_path):
file_encoding = sys.getfilesystemencoding()
bom_len = 0
with open(file_path, 'r') as f:
head = f.read(5)
if head[:len(codecs.BOM_UTF16_LE)] == codecs.BOM_UTF16_LE:
file_encoding = 'utf-16-le'
bom_len = 1
elif head[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
file_encoding = 'utf-16-be'
bom_len = 1
elif head[:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8:
file_encoding = 'utf-8'
bom_len = 1
return (file_encoding, bom_len)
def parse_file(file_path):
    """Open *file_path* with its detected encoding and iterate over its lines.

    Relies on the sibling ``test_file_encoding`` helper. The initial
    ``f.read(bom_len)`` consumes the decoded BOM character (if any), so
    the first line seen by the loop is clean content.
    """
    (file_encoding, bom_len) = test_file_encoding(file_path)
    with codecs.open(file_path, mode='r', encoding=file_encoding) as f:
        f.read(bom_len)  # skip the BOM character, if one was detected
        for line in f:  # each line is decoded text (unicode)
            # do the job
            pass  # NOTE(review): loop body was elided in the original;
                  # a comment alone is a SyntaxError, so `pass` keeps it valid
這樣大部分情況都 ok,不過遇到沒有 BOM、編碼又不是系統預設 codepage 的檔案就無法正確判斷了
所以還是引進這個有趣的 lib: Universal Encoding Detector
改寫成這樣:
def test_file_encoding(file_path):
    """Guess a file's encoding with chardet, with a platform-default fallback.

    Args:
        file_path: path of the file to probe.

    Returns:
        ``(encoding, bom_len)`` — the detected codec name and the number of
        decoded characters to skip before real content (1 when a BOM is
        present, otherwise 0). Falls back to the filesystem encoding when
        the file is empty or chardet cannot identify the encoding.
    """
    # chardet needs the raw bytes: text mode would decode/translate the
    # data before detection (and hands chardet a str on Python 3).
    with open(file_path, 'rb') as f:
        line = f.readline()
    if line:
        encoding = chardet.detect(line)['encoding']
        if encoding:
            # A BOM decodes to one U+FEFF character; flag it for skipping.
            bom_len = 0
            if line.startswith((codecs.BOM_UTF8,
                                codecs.BOM_UTF16_LE,
                                codecs.BOM_UTF16_BE)):
                bom_len = 1
            return (encoding, bom_len)
    # Empty file or undetectable encoding: this fallback was unreachable
    # dead code in the original (placed after an unconditional return).
    return (sys.getfilesystemencoding(), 0)
沒有留言:
張貼留言