If you work with documents from many different sources, you’ve probably seen this before:
That’s good.
“Oh no”, you think, “A utf-8 encoding problem.” That three-letter combo should be a single close-quote, like this:
That’s good.
Sometimes the problem is simply that your application is reading the file as win-1252 (or cp1252, or the kinda-sorta iso-8859-1 used on the web). In this case the solution is easy: instruct your application to reopen the file as utf-8.
But sometimes your file really does say “’”, even when decoded as utf-8. This happens when someone takes some utf-8 text, pastes it into a win-1252 document, and then saves the document as utf-8. So now the bytes in your document are:
That
[c3][a2][e2][82][ac][e2][84][a2]s good
instead of
That
[e2][80][99]s good.
So how do you fix it?
I wrote a tool.
The Python code below uses Python’s codec interface to register a simple stateless decoder that turns these utf8 gremlin bytes back into pure utf8 bytes. You can use it from the command line like removeUTF8Gremlins.py infile.txt -o outfile.txt, or you can use it as a library by importing it and then using the CP1252asUTF8gremlins pseudo-codec anywhere you can use a stateless codec.
#!/usr/bin/env python
# encoding: utf-8
# BSD LICENSE
# Copyright (c) 2010, Dancing Mammoth Inc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
# Neither the name of Dancing Mammoth nor the names of its contributors may
# be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
removeUTF8Gremlins.py
Will recode file with utf8 gremlins to a proper utf8 file.
When used as a library, will register the codec 'CP1252asUTF8gremlins', which
provides a stateless decoder which will convert bytes with gremlins into pure
utf8 bytes.
We call a 'utf8 gremlin' the byte sequence that results when utf8 bytes are
read as cp1252 into unicode chars, and then written out again as utf8.
The tell-tale sign of it is bytes that look like this in a file read as utf8.
Original: That’s good.
Bytes as utf8: That[e2][80][99]s good.
When read as CP1252: That’s good. (lowercase a with circumflex, euro symbol, trademark symbol)
Bytes as utf8 gremlins: That[c3][a2][e2][82][ac][e2][84][a2]s good.
This utility turns "Bytes as utf8 gremlins" back into "Bytes as utf8"
Created by Francis Avila on 2010-10-27.
Copyright (c) 2010 Dancing Mammoth, Inc. All rights reserved.
"""
import sys
import getopt
import codecs
import re
# Text printed (via the Usage exception) when the user asks for -h/--help.
help_message = (
    "\n"
    "Fix a conversion error where a utf8 file got interpreted as a win1252 file\n"
    "and then saved as utf8, producing three-character multibyte gremlins.\n"
)
def win1252_to_utf8_gremlin_table(mapping={}):
    """Return the table mapping utf8-gremlin byte strings to real utf8 bytes.

    For each of the 256 byte values the function works out which character
    the byte stands for in cp1252, encodes that character as utf8 (the
    "real" bytes), then mis-reads those utf8 bytes as cp1252 and encodes the
    result as utf8 again (the "gremlin" bytes).  The table maps
    gremlin bytes -> real bytes.

    Bytes that Python's cp1252 codec rejects are decoded as iso-8859-1
    instead.  Two gremlin spellings are recorded for every character: one
    where the fallback is applied to the whole utf8 sequence at once, and one
    where it is applied byte by byte, so that a sequence mixing cp1252-defined
    and cp1252-undefined bytes (e.g. the utf8 for U+201D, e2 80 9d) is also
    recognised.  For most characters both spellings are identical.

    mapping -- dict to fill in and return.  The default dict is intentionally
        shared between calls so the table is only built once.  A non-empty
        dict supplied by the caller is returned untouched.
    """
    if mapping:
        return mapping

    def lenient_decode(raw):
        # Fall back to iso-8859-1 when cp1252 cannot decode the bytes.
        try:
            return raw.decode('cp1252')
        except UnicodeDecodeError:
            return raw.decode('iso-8859-1')

    def bytewise_decode(raw):
        # Same fallback, but decided independently for every single byte.
        return u''.join(lenient_decode(raw[pos:pos + 1])
                        for pos in range(len(raw)))

    for value in range(256):
        # bytes(bytearray(...)) gives a 1-byte string on Python 2 and 3 alike.
        single = bytes(bytearray((value,)))
        realutf8 = lenient_decode(single).encode('utf-8')
        for misread in (lenient_decode(realutf8), bytewise_decode(realutf8)):
            mapping[misread.encode('utf-8')] = realutf8
    return mapping
def win1252_to_utf8_gremlin_re():
    """Compile a byte regex that matches any known utf8 gremlin sequence.

    The alternatives are the keys of win1252_to_utf8_gremlin_table() whose
    value differs from the key (entries that map to themselves need no
    fixing and are left out).

    Returns a compiled pattern of the form (?:alt1|alt2|...).
    """
    mapping = win1252_to_utf8_gremlin_table()
    gremlins = [key for key, real in mapping.items() if key != real]
    # Longest alternatives first (ties broken by value) so the pattern is
    # deterministic and a short alternative can never shadow a longer one.
    gremlins.sort(key=lambda key: (-len(key), key))
    # re.escape works on byte strings under both Python 2 and Python 3.
    alternatives = b'|'.join(re.escape(key) for key in gremlins)
    return re.compile(b'(?:' + alternatives + b')')
def reverse_win1252_to_utf8_gremlins(bytes, errors='strict'):
    """Stateless decoder that turns utf8 gremlins back into real utf8 bytes.

    Every match of win1252_to_utf8_gremlin_re() in the input is replaced by
    its entry in win1252_to_utf8_gremlin_table(); everything else is copied
    through unchanged.

    bytes  -- the byte string to repair.
    errors -- what to do with a matched sequence that has no table entry:
        'ignore' drops it, 'replace' substitutes '?', and 'strict' (or any
        other value) raises ValueError.

    Returns a codec-style tuple (repaired bytes, len(input)).
    """
    regex = win1252_to_utf8_gremlin_re()
    mapping = win1252_to_utf8_gremlin_table()
    # Byte literals so the substitution also works on Python 3 bytes input.
    fallbacks = {'ignore': b'', 'replace': b'?'}

    def replace(match):
        gremlin = match.group(0)
        if gremlin in mapping:
            return mapping[gremlin]
        if errors in fallbacks:
            return fallbacks[errors]
        # 'strict' and unrecognised policies both end up here, so an unknown
        # policy can no longer leave the replacement value undefined.
        raise ValueError('Encountered bytes with no pure utf8 equivalent.')

    return (regex.sub(replace, bytes), len(bytes))
def register_win1252_to_utf8_gremlins(encoding):
    """Codec search function for the 'cp1252asutf8gremlins' pseudo-codec.

    Returns a CodecInfo whose decoder is reverse_win1252_to_utf8_gremlins
    (no encoder is provided) when `encoding` is exactly
    'cp1252asutf8gremlins', and None for every other name.
    """
    if encoding != 'cp1252asutf8gremlins':
        return None
    return codecs.CodecInfo(
        None,
        reverse_win1252_to_utf8_gremlins,
        name='CP1252asUTF8gremlins',
    )


# Importing this module registers the search function with the codecs module.
codecs.register(register_win1252_to_utf8_gremlins)
class Usage(Exception):
    """Signals a command-line usage problem; the text is kept in `msg`."""

    def __init__(self, msg):
        super(Usage, self).__init__(msg)
        self.msg = msg
def main(argv=None):
    """Command-line entry point: repair utf8 gremlins in one input file.

    argv -- argument list including the program name; defaults to sys.argv.
        Recognised options are -h/--help, -o FILE/--output=FILE and -v.
        The first positional argument is the input file.

    The input is read as raw bytes, passed through the
    'CP1252asUTF8gremlins' codec and written to the -o file, or to standard
    output when no -o is given.

    Returns 2 after printing a message to stderr for usage errors (bad
    option, --help, or a missing input file), 0 on success.
    """
    if argv is None:
        argv = sys.argv
    options = {}
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "ho:v", ["help", "output="])
        except getopt.error as msg:
            raise Usage(msg)
        # option processing
        for option, value in opts:
            if option == "-v":
                options['verbose'] = True
            if option in ("-h", "--help"):
                raise Usage(help_message)
            if option in ("-o", "--output"):
                options['outputfile'] = value
        if not args:
            # Report a missing input file as a usage error, not an IndexError.
            raise Usage("no input file given")
    except Usage as err:
        sys.stderr.write(sys.argv[0].split("/")[-1] + ": " + str(err.msg) + "\n")
        sys.stderr.write("\t for help use --help\n")
        return 2

    with open(args[0], 'rb') as infp:
        data = infp.read()
    # codecs.decode() accepts bytes-to-bytes codecs on Python 2 and 3 alike.
    # Decoding happens before the output file is opened so that a failure
    # does not leave a truncated output file behind.
    data = codecs.decode(data, 'CP1252asUTF8gremlins')
    if 'outputfile' in options:
        with open(options['outputfile'], 'wb') as outfp:
            outfp.write(data)
    else:
        # On Python 3 raw bytes must go to the underlying binary buffer.
        # stdout is flushed but deliberately not closed.
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
        stdout.write(data)
        stdout.flush()
    return 0
if __name__ == "__main__":
    # When run as a script, use main()'s return value as the exit status.
    raise SystemExit(main())


