If you work with documents from many different sources, you’ve probably seen this before:
Thatâ€™s good.
“Oh no”, you think, “A utf-8 encoding problem.” That three-letter combo should be a single close-quote, like this:
That’s good.
Sometimes the problem is that your application is reading the file as win-1252 (also known as cp1252, or the kinda-sorta iso-8859-1 used on the web). In this case the solution is easy: instruct your application to reopen the file as utf-8.
But sometimes your file really does say “â€™”, even when decoded as utf-8. This happens when someone takes some utf-8 text, pastes it into a win-1252 document, and then saves the document as utf-8. So now the bytes in your document are:
That[c3][a2][e2][82][ac][e2][84][a2]s good
instead of
That[e2][80][99]s good.
So how do you fix it?
I wrote a tool.
The Python code below uses Python’s codec interface to register a simple stateless decoder that turns these utf8 gremlin bytes back into pure utf8 bytes. You can use it from the command line like this: removeUTF8Gremlins.py infile.txt -o outfile.txt
or you can use it as a library by importing it and then using the CP1252asUTF8gremlins
pseudo-codec anywhere you can use a stateless codec.
#!/usr/bin/env python
# encoding: utf-8

# BSD LICENSE
# Copyright (c) 2010, Dancing Mammoth Inc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
# Neither the name of Dancing Mammoth nor the names of its contributors may
# be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
removeUTF8Gremlins.py

Will recode file with utf8 gremlins to a proper utf8 file.

When used as a library, will register the codec 'CP1252asUTF8gremlins',
which provides a stateless decoder which will convert bytes with gremlins
into pure utf8 bytes.  The codec maps bytes to bytes, so call it through
``codecs.decode(data, 'CP1252asUTF8gremlins')``; that spelling works on both
Python 2 (2.6+) and Python 3.  (On Python 3, ``bytes.decode()`` only accepts
codecs that return ``str`` and therefore rejects this one.)

We call a 'utf8 gremlin' a byte sequence that results when a utf8 byte is read
as a cp1252 encoding into unicode chars, and then written out as utf8.

The tell-tale sign of it is bytes that look like this in a file read as utf8.

Original:               That’s good.
Bytes as utf8:          That[e2][80][99]s good.
When read as CP1252:    Thatâ€™s good.
                        (lowercase a with circumflex, euro symbol,
                        trademark symbol)
Bytes as utf8 gremlins: That[c3][a2][e2][82][ac][e2][84][a2]s good.

This utility turns "Bytes as utf8 gremlins" back into "Bytes as utf8"

Command line use:

    removeUTF8Gremlins.py infile.txt -o outfile.txt

Without -o/--output the repaired bytes are written to standard output.

Created by Francis Avila on 2010-10-27.
Copyright (c) 2010 Dancing Mammoth, Inc. All rights reserved.
"""

import sys
import getopt
import codecs
import re


help_message = '''
Fix a conversion error where a utf8 file got interpreted as a win1252 file and
then saved as utf8, producing three-character multibyte gremlins.
'''

# Module-level caches: the table and the compiled pattern never change, so
# they are built on first use and reused by every later decode call.
_GREMLIN_TABLE = {}
_GREMLIN_RE = None


def _decode_cp1252_lenient(raw):
    """Decode `raw` as cp1252, byte by byte, never failing.

    Python's cp1252 codec leaves bytes 0x81, 0x8d, 0x8f, 0x90 and 0x9d
    undefined.  Each such byte is decoded as iso-8859-1 instead (i.e. to the
    code point with the same number); every other byte is decoded as cp1252.
    """
    chars = []
    for value in bytearray(raw):  # bytearray yields ints on Python 2 and 3
        single = bytes(bytearray([value]))
        try:
            chars.append(single.decode('cp1252'))
        except UnicodeDecodeError:
            chars.append(single.decode('iso-8859-1'))
    return ''.join(chars)


def win1252_to_utf8_gremlin_table(mapping=None):
    """Return a dict mapping utf8-gremlin byte strings to real utf8 bytes.

    For each of the 256 single-byte values, the character it stands for in
    cp1252 is encoded as utf8 (the "real" bytes); those bytes are then
    misread as cp1252 and re-encoded as utf8 (the "gremlin" bytes).

    When the real bytes contain a byte cp1252 leaves undefined, two misread
    variants are recorded: the whole sequence read as iso-8859-1, and a
    per-byte reading in which only the undefined byte falls back to
    iso-8859-1.

    ASCII bytes map to themselves and are included in the table.

    mapping -- optional dict.  A non-empty dict is returned untouched; an
               empty dict is filled in place and returned.  When omitted, a
               shared module-level table is built once and returned on every
               call (callers should treat it as read-only).
    """
    if mapping is None:
        mapping = _GREMLIN_TABLE
    if mapping:
        return mapping

    for value in range(256):
        single = bytes(bytearray([value]))
        realutf8 = _decode_cp1252_lenient(single).encode('utf-8')
        try:
            misreadings = [realutf8.decode('cp1252')]
        except UnicodeDecodeError:
            misreadings = [
                realutf8.decode('iso-8859-1'),
                _decode_cp1252_lenient(realutf8),
            ]
        for misread in misreadings:
            mapping[misread.encode('utf-8')] = realutf8
    return mapping


def win1252_to_utf8_gremlin_re():
    """Return a compiled bytes regex matching any single utf8 gremlin.

    The pattern is an alternation of every table key that differs from its
    replacement (so plain ASCII is never matched).  It is compiled once and
    cached for later calls.
    """
    global _GREMLIN_RE
    if _GREMLIN_RE is None:
        table = win1252_to_utf8_gremlin_table()
        # Sorted only so the generated pattern is reproducible between runs.
        gremlins = sorted(k for k, v in table.items() if k != v)
        pattern = b'(?:' + b'|'.join(re.escape(k) for k in gremlins) + b')'
        _GREMLIN_RE = re.compile(pattern)
    return _GREMLIN_RE


def reverse_win1252_to_utf8_gremlins(bytes, errors='strict'):
    """Stateless decoder: replace utf8 gremlins in `bytes` with real utf8.

    bytes  -- byte string to repair (the name shadows the builtin but is kept
              so existing keyword callers continue to work).
    errors -- accepted because the codec API passes it; it has no effect,
              since every sequence the regex can match has a table entry.

    Returns the ``(output, length_consumed)`` pair the codec API expects:
    the repaired byte string and ``len(bytes)``.  Bytes that are not part of
    a gremlin sequence are copied through unchanged.
    """
    table = win1252_to_utf8_gremlin_table()
    regex = win1252_to_utf8_gremlin_re()
    newbytes = regex.sub(lambda mo: table[mo.group(0)], bytes)
    return (newbytes, len(bytes))


def register_win1252_to_utf8_gremlins(encoding):
    """Codec search function for the 'CP1252asUTF8gremlins' pseudo-codec.

    Returns a CodecInfo with only a decoder (no encoder) when `encoding` is
    the lower-cased codec name, and None for any other name.
    """
    if encoding == 'cp1252asutf8gremlins':
        return codecs.CodecInfo(None, reverse_win1252_to_utf8_gremlins,
                                name='CP1252asUTF8gremlins')
    return None


# Importing this module makes the codec available to codecs.decode().
codecs.register(register_win1252_to_utf8_gremlins)


class Usage(Exception):
    """Command-line usage error; `msg` is the text shown to the user."""

    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg


def main(argv=None):
    """Command-line entry point.

    argv -- argument list including the program name; defaults to sys.argv.

    Reads the first positional argument as the input file, repairs its
    gremlins, and writes the result to the -o/--output file, or to standard
    output when no output file is given.

    Returns 0 on success, or 2 after printing a message to stderr for bad
    options, a missing input file, or -h/--help.
    """
    if argv is None:
        argv = sys.argv
    options = {}
    try:
        try:
            # gnu_getopt lets options follow the input file name, as in
            # "infile.txt -o outfile.txt"; plain getopt stops at "infile.txt".
            opts, args = getopt.gnu_getopt(argv[1:], "ho:v",
                                           ["help", "output="])
        except getopt.error as msg:
            raise Usage(msg)

        # option processing
        for option, value in opts:
            if option == "-v":
                options['verbose'] = True
            if option in ("-h", "--help"):
                raise Usage(help_message)
            if option in ("-o", "--output"):
                options['outputfile'] = value

        if not args:
            raise Usage("no input file given")
    except Usage as err:
        prog = sys.argv[0].split("/")[-1]
        sys.stderr.write("%s: %s\n" % (prog, err.msg))
        sys.stderr.write("\t for help use --help\n")
        return 2

    with open(args[0], 'rb') as infp:
        data = infp.read()

    # Decode before opening the output so a failure cannot truncate it.
    data = codecs.decode(data, 'CP1252asUTF8gremlins')

    if 'outputfile' in options:
        with open(options['outputfile'], 'wb') as outfp:
            outfp.write(data)
    else:
        # Python 3 needs the underlying binary stream to write bytes.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
        out.write(data)
        out.flush()
    return 0


if __name__ == "__main__":
    sys.exit(main())