If you work with documents from many different sources, you’ve probably seen this before:
Thatâ€™s good.
“Oh no”, you think, “A utf-8 encoding problem.” That three-letter combo should be a single close-quote, like this:
That’s good.
Sometimes the problem is that your application is reading the file as a win-1252 (or cp1252, or the kinda-sorta iso-8859-1 used on the web). In this case the solution is easy: instruct your application to reopen the file as utf8.
But sometimes, your file really does say “â€™”, even when decoded as utf-8. This happens when someone took some utf8 text, pasted it into a win1252 document, and then saved the document as utf8. So now the bytes in your document are:
That
[c3][a2][e2][82][ac][e2][84][a2]
s good
instead of
That
[e2][80][99]
s good.
So how do you fix it?
I wrote a tool.
The Python code below uses Python’s codec interface to register a simple stateless decoder that turns these utf8 gremlin bytes back into pure utf8 bytes. You can use it from the command line like removeUTF8Gremlins.py -o outfile.txt infile.txt
or you can use it as a library by importing it and then using the CP1252asUTF8gremlins
pseudo-codec anywhere you can use a stateless codec.
1 | #!/usr/bin/env python |
2 | # encoding: utf-8 |
3 |
4 | # BSD LICENSE |
5 | # Copyright (c) 2010, Dancing Mammoth Inc |
6 | # All rights reserved. |
7 | # |
8 | # Redistribution and use in source and binary forms, with or without |
9 | # modification, are permitted provided that the following conditions are met: |
10 | # |
11 | # Redistributions of source code must retain the above copyright notice, this |
12 | # list of conditions and the following disclaimer. |
13 | # |
14 | # Redistributions in binary form must reproduce the above copyright notice, this |
15 | # list of conditions and the following disclaimer in the documentation and/or |
16 | # other materials provided with the distribution. |
17 | # |
18 | # Neither the name of Dancing Mammoth nor the names of its contributors may |
19 | # be used to endorse or promote products derived from this software without |
20 | # specific prior written permission. |
21 | # |
22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
23 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
24 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
25 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
26 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
27 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
28 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
29 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
30 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
31 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 |
33 | """ |
34 | removeUTF8Gremlins.py |
35 |
36 | Will recode file with utf8 gremlins to a proper utf8 file. |
37 |
38 | When used as a library, will register the codec 'CP1252asUTF8gremlins', which |
39 | provides a stateless decoder which will convert bytes with gremlins into pure |
40 | utf8 bytes. |
41 |
42 | We call a 'utf8 gremlin' a byte sequence that results when a utf8 byte is read |
43 | as a cp1252 encoding into unicode chars, and then written out as utf8. |
44 |
45 | The tell-tale sign of it is bytes that look like this in a file read as utf8. |
46 |
47 | Original: That’s good. |
48 | Bytes as utf8: That[e2][80][99]s good. |
49 | When read as CP1252: Thatâ€™s good. (lowercase a with circumflex, euro symbol, trademark symbol) |
50 | Bytes as utf8 gremlins: That[c3][a2][e2][82][ac][e2][84][a2]s good. |
51 |
52 | This utility turns "Bytes as utf8 gremlins" back into "Bytes as utf8" |
53 |
54 | Created by Francis Avila on 2010-10-27. |
55 | Copyright (c) 2010 Dancing Mammoth, Inc. All rights reserved. |
56 | """ |
57 |
58 | import sys |
59 | import getopt |
60 | import codecs |
61 | import re |
62 |
# Text shown when -h/--help is given.
help_message = '''
Fix a conversion error where a utf8 file got interpreted as a win1252 file
and then saved as utf8, producing three-character multibyte gremlins.
'''
67 |
# Shared memo used by win1252_to_utf8_gremlin_table() when the caller does
# not supply its own dict; filled on first use.
_GREMLIN_TABLE_CACHE = {}


def win1252_to_utf8_gremlin_table(mapping=None):
    """Return a dict mapping utf8-gremlin byte strings to real utf8 bytes.

    For each of the 256 single-byte values, the character it denotes in
    cp1252 is encoded as utf8 ("real" bytes).  Those real bytes are then
    misread one byte at a time as cp1252 and re-encoded as utf8, giving the
    "gremlin" byte sequence.  The table maps gremlin -> real.

    Bytes that Python's cp1252 codec leaves undefined (for example 0x9d)
    fall back to iso-8859-1 *per byte*.  Falling back for the whole
    sequence would also reinterpret neighbouring bytes such as 0x80 (the
    euro sign in cp1252), yielding a gremlin that never occurs in practice
    for characters like the right double quotation mark (e2 80 9d).

    mapping: optional dict used as the cache.  A non-empty dict is returned
        untouched; an empty dict is filled in place and returned.  When
        omitted, a module-level dict shared by all calls is used.
    """
    if mapping is None:
        mapping = _GREMLIN_TABLE_CACHE
    if mapping:
        return mapping

    def as_windows_char(value):
        # bytes(bytearray([n])) builds a one-byte string on Python 2 and 3.
        raw = bytes(bytearray([value]))
        try:
            return raw.decode('cp1252')
        except UnicodeDecodeError:
            return raw.decode('iso-8859-1')

    chars = [as_windows_char(value) for value in range(256)]
    for char in chars:
        realutf8 = char.encode('utf-8')
        # Iterating a bytearray yields ints on both Python 2 and 3.
        misread = ''.join(chars[b] for b in bytearray(realutf8))
        mapping[misread.encode('utf-8')] = realutf8
    return mapping
90 |
def win1252_to_utf8_gremlin_re():
    """Return a compiled regex matching any utf8-gremlin byte sequence.

    The alternatives are the keys of win1252_to_utf8_gremlin_table() whose
    value differs from the key, i.e. the sequences that actually need
    rewriting (plain ASCII maps to itself and is left out).
    """
    mapping = win1252_to_utf8_gremlin_table()
    gremlins = [k for k, v in mapping.items() if k != v]
    # Longest first (ties broken by byte value) so the pattern does not
    # depend on dict ordering and a short alternative can never pre-empt a
    # longer one.
    gremlins.sort(key=lambda g: (-len(g), g))
    # re.escape handles any byte safely; the pattern is built from bytes so
    # it works on byte strings under both Python 2 and Python 3.
    pattern = b'(?:' + b'|'.join(re.escape(g) for g in gremlins) + b')'
    return re.compile(pattern)
99 |
def reverse_win1252_to_utf8_gremlins(bytes, errors='strict'):
    """Stateless decoder: replace utf8 gremlins in a byte string.

    bytes: the byte string to clean up (the name shadows the builtin but is
        kept because it is part of the existing signature).
    errors: 'ignore' drops and 'replace' substitutes b'?' for a matched
        sequence that has no table entry; any other value is treated as
        'strict' and raises ValueError.

    Returns the (output, length_consumed) tuple expected of a codec decode
    function: the cleaned byte string and len(bytes).
    """
    regex = win1252_to_utf8_gremlin_re()
    mapping = win1252_to_utf8_gremlin_table()

    def replace(mo):
        try:
            return mapping[mo.group(0)]
        except KeyError:
            # Defensive: the regex alternatives come from this same table,
            # so a miss is not expected.
            if errors == 'ignore':
                return b''
            if errors == 'replace':
                return b'?'
            raise ValueError('Encountered bytes with no pure utf8 equivalent.')

    newbytes = regex.sub(replace, bytes)
    return (newbytes, len(bytes))
117 |
def register_win1252_to_utf8_gremlins(encoding):
    """Codec search function for the 'CP1252asUTF8gremlins' pseudo-codec.

    encoding: the codec name being looked up, compared against the
        lower-case spelling 'cp1252asutf8gremlins'.

    Returns a codecs.CodecInfo that only provides a decoder (there is no
    encoder), or None for any other name so other search functions get a
    chance.
    """
    if encoding != 'cp1252asutf8gremlins':
        return None
    return codecs.CodecInfo(
        encode=None,
        decode=reverse_win1252_to_utf8_gremlins,
        name='CP1252asUTF8gremlins',
    )


# Importing this module is enough to make the codec available.
codecs.register(register_win1252_to_utf8_gremlins)
125 |
class Usage(Exception):
    """Raised for command-line usage problems; the text is kept in .msg."""

    def __init__(self, msg):
        # Hand the message to Exception as well so str(err) and tracebacks
        # show it instead of an empty string.
        super(Usage, self).__init__(msg)
        self.msg = msg
129 |
def main(argv=None):
    """Command-line entry point: rewrite a file without utf8 gremlins.

    argv: full argument vector including the program name; defaults to
        sys.argv.  Recognised options are -h/--help, -o/--output FILE and
        -v (stored but not otherwise used).  The first positional argument
        is the input file.  Without -o the result goes to standard output.

    Returns 2 after printing a message to stderr for a usage error (which
    includes --help and a missing input file); returns None on success.
    """
    if argv is None:
        argv = sys.argv
    options = {}
    try:
        try:
            # gnu_getopt accepts options after the input file as well, so
            # both "-o out in" and "in -o out" work.
            opts, args = getopt.gnu_getopt(
                argv[1:], "ho:v", ["help", "output="])
        except getopt.error as msg:
            raise Usage(msg)

        # option processing
        for option, value in opts:
            if option == "-v":
                options['verbose'] = True
            if option in ("-h", "--help"):
                raise Usage(help_message)
            if option in ("-o", "--output"):
                options['outputfile'] = value

        if not args:
            raise Usage('no input file given')

    except Usage as err:
        sys.stderr.write(
            sys.argv[0].split("/")[-1] + ": " + str(err.msg) + "\n")
        sys.stderr.write("\t for help use --help\n")
        return 2

    with open(args[0], 'rb') as infp:
        data = infp.read()

    # codecs.decode() is used because the codec maps bytes to bytes, which
    # bytes.decode() rejects on Python 3.  Decoding happens before the
    # output file is opened so a failure does not truncate it.
    data = codecs.decode(data, 'CP1252asUTF8gremlins')

    if 'outputfile' in options:
        with open(options['outputfile'], 'wb') as outfp:
            outfp.write(data)
    else:
        # Python 3 exposes the binary stream as sys.stdout.buffer; Python 2
        # writes byte strings to sys.stdout directly.  stdout is flushed,
        # not closed, since this function does not own it.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
        out.write(data)
        out.flush()
159 |
# Run as a script: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())