# htmlcodefix.py (dlc 5/2011)
# Replace HTML entity codes. Rewrites the HTML file in place.
import sys
import os
import re
# Replacement table: search string -> plain-ASCII substitute.
#
# The file header says this script replaces HTML entity codes, so each
# character is listed by its entity name.  The literal characters are kept as
# keys too, so text in which the entities were already decoded is normalised
# the same way.
trans = {
    # left double quotation mark
    "&ldquo;": "\"",
    "\u201c": "\"",
    # no-break space
    "&nbsp;": " ",
    "\u00a0": " ",
    # right double quotation mark
    "&rdquo;": "\"",
    "\u201d": "\"",
    # right single quotation mark (typographic apostrophe)
    "&rsquo;": "'",
    "\u2019": "'",
    # em dash
    "&mdash;": "--",
    "\u2014": "--",
}
def _replace_all(text, table):
    """Return *text* with every key of *table* replaced by its value."""
    for old, new in table.items():
        text = text.replace(old, new)
    return text


def main(argv=None, table=None):
    """Convert one HTML file in place using a search/replace table.

    Args:
        argv: Command line as a list, program name first.  Defaults to
            ``sys.argv``.
        table: Mapping of search string -> replacement string.  Defaults to
            the module-level ``trans`` table.

    Raises:
        SystemExit: with an explanatory message (non-zero exit status) when
            the argument count is wrong, the file name does not end in
            htm/html, or the file does not exist.
    """
    if argv is None:
        argv = sys.argv
    if table is None:
        table = trans

    if len(argv) != 2:
        prog = argv[0] if argv else "htmlcodefix.py"
        # sys.exit(str) prints the message to stderr and exits with status 1.
        sys.exit("Usage: " + prog + " {html file to convert in-place}")

    filename = argv[1]
    # Case-insensitive so that e.g. "PAGE.HTML" is accepted as well.
    if not filename.lower().endswith(("htm", "html")):
        sys.exit(filename + " not html!")
    # isfile() rather than exists(): a directory cannot be converted.
    if not os.path.isfile(filename):
        sys.exit(filename + " not found!")

    print("converting ... " + filename)

    # The file is rewritten in place, so it must round-trip losslessly:
    #  * utf-8 + surrogateescape carries undecodable bytes through unchanged
    #    instead of failing or depending on the locale's default encoding;
    #  * newline="" disables newline translation so existing line endings
    #    are preserved.
    io_options = {
        "encoding": "utf-8",
        "errors": "surrogateescape",
        "newline": "",
    }
    with open(filename, "r", **io_options) as f:
        original = f.read()

    converted = _replace_all(original, table)

    with open(filename, "w", **io_options) as f:
        f.write(converted)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
My tech exploits. Nah... closer to a tech diary and work log, so that if I forget something, but know I forgot it, I'll maybe be able to look it up here. And as a bonus, perhaps someone out there will find it useful or interesting. That is only fair, since nearly all the information put here was gleaned from elsewhere on the web.
Friday, May 20, 2011
Python script to replace HTML entities (&...; codes)
This script converts HTML files in place, replacing HTML character entities with standard characters. I'm using this to clean up HTML entities that are not recognized by the Anyview J2ME (Java ME) MIDlet ebook reader on the LG dLite GD570 phone.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment