Friday, May 20, 2011

Python script to replace HTML entities (&name; codes)

This script converts HTML files in place, replacing HTML character entities with standard characters. I'm using it to clean up HTML entities that are not recognized by the Anyview J2ME (Java ME) MIDlet ebook reader on the LG dLite GD570 phone.
# htmlcodefix.py (dlc 5/2011)
# Replaces HTML entity codes and writes the result back to the same HTML file (in place).
import sys
import os
import re

# Replacement table: text to search for -> plain-ASCII substitute.
#
# Each typographic character is listed twice: once as its named HTML entity
# (the form the script is meant to clean up) and once as the literal
# character, so a file containing either spelling is normalized.
trans = {
    # Named HTML entities.
    "&ldquo;": "\"",
    "&rdquo;": "\"",
    "&rsquo;": "'",
    "&mdash;": "--",
    "&nbsp;": " ",
    # The same characters when they appear literally in the file.
    "\u201c": "\"",  # left double quotation mark
    "\u201d": "\"",  # right double quotation mark
    "\u2019": "'",   # right single quotation mark
    "\u2014": "--",  # em dash
    "\u00a0": " ",   # no-break space
}

def main():
    """Rewrite one HTML file in place, applying every entry of ``trans``.

    Reads the target path from ``sys.argv[1]``. The file must have an
    ``.htm`` or ``.html`` extension (any letter case) and must be an
    existing regular file. Each key of the module-level ``trans`` table is
    replaced by its value, the result is echoed to stdout, and the file is
    overwritten with the converted text.

    Exits with status 1 (after printing a message) when the argument count
    is wrong, the extension is not HTML, or the file does not exist.
    """
    if len(sys.argv) != 2:
        print("Usage: " + sys.argv[0] + " {html file to convert in-place}")
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive prompt and is not guaranteed to exist.
        sys.exit(1)

    filename = sys.argv[1]
    # Require the dot so names that merely end in "htm" are rejected, and
    # compare case-insensitively so "PAGE.HTML" is accepted.
    if not filename.lower().endswith((".htm", ".html")):
        print(filename + " not html!")
        sys.exit(1)
    # isfile (not exists): a directory named "x.html" must not get through.
    if not os.path.isfile(filename):
        print(filename + " not found!")
        sys.exit(1)

    print("converting ... " + filename)
    # newline='' disables newline translation so the file's original line
    # endings survive the read/write round trip unchanged.
    with open(filename, 'r', newline='') as f:
        read_data = f.read()

    for srch, repl in trans.items():
        print(srch, repl)
        read_data = read_data.replace(srch, repl)

    print(read_data)

    with open(filename, mode='w', newline='') as f:
        f.write(read_data)



# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

No comments:

Post a Comment