I downloaded a US Census area file in KML format. You can download the file here. I am trying to extract each area name and its boundary coordinates. For some reason, some of the coordinate fields are truncated and not read correctly. For example, the coordinates for "Bloomsburg-Berwick-Sunbury, PA" appear in the KML file as
<coordinates>-77.36418,40.846937,0.0 -77.357113,40.844484,0.0 -77.356628,40.807334,0.0 -77.354097,40.701667,0.0 -77.287941,40.693595,0.0 -77.150516,40.677074,0.0 -77.109453,40.691552,0.0 -77.093607,40.676121,0.0 -77.060451,40.679854,0.0 -77.035549,40.676918,0.0 -77.034409,40.659928,0.0 -77.008418,40.659912,0.0 -76.996995,40.635778,0.0 -76.965528,40.647149,0.0 -76.944828,40.650209,0.0 -76.939883,40.638142,0.0 -76.949148,40.628167,0.0 -76.918672,40.603466,0.0 -76.886411,40.617758,0.0 -76.864254,40.627585,0.0 -76.840104,40.625439,0.0 -76.810269,40.634526,0.0 -76.810044,40.640102,0.0 -76.804867,40.646839,0.0 -76.793851,40.640514,0.0 -76.745894,40.654464,0.0 -76.701624,40.658082,0.0 -76.700546,40.663114,0.0 -76.662137,40.674013,0.0 -76.562175,40.709007,0.0 -76.469523,40.743188,0.0 -76.380334,40.775445,0.0 -76.30717,40.801809,0.0 -76.2991,40.831191,0.0 -76.284611,40.883588,0.0 -76.207827,40.94974,0.0 -76.231194,41.050168,0.0 -76.228975,41.138466,0.0 -76.277639,41.131804,0.0 -76.317953,41.205453,0.0 -76.319957,41.211255,0.0 -76.310261,41.310198,0.0 -76.407934,41.308418,0.0 -76.447597,41.275629,0.0 -76.592607,41.157765,0.0 -76.640767,41.155718,0.0 -76.678776,41.154172,0.0 -76.732672,41.17204,0.0 -76.790807,41.175732,0.0 -76.828168,41.16578,0.0 -76.880963,41.158044,0.0 -76.884245,41.157099,0.0 -76.885228,41.155973,0.0 -76.888145,41.153807,0.0 -76.889338,41.151988,0.0 -76.889669,41.150791,0.0 -76.896114,41.13907,0.0 -76.960229,41.148801,0.0 -76.977939,41.087883,0.0 -77.058088,41.085575,0.0 -77.113839,41.069032,0.0 -77.144111,41.06884,0.0 -77.14416,41.044338,0.0 -77.204027,40.99271,0.0 -77.279236,40.90971,0.0 -77.36418,40.846937,0.0</coordinates>
But the value I get back is truncated at character 297 out of 1664. This happens seemingly at random for other areas as well. Size doesn't seem to be the issue.
['-77.36418,40.846937,0.0 -77.357113,40.844484,0.0 -77.356628,40.807334,0.0 -77.354097,40.701667,0.0 -77.287941,40.693595,0.0 -77.150516,40.677074,0.0 -77.109453,40.691552,0.0 -77.093607,40.676121,0.0 -77.060451,40.679854,0.0 -77.035549,40.676918,0.0 -77.034409,40.659928,0.0 -77.00841']
I tried on two different ec2 machines so I don't think it's a memory/hardware issue. Any idea what is going on?
from xml.sax.handler import ContentHandler
from xml.sax import parse
class KMLHandler(ContentHandler):
    """SAX content handler that records place names and coordinate text.

    For every ``Placemark`` element it stores the text seen after a
    ``SimpleData`` element whose ``name`` attribute is ``"NAME"`` and the
    text seen after a ``coordinates`` element.  Results are exposed through
    ``place_names``, ``coordinates`` and ``mapping_dict`` (name -> list of
    coordinate strings).

    Each capture flag is cleared as soon as one non-blank ``characters``
    event has been consumed, so only the first non-blank chunk delivered
    for an element is recorded.
    """

    def __init__(self):
        super().__init__()
        # Accumulated results.
        self.place_names = []
        self.coordinates = []
        self.mapping_dict = {}
        # State of the Placemark currently being read.
        self.current_name = None
        self.temp_coordinates = []
        self.start_placemark = False
        # One-shot flags consumed by characters().
        self.capture_place_name = False
        self.capture_cordinates = False

    def startElement(self, name, attrs):
        """Arm the matching capture flag, or reset state on a new Placemark."""
        if name == 'Placemark':
            self.first_placemark = self.start_placemark = True
            self.temp_coordinates = []
            self.current_name = None
        elif name == "SimpleData" and attrs['name'] == "NAME":
            self.capture_place_name = True
        elif name == "coordinates":
            self.capture_cordinates = True

    def endElement(self, name):
        """File the finished Placemark's coordinates under its name."""
        if name != "Placemark":
            return
        self.start_placemark = False
        finished = self.temp_coordinates
        self.coordinates.append(finished)
        self.mapping_dict[self.current_name] = finished

    def characters(self, content):
        """Consume one non-blank text chunk for whichever flag is armed."""
        if not content.strip():
            return
        if self.capture_place_name:
            self.capture_place_name = False
            self.place_names.append(content)
            self.current_name = content
        if self.capture_cordinates:
            self.capture_cordinates = False
            # Longitude/latitude pairs; computed but not used afterwards.
            lon_lat_pairs = [tuple_text.split(',')[0:2]
                             for tuple_text in content.split(' ')]
            self.temp_coordinates.append(content)
# Path of the KML file to parse.
fname='./cb_2020_us_csa_5m.kml'
# fname='./test_small2.kml'
# Run the SAX parser over the file, delivering its events to a KMLHandler.
handler = KMLHandler()
parse(fname, handler)
As indicated in the comments, each `characters` event returns a chunk, which may or may not be the entire tag contents. It's similar to reading from a network; you might not get everything at once.
I reworked your code below, and it seems to report the right answer for Berwick. On my machine, the first chunk is 283 characters and the 2nd chunk is 1353 characters. 283 + 1353 = 1636, which matches the size of the data in the file.
Instead of a set of Booleans, I think it's simpler to capture the tag name, and then test for that when you're processing `characters`. There's only one controlling value, and it's set & reset in one place.
I didn't see a need for temp_coordinates. It wasn't clear to me whether you want coordinates to be a list or what, exactly, so I just grab the string.
from xml.sax import parse
from xml.sax.handler import ContentHandler
class KMLHandler(ContentHandler):
    """Collect place names and boundary coordinates from a KML document.

    A SAX parser may deliver the text of one element through several
    ``characters`` events, so text is buffered while an element of interest
    is open and only interpreted when its end tag arrives.

    Attributes:
        place_names: every captured ``SimpleData name="NAME"`` value, in
            document order.
        current_name: name captured for the Placemark being read, or
            ``None`` if none has been seen yet.
        coordinates: coordinate strings collected for the Placemark being
            read, one complete string per ``<coordinates>`` element.
        mapping_dict: place name -> list of coordinate strings.
        capture: tag name of the element whose text is currently being
            buffered, or ``''`` when nothing is being captured.
    """

    def __init__(self):
        super().__init__()
        self.place_names = []
        self.current_name = None
        self.coordinates = []
        self.start_placemark = False
        self.first_placemark = False
        # Not used by this handler; kept so existing readers of the
        # attribute keep working.
        self.capture_place_name = False
        self.mapping_dict = {}
        self.capture = ''
        # Text chunks received for the element named by self.capture.
        self._chunks = []

    def startElement(self, name, attrs):
        """Start buffering text if *name* is an element of interest."""
        # Any new start tag ends whatever capture was in progress.
        self.capture = ''
        self._chunks = []
        if name == 'Placemark':
            self.first_placemark = True
            self.start_placemark = True
            self.current_name = None
            self.coordinates = []
        elif name == 'SimpleData':
            # .get() so a SimpleData without a name attribute is skipped
            # instead of raising KeyError.
            if attrs.get('name') == 'NAME':
                self.capture = name
        elif name == 'coordinates':
            self.capture = name

    def endElement(self, name):
        """Finalize buffered text and, on Placemark, store the mapping."""
        if self.capture and name == self.capture:
            self._finish_capture(name)
        if name == 'Placemark':
            self.start_placemark = False
            self.mapping_dict[self.current_name] = self.coordinates
            # Rebind rather than clear: mapping_dict keeps the old list.
            self.coordinates = []

    def characters(self, content):
        """Buffer one chunk of text for the element being captured."""
        # Whitespace-only chunks are kept too: inside <coordinates> they
        # may be the separator between two tuples.
        if self.capture:
            self._chunks.append(content)

    def _finish_capture(self, name):
        """Join the buffered chunks for *name* and record the result."""
        text = ''.join(self._chunks).strip()
        self.capture = ''
        self._chunks = []
        if not text:
            return
        if name == 'SimpleData':
            self.place_names.append(text)
            self.current_name = text
        else:
            self.coordinates.append(text)
            # The number reported is the character length of the
            # complete coordinate string.
            print( '%d coordinates for %s: {%s}' % (len(text),
                                                    self.current_name,
                                                    self.coordinates) )
# Path of the KML file to parse.
fname='./cb_2020_us_csa_5m.kml'
# fname='./test_small2.kml'
# Run the SAX parser over the file, delivering its events to a KMLHandler.
handler = KMLHandler()
parse(fname, handler)