¿Cómo usar el analizador SAX de XML para leer y escribir un XML grande?

Estoy intentando eliminar todos los nodos project1 (junto con sus elementos secundarios) del documento XML de muestra siguiente (el documento original es de aproximadamente 30 GB) usando el analizador SAX. Me sirve tanto generar un archivo modificado aparte como editar el archivo original in situ.

sample.xml

  Hi This is old data  ty    

Aquí está mi intento:

parser.py

 # Asker's original attempt, kept token-for-token and only re-flowed and
 # annotated.  It streams sample.xml through SAX and tries to write every
 # element except <project1> to out.xml.
 from xml.sax.handler import ContentHandler
 import xml.sax


 class MyHandler(xml.sax.handler.ContentHandler):
     """SAX content handler that re-emits the parsed document to a file.

     Elements named ``project1`` are not written.  See the NOTE(review)
     comments below for the reasons the output is not the intended XML.
     """

     def __init__(self, out_file):
         # Text chunks received since the last flush in _getCharacterData().
         self._charBuffer = []
         # One dict is appended per non-project1 start tag and never removed,
         # so this list grows with the number of elements in the input.
         self._result = []
         # NOTE(review): the file is opened here and never closed anywhere in
         # this snippet.
         self._out = open(out_file, 'w')

     def _createElement(self, name, attrs):
         """Return the text to write for a start tag."""
         attributes = attrs.items()
         if attributes:
             out = ''
             for key, value in attributes:
                 # Attribute values are written without quotes or escaping.
                 out += ' {}={}'.format(key, value)
             # NOTE(review): the template is an empty string, so this returns
             # '' and ignores both arguments.  Presumably the original literal
             # was '<{}{}>' and its angle-bracket content was lost -- confirm.
             return ''.format(name, out)
         # NOTE(review): same empty template here; presumably '<{}>' -- confirm.
         return ''.format(name)

     def _getCharacterData(self):
         """Write the buffered text to the output and reset the buffer.

         There is no return statement, so the caller receives None.
         """
         data = ''.join(self._charBuffer).strip()
         self._charBuffer = []
         self._out.write(data.strip())  # remove strip() if whitespace is important

     def parse(self, f):
         """Parse ``f`` with this object as the content handler."""
         xml.sax.parse(f, self)

     def characters(self, data):
         # Text is buffered unconditionally, including text that sits inside a
         # project1 element; it is written out by the next flush.
         self._charBuffer.append(data)

     def startElement(self, name, attrs):
         # Only the element literally named 'project1' is skipped.  Its child
         # elements have other names, so they are still written.
         if not name == 'project1':
             self._result.append({})
             self._out.write(self._createElement(name, attrs))

     def endElement(self, name):
         # NOTE(review): no closing tag is ever written, only the buffered
         # text.  The value stored in _result is None because
         # _getCharacterData() returns nothing.
         if not name == 'project1':
             self._result[-1][name] = self._getCharacterData()


 # Runs the conversion as soon as the script is executed.
 MyHandler('out.xml').parse("sample.xml")

No puedo hacerlo funcionar.

Podría usar una implementación de xml.sax.saxutils.XMLFilterBase para filtrar los nodos project1.

En lugar de ensamblar las cadenas XML usted mismo, podría usar xml.sax.saxutils.XMLGenerator.

El siguiente código es para Python 3; ajuste las llamadas a super() si necesita Python 2.

 from xml.sax import make_parser
 from xml.sax.saxutils import XMLFilterBase, XMLGenerator


 class Project1Filter(XMLFilterBase):
     """SAX filter that drops selected elements together with their content.

     Events are forwarded to the downstream ContentHandler only while the
     parser is outside every element whose name is in
     ``tag_names_to_exclude``.  The document is streamed, so memory use does
     not depend on the size of the input.

     Only the non-namespace events are filtered.  If the parser is switched
     to namespace mode, ``startElementNS``/``endElementNS`` are forwarded
     unchanged by ``XMLFilterBase`` and must be overridden as well.

     Args:
         tag_names_to_exclude: container of element names to remove
             (a set gives constant-time lookups).
         parent: the XMLReader that produces the events, e.g.
             ``make_parser()``.
     """

     def __init__(self, tag_names_to_exclude, parent=None):
         super().__init__(parent)
         self._tag_names_to_exclude = tag_names_to_exclude
         # Number of excluded elements currently open.  A counter rather than
         # a flag so that an excluded element nested in another one is handled.
         self._excluded_depth = 0

     def _forward_events(self):
         """Return True when the parser is outside every excluded element."""
         return self._excluded_depth == 0

     def startElement(self, name, attrs):
         # Count first: the start tag of an excluded element is itself dropped.
         if name in self._tag_names_to_exclude:
             self._excluded_depth += 1
         if self._forward_events():
             super().startElement(name, attrs)

     def endElement(self, name):
         # Forward first: the end tag of an excluded element is itself dropped
         # because the counter is still positive at this point.
         if self._forward_events():
             super().endElement(name)
         if name in self._tag_names_to_exclude:
             self._excluded_depth -= 1

     def characters(self, content):
         if self._forward_events():
             super().characters(content)

     def ignorableWhitespace(self, chars):
         if self._forward_events():
             super().ignorableWhitespace(chars)

     def processingInstruction(self, target, data):
         if self._forward_events():
             super().processingInstruction(target, data)

     def skippedEntity(self, name):
         if self._forward_events():
             super().skippedEntity(name)


 def main(input_path='input.xml', output_path='out-small.xml',
          tag_names_to_exclude=('project1', 'project2', 'project3')):
     """Copy ``input_path`` to ``output_path`` without the excluded elements.

     Args:
         input_path: path of the XML document to read.
         output_path: path of the filtered document to write.
         tag_names_to_exclude: names of the elements to remove.
     """
     reader = Project1Filter(set(tag_names_to_exclude), make_parser())
     # XMLGenerator writes to a text stream as-is and only uses ``encoding``
     # for the XML declaration, so the file encoding must be set to match.
     with open(output_path, 'w', encoding='utf-8') as f:
         handler = XMLGenerator(f, encoding='utf-8')
         reader.setContentHandler(handler)
         reader.parse(input_path)


 if __name__ == "__main__":
     main()