feat: handle malformed xml file

main
Ricard Illa 2023-06-22 15:34:38 +02:00
parent 9b5ce6d36f
commit 0b58d47acc
1 changed files with 21 additions and 13 deletions

View File

@@ -1,14 +1,15 @@
import logging
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from typing import Dict from typing import Dict
def iter_parse(root: ET.Element) -> Dict[str,str]:
"""Recursively parse the XML tree into a dictionary def iter_parse(root: ET.Element) -> Dict[str, str]:
Each key/value pair is inside it's own <div> tag and """Recursively parse the XML tree into a dictionary Each key/value pair is
the key inside a <b> tag. inside its own <div> tag and the key inside a <b> tag.
The fields that I believe are compulsory (TCIN, UPC and Origin) The fields that I believe are compulsory (TCIN, UPC and Origin) are only
are only nested one level deep, while the rest of fields seem nested one level deep, while the rest of fields seem to be always nested
to be always nested two levels deep. But parsing it recursively two levels deep. But parsing it recursively helps generalise both cases."""
helps generalise both cases."""
spec_dict = {} spec_dict = {}
for child in root: for child in root:
@@ -22,10 +23,11 @@ def iter_parse(root: ET.Element) -> Dict[str,str]:
spec_dict.update(iter_parse(child)) spec_dict.update(iter_parse(child))
return spec_dict return spec_dict
def parse_raw_specs(raw_specs: str) -> Dict[str,str]:
"""Parse a raw specifications XML string into a dictionary def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
This involves first recursively parsing the XML tree and then """Parse a raw specifications XML string into a dictionary.
renaming the key values""" This involves first recursively parsing the XML tree and then renaming
the key values"""
fields_mapping = { fields_mapping = {
"Material": "materials", "Material": "materials",
@@ -37,7 +39,13 @@ def parse_raw_specs(raw_specs: str) -> Dict[str,str]:
"TCIN": "tcin", "TCIN": "tcin",
"Origin": "origin", "Origin": "origin",
} }
xml_root = ET.fromstring(raw_specs)
try:
xml_root = ET.fromstring(raw_specs)
except ET.ParseError:
logging.error("error parsing xml string: \n%s", raw_specs)
return {}
parsed = iter_parse(xml_root) parsed = iter_parse(xml_root)
specs_dict = { specs_dict = {
fields_mapping[key]: value fields_mapping[key]: value