feat: handle malformed xml file
parent
9b5ce6d36f
commit
0b58d47acc
|
@ -1,14 +1,15 @@
|
||||||
|
import logging
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
def iter_parse(root: ET.Element) -> Dict[str,str]:
|
|
||||||
"""Recursively parse the XML tree into a dictionary
|
def iter_parse(root: ET.Element) -> Dict[str, str]:
|
||||||
Each key/value pair is inside it's own <div> tag and
|
"""Recursively parse the XML tree into a dictionary. Each key/value pair is
|
||||||
the key inside a <b> tag.
|
inside its own <div> tag and the key inside a <b> tag.
|
||||||
The fields that I believe are compulsory (TCIN, UPC and Origin)
|
The fields that I believe are compulsory (TCIN, UPC and Origin) are only
|
||||||
are only nested one level deep, while the rest of fields seem
|
nested one level deep, while the rest of the fields seem to be always nested
|
||||||
to be always nested two levels deep. But parsing it recursively
|
two levels deep. But parsing it recursively helps generalise both cases."""
|
||||||
helps generalise both cases."""
|
|
||||||
|
|
||||||
spec_dict = {}
|
spec_dict = {}
|
||||||
for child in root:
|
for child in root:
|
||||||
|
@ -22,10 +23,11 @@ def iter_parse(root: ET.Element) -> Dict[str,str]:
|
||||||
spec_dict.update(iter_parse(child))
|
spec_dict.update(iter_parse(child))
|
||||||
return spec_dict
|
return spec_dict
|
||||||
|
|
||||||
def parse_raw_specs(raw_specs: str) -> Dict[str,str]:
|
|
||||||
"""Parse a raw specifications XML string into a dictionary
|
def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
|
||||||
This involves first recursively parsing the XML tree and then
|
"""Parse a raw specifications XML string into a dictionary.
|
||||||
renaming the key values"""
|
This involves first recursively parsing the XML tree and then renaming
|
||||||
|
the keys"""
|
||||||
|
|
||||||
fields_mapping = {
|
fields_mapping = {
|
||||||
"Material": "materials",
|
"Material": "materials",
|
||||||
|
@ -37,7 +39,13 @@ def parse_raw_specs(raw_specs: str) -> Dict[str,str]:
|
||||||
"TCIN": "tcin",
|
"TCIN": "tcin",
|
||||||
"Origin": "origin",
|
"Origin": "origin",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
xml_root = ET.fromstring(raw_specs)
|
xml_root = ET.fromstring(raw_specs)
|
||||||
|
except ET.ParseError:
|
||||||
|
logging.error("error parsing xml string: \n%s", raw_specs)
|
||||||
|
return {}
|
||||||
|
|
||||||
parsed = iter_parse(xml_root)
|
parsed = iter_parse(xml_root)
|
||||||
specs_dict = {
|
specs_dict = {
|
||||||
fields_mapping[key]: value
|
fields_mapping[key]: value
|
||||||
|
|
Loading…
Reference in New Issue