feat: added parse_raw_specification
parent
5a4bca756e
commit
725114805a
|
@ -0,0 +1,47 @@
|
|||
import xml.etree.ElementTree as ET
|
||||
from typing import Dict
|
||||
|
||||
def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively parse the XML tree into a dictionary.

    Each key/value pair is inside its own <div> tag, with the key inside
    a <b> tag. The fields that appear to be compulsory (TCIN, UPC and
    Origin) are only nested one level deep, while the rest of the fields
    seem to be always nested two levels deep. Parsing recursively
    generalises both cases.

    Args:
        root: Element whose child <div> elements are scanned.

    Returns:
        A mapping from each key (surrounding colons removed) to the
        remaining text of its <div> (surrounding colons removed; other
        whitespace is left untouched). Later duplicates overwrite earlier
        ones.
    """
    spec_dict: Dict[str, str] = {}
    for child in root:
        # Only <div> elements carry (or wrap) key/value pairs.
        if child.tag != "div":
            continue

        # A <div> without a direct <b> child is a wrapper: descend into it.
        if not any(sub.tag == "b" for sub in child):
            spec_dict.update(iter_parse(child))
            continue

        fragments = list(child.itertext())
        if not fragments:
            # e.g. <div><b/></div>: there is no text to unpack, so skip
            # the entry instead of raising ValueError.
            continue

        # The first text fragment is the key; everything after it is the value.
        key, *values = fragments
        spec_dict[key.strip(":")] = "".join(values).strip(":")
    return spec_dict
|
||||
|
||||
def parse_raw_specs(raw_specs: str) -> Dict[str,str]:
    """Turn a raw specifications XML string into a dictionary.

    The XML tree is first flattened recursively with ``iter_parse``; the
    resulting labels are then renamed to the output field names. Labels
    that have no entry in the rename table are dropped.
    """
    # Label as it appears in the markup -> output field name.
    renames = {
        "Material": "materials",
        "Package Quantity": "packaging",
        "Number of Pieces": "packaging",
        "Dimensions (Overall)": "dimensions",
        "Dimensions": "dimensions",
        "Weight": "weight",
        "TCIN": "tcin",
        "Origin": "origin",
    }

    flat = iter_parse(ET.fromstring(raw_specs))

    specs_dict = {}
    for label, text in flat.items():
        if label in renames:
            specs_dict[renames[label]] = text
    return specs_dict
|
|
@ -2,3 +2,6 @@ input := "../../data/large_target_store_products_dataset_sample - large_target_s
|
|||
|
||||
# Run the main module, passing the path stored in the `input` variable.
run:
    python -m main --input "{{ input }}"

# Run the test suite with pytest.
test:
    python -m pytest
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
from helpers import parse_raw_specs
|
||||
|
||||
|
||||
def test_parse_raw_specs():
    """Only mapped fields survive parsing, and they come back under their renamed keys."""
    # Sample specifications markup: two fields nested two levels deep (not in
    # the rename table) followed by TCIN, UPC and Origin nested one level deep.
    raw_markup = """
<div class="styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight" data-test="item-details-specifications"><h3 class="h-text-bs h-margin-b-tight">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test="itemDetailsTabMarketplaceMessage"><p class="h-padding-t-x2">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class="h-padding-t-x2">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item\'s label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>
"""

    # Values keep the space that followed the colon in the markup.
    result = parse_raw_specs(raw_markup)
    assert result == {"tcin": " 81917300", "origin": " imported"}
|
Loading…
Reference in New Issue