feat: added parse_raw_specification

main
Ricard Illa 2023-06-22 09:40:26 +02:00
parent 5a4bca756e
commit 725114805a
3 changed files with 60 additions and 0 deletions

View File

@ -0,0 +1,47 @@
import xml.etree.ElementTree as ET
from typing import Dict
def iter_parse(root: ET.Element) -> Dict[str, str]:
    """Recursively collect key/value pairs from a specifications XML tree.

    Each key/value pair lives inside its own <div> tag, with the key
    wrapped in a <b> tag. Some fields (TCIN, UPC and Origin in the
    samples seen so far) are nested one level deep, while the others are
    nested two levels deep; recursing into every <div> that has no
    direct <b> child handles both layouts.

    Args:
        root: Element whose direct children are scanned.

    Returns:
        Mapping of key to value. Colons are stripped from both ends of
        the key and of the value; surrounding whitespace is left as is.
        When a key appears more than once, the last occurrence wins.
    """
    spec_dict: Dict[str, str] = {}
    for child in root:
        if child.tag != "div":
            continue
        if not any(grandchild.tag == "b" for grandchild in child):
            # No key marker at this level: the pairs are nested deeper.
            spec_dict.update(iter_parse(child))
            continue
        texts = list(child.itertext())
        if not texts:
            # A <div> such as <div><b/></div> carries no text at all;
            # unpacking an empty sequence would raise ValueError.
            continue
        # The first text chunk is the key; everything after it is the value.
        key, *values = texts
        spec_dict[key.strip(":")] = "".join(values).strip(":")
    return spec_dict
def parse_raw_specs(raw_specs: str) -> Dict[str, str]:
    """Turn a raw specifications XML string into a dictionary.

    The XML tree is first parsed recursively with iter_parse; only the
    recognised fields are then kept, stored under their renamed keys.
    Several source fields may share one target key, in which case the
    one parsed last provides the value.
    """
    # Source field name -> key used in the returned dictionary.
    renamed_keys = {
        "Material": "materials",
        "Package Quantity": "packaging",
        "Number of Pieces": "packaging",
        "Dimensions (Overall)": "dimensions",
        "Dimensions": "dimensions",
        "Weight": "weight",
        "TCIN": "tcin",
        "Origin": "origin",
    }
    extracted = iter_parse(ET.fromstring(raw_specs))

    result = {}
    for raw_key, raw_value in extracted.items():
        if raw_key not in renamed_keys:
            # Unrecognised fields are dropped.
            continue
        result[renamed_keys[raw_key]] = raw_value
    return result

View File

@ -2,3 +2,6 @@ input := "../../data/large_target_store_products_dataset_sample - large_target_s
# Run the main module, passing the path held in `input` as --input.
run:
python -m main --input "{{ input }}"
# Run the test suite with pytest.
test:
python -m pytest

View File

@ -0,0 +1,10 @@
from helpers import parse_raw_specs
def test_parse_raw_specs():
    """Check parse_raw_specs against a sample specifications block.

    Only the TCIN and Origin fields are expected in the result; the
    other fields present in the markup are not expected to appear.
    """
    raw_markup = """
<div class="styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight" data-test="item-details-specifications"><h3 class="h-text-bs h-margin-b-tight">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test="itemDetailsTabMarketplaceMessage"><p class="h-padding-t-x2">The above item details were provided by the Target Plus Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class="h-padding-t-x2">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item\'s label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>
"""
    parsed_specs = parse_raw_specs(raw_markup)
    # Values keep the leading space that follows the colon in the markup.
    assert parsed_specs == {"tcin": " 81917300", "origin": " imported"}