dayrize-usecase/etl/tests/test_parse_xml.py

154 lines
12 KiB
Python
Raw Normal View History

2023-06-22 15:34:11 +02:00
"""Test the `parse_raw_specs` function and its helper `iter_parse`"""
2023-06-22 09:40:26 +02:00
2023-06-22 15:34:11 +02:00
import xml.etree.ElementTree as ET
from helpers.parse_xml import parse_raw_specs, iter_parse
2023-06-22 15:34:11 +02:00
def test_parse_raw_specs0():
    """Test an example XML string found in the sample data file.

    Only the final result of `parse_raw_specs` is checked here: for this
    snippet the test expects exactly the ``tcin`` and ``origin`` entries.
    """
    xml_str = """
<div class="styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight" data-test="item-details-specifications"><h3 class="h-text-bs h-margin-b-tight">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test="itemDetailsTabMarketplaceMessage"><p class="h-padding-t-x2">The above item details were provided by the Target Plus Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class="h-padding-t-x2">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item\'s label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>
"""
    # Expected values keep the leading space that follows the label in the markup.
    expected = {"tcin": " 81917300", "origin": " imported"}
    assert parse_raw_specs(xml_str) == expected
2023-06-22 15:34:11 +02:00
def test_parse_raw_specs1():
    """Check a specification snippet taken from the sample data file.

    Two assertions are made on the same markup: first the label/value
    pairs returned by `iter_parse` for the parsed element tree, then the
    dictionary returned by `parse_raw_specs` for the raw string.
    """
    raw_markup = """
<div class="styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight" data-test="item-details-specifications"><h3 class="h-text-bs h-margin-b-tight">Specifications</h3><div><div><b>Number of Pieces:</b> 2</div><hr/></div><div><div><b>Weight:</b> 1 pounds</div><hr/></div><div><div><b>Maximum Height:</b> 33.5 inches</div><hr/></div><div><div><b>Minimum Height:</b> 33.5 inches</div><hr/></div><div><div><b>Material:</b> Resin</div><hr/></div><div><div><b>Battery:</b> No Battery Used</div><hr/></div><div><b>TCIN</b>: <!-- -->86383979<hr/></div><div><b>UPC</b>: <!-- -->023271234080<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test="itemDetailsTabMarketplaceMessage"><p class="h-padding-t-x2">The above item details were provided by the Target Plus Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class="h-padding-t-x2">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item\'s label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>
"""

    # Step 1: every label found in the markup, as returned by `iter_parse`.
    all_pairs = {
        "Battery": " No Battery Used",
        "Material": " Resin",
        "Maximum Height": " 33.5 inches",
        "Minimum Height": " 33.5 inches",
        "Number of Pieces": " 2",
        "Origin": " imported",
        "TCIN": " 86383979",
        "UPC": " 023271234080",
        "Weight": " 1 pounds",
    }
    root = ET.fromstring(raw_markup)
    assert iter_parse(root) == all_pairs

    # Step 2: the dictionary expected from the public entry point.
    final_specs = {
        "materials": " Resin",
        "origin": " imported",
        "packaging": " 2",
        "tcin": " 86383979",
        "weight": " 1 pounds",
    }
    assert parse_raw_specs(raw_markup) == final_specs
def test_parse_raw_specs2():
    """Check a specification snippet that also carries a warning banner.

    The markup (from the sample data file) contains an inline SVG icon and
    a warning paragraph besides the label/value rows. The test asserts the
    pairs returned by `iter_parse` and then the dictionary returned by
    `parse_raw_specs`.
    """
    raw_markup = """
<div class="styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight" data-test="item-details-specifications"><h3 class="h-text-bs h-margin-b-tight">Specifications</h3><div><div><b>Dimensions (Overall):</b> 2.1 inches (H) x 9.0 inches (W) x 10.9 inches (D)</div><hr/></div><div><div><b>Party subtype:</b> Party Card Holders</div><hr/></div><div><div><b>Material:</b> Wood</div><hr/></div><div><b>TCIN</b>: <!-- -->82840486<hr/></div><div><b>UPC</b>: <!-- -->843128196602<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div class="h-padding-b-x2 h-display-flex"><span><span class="sc-gtsrHT kKuqfg" data-icon-name="AlertsLegal" fill="#000000" size="22"><div class="sc-hKFxyN kksiKu"><svg focusable="false" height="24" role="presentation" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><g fill="none"><path d="M12 2.73L1.75 20.148h20.497L11.999 2.729z" fill="#FF0" stroke="#000" stroke-linejoin="round" stroke-width="1.5"></path><path d="M12 16.123c.565 0 1.023.45 1.023 1.005 0 .556-.458 1.006-1.024 1.006-.566 0-1.024-.45-1.024-1.006 0-.555.458-1.005 1.024-1.005zm-.844-8.926a1.394 1.394 0 011.511-.104c.48.266.751.787.69 1.325l-.736 6.575a.586.586 0 01-.23.56.611.611 0 01-.61.069.591.591 0 01-.355-.492l-.763-6.526a1.334 1.334 0 01.493-1.407z" fill="#000"></path></g></svg></div></span></span><span class="h-padding-l-x2"><span class="h-text-bold">WARNING:</span><span> This product can expose you to chemical(s) including Formaldehyde (gas), which is known to the State of California to cause cancer and birth defects or other reproductive harm. For more information go to P65Warnings.ca.gov.</span><a class="Link__StyledLink-sc-4b9qcv-0 gCNFxQ h-text-bold h-text-underline" href="http://www.p65warnings.ca.gov/"> <!-- -->www.p65warnings.ca.gov</a></span></div><hr aria-hidden="true"/><div data-test="itemDetailsTabMarketplaceMessage"><p class="h-padding-t-x2">The above item details were provided by the Target Plus Partner. 
Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class="h-padding-t-x2">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item\'s label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>
"""

    # Step 1: label/value pairs expected from `iter_parse`; the warning
    # banner is not expected to contribute any entry.
    all_pairs = {
        "Dimensions (Overall)": " 2.1 inches (H) x 9.0 inches (W) x 10.9 inches (D)",
        "Material": " Wood",
        "Origin": " imported",
        "Party subtype": " Party Card Holders",
        "TCIN": " 82840486",
        "UPC": " 843128196602",
    }
    root = ET.fromstring(raw_markup)
    assert iter_parse(root) == all_pairs

    # Step 2: the dictionary expected from the public entry point.
    final_specs = {
        "dimensions": " 2.1 inches (H) x 9.0 inches (W) x 10.9 inches (D)",
        "materials": " Wood",
        "origin": " imported",
        "tcin": " 82840486",
    }
    assert parse_raw_specs(raw_markup) == final_specs
def test_parse_raw_specs3():
    """Check a grocery specification snippet from the sample data file.

    Besides the usual label/value rows, this markup has a
    "Grocery Disclaimer" row whose value lives in a nested ``<div>``.
    The test asserts the pairs returned by `iter_parse` and then the
    dictionary returned by `parse_raw_specs`.
    """
    raw_markup = """
<div class="styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight" data-test="item-details-specifications"><h3 class="h-text-bs h-margin-b-tight">Specifications</h3><div><div><b>Contains:</b> Does Not Contain Any of the 8 Major Allergens</div><hr/></div><div><div><b>Dietary Needs:</b> Gluten Free</div><hr/></div><div><div><b>Form:</b> Pieces</div><hr/></div><div><div><b>State of Readiness:</b> Ready to Eat</div><hr/></div><div><div><b>Package Quantity:</b> 1</div><hr/></div><div><div><b>Net weight:</b> 15.6 Ounces</div><hr/></div><div><b>TCIN</b>: <!-- -->54571204<hr/></div><div><b>UPC</b>: <!-- -->022000279729<hr/></div><div><b>Item Number (DPCI)</b>: <!-- -->055-02-1211<hr/></div><div><b>Origin</b>: <!-- -->Made in the USA or Imported<hr/></div><div><b>Grocery Disclaimer</b>:<!-- --> <div>Content on this site is for reference purposes only. Target does not represent or warrant that the nutrition, ingredient, allergen and other product information on our Web or Mobile sites are accurate or complete, since this information comes from the product manufacturers. On occasion, manufacturers may improve or change their product formulas and update their labels. We recommend that you do not rely solely on the information presented on our Web or Mobile sites and that you review the product\'s label or contact the manufacturer directly if you have specific product concerns or questions. If you have specific healthcare concerns or questions about the products displayed, please contact your licensed healthcare professional for advice or answers. Any additional pictures are suggested servings only.</div></div></div>
"""

    # The disclaimer value is long, so it is assembled from adjacent
    # literals; note the leading space expected in front of the text.
    disclaimer_text = (
        " Content on this site is for reference purposes only. "
        "Target does not represent or warrant that the "
        "nutrition, ingredient, allergen and other product "
        "information on our Web or Mobile sites are accurate or "
        "complete, since this information comes from the "
        "product manufacturers. On occasion, manufacturers may "
        "improve or change their product formulas and update "
        "their labels. We recommend that you do not rely "
        "solely on the information presented on our Web or "
        "Mobile sites and that you review the product's label "
        "or contact the manufacturer directly if you have "
        "specific product concerns or questions. If you have "
        "specific healthcare concerns or questions about the "
        "products displayed, please contact your licensed "
        "healthcare professional for advice or answers. Any "
        "additional pictures are suggested servings only."
    )

    # Step 1: label/value pairs expected from `iter_parse`.
    all_pairs = {
        "Contains": " Does Not Contain Any of the 8 Major Allergens",
        "Dietary Needs": " Gluten Free",
        "Form": " Pieces",
        "Grocery Disclaimer": disclaimer_text,
        "Item Number (DPCI)": " 055-02-1211",
        "Net weight": " 15.6 Ounces",
        "Origin": " Made in the USA or Imported",
        "Package Quantity": " 1",
        "State of Readiness": " Ready to Eat",
        "TCIN": " 54571204",
        "UPC": " 022000279729",
    }
    root = ET.fromstring(raw_markup)
    assert iter_parse(root) == all_pairs

    # Step 2: the dictionary expected from the public entry point.
    final_specs = {
        "origin": " Made in the USA or Imported",
        "packaging": " 1",
        "tcin": " 54571204",
    }
    assert parse_raw_specs(raw_markup) == final_specs
def test_parse_raw_specs4():
    """Check a book specification snippet from the sample data file.

    The test asserts the label/value pairs returned by `iter_parse` for
    the parsed element tree, then the dictionary returned by
    `parse_raw_specs` for the raw string.
    """
    raw_markup = """
<div class="styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight" data-test="item-details-specifications"><h3 class="h-text-bs h-margin-b-tight">Specifications</h3><div><div><b>Suggested Age:</b> 22 Years and Up</div><hr/></div><div><div><b>Number of Pages:</b> 247</div><hr/></div><div><div><b>Genre:</b> Technology</div><hr/></div><div><div><b>Sub-Genre:</b> Agriculture</div><hr/></div><div><div><b>Format:</b> Hardcover</div><hr/></div><div><div><b>Publisher:</b> States Academic Press</div><hr/></div><div><div><b>Book theme:</b> Agronomy, Crop Science</div><hr/></div><div><div><b>Author:</b> Mark Taylor</div><hr/></div><div><div><b>Language:</b> English</div><hr/></div><div><b>Street Date</b>: <!-- -->March 8, 2022<hr/></div><div><b>TCIN</b>: <!-- -->84917947<hr/></div><div><b>UPC</b>: <!-- -->9781639893843<hr/></div><div><b>Item Number (DPCI)</b>: <!-- -->247-34-8919<hr/></div><div><b>Origin</b>: <!-- -->Made in the USA or Imported<hr/></div></div>
"""

    # Step 1: label/value pairs expected from `iter_parse`.
    all_pairs = {
        "Author": " Mark Taylor",
        "Book theme": " Agronomy, Crop Science",
        "Format": " Hardcover",
        "Genre": " Technology",
        "Item Number (DPCI)": " 247-34-8919",
        "Language": " English",
        "Number of Pages": " 247",
        "Origin": " Made in the USA or Imported",
        "Publisher": " States Academic Press",
        "Street Date": " March 8, 2022",
        "Sub-Genre": " Agriculture",
        "Suggested Age": " 22 Years and Up",
        "TCIN": " 84917947",
        "UPC": " 9781639893843",
    }
    root = ET.fromstring(raw_markup)
    assert iter_parse(root) == all_pairs

    # Step 2: the dictionary expected from the public entry point.
    final_specs = {"origin": " Made in the USA or Imported", "tcin": " 84917947"}
    assert parse_raw_specs(raw_markup) == final_specs
def test_malformed_xml():
    """Test a malformed XML string.

    The input has an opening ``<div>`` that is never closed; the test
    expects `parse_raw_specs` to return ``None`` for it.
    """
    xml_str = "<div>foo"
    assert parse_raw_specs(xml_str) is None