dayrize-usecase/notebooks/exploration.ipynb

1065 lines
50 KiB
Plaintext
Raw Normal View History

2023-06-21 15:46:28 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 410,
"id": "98ded03d-0208-4416-a5e5-720a2e0742fa",
"metadata": {},
"outputs": [],
"source": [
"import functools\n",
"import pandas as pd\n",
"from IPython.display import display, HTML\n",
"\n",
"in_file = \"/home/jovyan/data/large_target_store_products_dataset_sample - large_target_store_products_dataset_sample.csv\"\n",
"data = pd.read_csv(in_file)\n",
"\n",
"def look_for_matches(data: pd.DataFrame, pattern: str, colname : str = \"raw_specifications\") -> str:\n",
" \"\"\"Useful for finding cells in raw_specifications containing a given string\"\"\"\n",
" return data.loc[data.loc[:, colname].str.contains(pattern), colname].iloc[0]\n",
"\n",
"def render_html(html: str):\n",
" \"\"\"Render an html string\"\"\"\n",
" display(HTML(html))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c20c14ea-9ef6-4d40-8b7d-4731d0866239",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['title', 'url', 'brand', 'main_image', 'sku', 'description',\n",
" 'raw_description', 'gtin13', 'currency', 'price', 'availability',\n",
" 'availableDeliveryMethod', 'available_branch', 'primary_category',\n",
" 'sub_category_1', 'sub_category_2', 'sub_category_3', 'images',\n",
" 'raw_specifications', 'specifications', 'highlights', 'raw_highlights',\n",
" 'uniq_id', 'scraped_at'],\n",
" dtype='object')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.axes[1]"
]
},
{
"cell_type": "code",
"execution_count": 132,
"id": "165723d1-8152-4e30-b25a-cbca7f4935f9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(data.loc[0, 'raw_specifications'])"
]
},
{
"cell_type": "code",
"execution_count": 134,
"id": "d153a732-f18c-4fb7-9282-9c02c49c3cc0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 11.9 inches (L), 11.9 inches (W)</div><hr/></div><div><div><b>Dated format:</b> Monthly</div><hr/></div><div><div><b>Calendar year:</b> 2022</div><hr/></div><div><div><b>Material:</b> Paper</div><hr/></div><div><b>TCIN</b>: <!-- -->84821007<hr/></div><div><b>UPC</b>: <!-- -->9781801433983<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Material\"))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "b06e66af-e0e3-466c-8797-546d5e076f32",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Contains:</b> Does Not Contain Any of the 8 Major Allergens</div><hr/></div><div><div><b>Dietary Needs:</b> Gluten Free</div><hr/></div><div><div><b>Form:</b> Pieces</div><hr/></div><div><div><b>State of Readiness:</b> Ready to Eat</div><hr/></div><div><div><b>Package Quantity:</b> 1</div><hr/></div><div><div><b>Net weight:</b> 15.6 Ounces</div><hr/></div><div><b>TCIN</b>: <!-- -->54571204<hr/></div><div><b>UPC</b>: <!-- -->022000279729<hr/></div><div><b>Item Number (DPCI)</b>: <!-- -->055-02-1211<hr/></div><div><b>Origin</b>: <!-- -->Made in the USA or Imported<hr/></div><div><b>Grocery Disclaimer</b>:<!-- --> <div>Content on this site is for reference purposes only. Target does not represent or warrant that the nutrition, ingredient, allergen and other product information on our Web or Mobile sites are accurate or complete, since this information comes from the product manufacturers. On occasion, manufacturers may improve or change their product formulas and update their labels. We recommend that you do not rely solely on the information presented on our Web or Mobile sites and that you review the product's label or contact the manufacturer directly if you have specific product concerns or questions. If you have specific healthcare concerns or questions about the products displayed, please contact your licensed healthcare professional for advice or answers. Any additional pictures are suggested servings only.</div></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Package\"))"
]
},
{
"cell_type": "code",
"execution_count": 137,
"id": "015a0d99-b4c2-47cf-9e23-59485a450470",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 23 inches (H) x 1 inches (W) x 23 inches (D)</div><hr/></div><div><div><b>Weight:</b> 4.65 pounds</div><hr/></div><div><div><b>Art subject:</b> Geometric Shapes</div><hr/></div><div><div><b>Orientation:</b> Vertical</div><hr/></div><div><div><b>Material:</b> Metal</div><hr/></div><div><div><b>Battery:</b> No Battery Used</div><hr/></div><div><b>TCIN</b>: <!-- -->86345566<hr/></div><div><b>UPC</b>: <!-- -->023271231140<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Weight\"))"
]
},
{
"cell_type": "code",
"execution_count": 138,
"id": "99cbfba8-4324-4d2a-b2e2-d766a5e57964",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 11.9 inches (L), 11.9 inches (W)</div><hr/></div><div><div><b>Dated format:</b> Monthly</div><hr/></div><div><div><b>Calendar year:</b> 2022</div><hr/></div><div><div><b>Material:</b> Paper</div><hr/></div><div><b>TCIN</b>: <!-- -->84821007<hr/></div><div><b>UPC</b>: <!-- -->9781801433983<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Dimensions\"))"
]
},
{
"cell_type": "code",
"execution_count": 139,
"id": "6c05e1db-7181-422e-924b-e67a28886abe",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"TCIN\"))"
]
},
{
"cell_type": "code",
"execution_count": 140,
"id": "bf3a07d9-11c0-48c4-be03-373a3ba5f755",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Origin\"))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "ffdb3244-a20d-427b-9c23-4d8c6cf4ce6d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Material\n",
"raw_specifications\n",
"True 105\n",
"False 62\n",
"Name: count, dtype: int64\n",
"\n",
"Package Quantity\n",
"raw_specifications\n",
"False 143\n",
"True 24\n",
"Name: count, dtype: int64\n",
"\n",
"Weight\n",
"raw_specifications\n",
"False 106\n",
"True 61\n",
"Name: count, dtype: int64\n",
"\n",
"Dimensions\n",
"raw_specifications\n",
"True 108\n",
"False 59\n",
"Name: count, dtype: int64\n",
"\n",
"TCIN\n",
"raw_specifications\n",
"True 167\n",
"Name: count, dtype: int64\n",
"\n",
"Origin\n",
"raw_specifications\n",
"True 167\n",
"Name: count, dtype: int64\n",
"\n"
]
}
],
"source": [
"# let's see how many of the specifications are there\n",
"\n",
"colname = \"raw_specifications\"\n",
"patterns = [\"Material\", \"Package Quantity\", \"Weight\", \"Dimensions\", \"TCIN\", \"Origin\"]\n",
"\n",
"for pattern in patterns:\n",
" print(pattern)\n",
" print(data.loc[:, colname].str.contains(pattern).value_counts())\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "dc44956b-ea8c-44ce-8bc5-e0472273e7a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"raw_specifications\n",
"False 162\n",
"True 5\n",
"Name: count, dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# let's see how many of the specifications are there\n",
"\n",
"colname = \"raw_specifications\"\n",
"patterns = [\"Material\", \"Package Quantity\", \"Weight\", \"Dimensions\", \"TCIN\", \"Origin\"]\n",
"\n",
"sels = [data.loc[:, colname].str.contains(pattern) for pattern in patterns]\n",
"sel = functools.reduce(lambda x, y: x & y, sels)\n",
"\n",
"sel.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "107da59a-4bd8-4b0c-9e6e-95f38788eaf3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> .1 inches (H) x 6.0 inches (W) x 13.5 inches (D)</div><hr/></div><div><div><b>Weight:</b> .35 ounces</div><hr/></div><div><div><b>Package Quantity:</b> 3</div><hr/></div><div><div><b>Material:</b> Wood</div><hr/></div><div><b>TCIN</b>: <!-- -->82555842<hr/></div><div><b>UPC</b>: <!-- -->843128185798<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div class=\"h-padding-b-x2 h-display-flex\"><span><span class=\"sc-gtsrHT kKuqfg\" data-icon-name=\"AlertsLegal\" fill=\"#000000\" size=\"22\"><div class=\"sc-hKFxyN kksiKu\"><svg focusable=\"false\" height=\"24\" role=\"presentation\" viewbox=\"0 0 24 24\" width=\"24\" xmlns=\"http://www.w3.org/2000/svg\"><g fill=\"none\"><path d=\"M12 2.73L1.75 20.148h20.497L11.999 2.729z\" fill=\"#FF0\" stroke=\"#000\" stroke-linejoin=\"round\" stroke-width=\"1.5\"></path><path d=\"M12 16.123c.565 0 1.023.45 1.023 1.005 0 .556-.458 1.006-1.024 1.006-.566 0-1.024-.45-1.024-1.006 0-.555.458-1.005 1.024-1.005zm-.844-8.926a1.394 1.394 0 011.511-.104c.48.266.751.787.69 1.325l-.736 6.575a.586.586 0 01-.23.56.611.611 0 01-.61.069.591.591 0 01-.355-.492l-.763-6.526a1.334 1.334 0 01.493-1.407z\" fill=\"#000\"></path></g></svg></div></span></span><span class=\"h-padding-l-x2\"><span class=\"h-text-bold\">WARNING:</span><span>⚠ This product can expose you to chemicals including Formaldehyde (gas), which is known to the State of California to cause cancer and birth defects or other reproductive harm. For more information, go to www.P65Warnings.ca.gov</span><a class=\"Link__StyledLink-sc-4b9qcv-0 gCNFxQ h-text-bold h-text-underline\" href=\"http://www.p65warnings.ca.gov/\"> <!-- -->www.p65warnings.ca.gov</a></span></div><hr aria-hidden=\"true\"/><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 9.02 inches (L), 4.88 inches (H) x 5.0 inches (W)</div><hr/></div><div><div><b>Weight:</b> 1.63 pounds</div><hr/></div><div><div><b>Package Quantity:</b> 100</div><hr/></div><div><div><b>Material:</b> Paper</div><hr/></div><div><b>TCIN</b>: <!-- -->84236733<hr/></div><div><b>UPC</b>: <!-- -->194425198586<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 1.14 inches (H) x 5.04 inches (W) x 6.3 inches (D)</div><hr/></div><div><div><b>Weight:</b> .15 ounces</div><hr/></div><div><div><b>Package Quantity:</b> 12</div><hr/></div><div><div><b>Material:</b> Wood</div><hr/></div><div><b>TCIN</b>: <!-- -->82021299<hr/></div><div><b>UPC</b>: <!-- -->194425108806<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 9.79 inches (L), 4.13 inches (W)</div><hr/></div><div><div><b>Weight:</b> 2.61 ounces</div><hr/></div><div><div><b>Closure Type:</b> Flap</div><hr/></div><div><div><b>Package Quantity:</b> 250</div><hr/></div><div><div><b>Material:</b> Paper</div><hr/></div><div><b>TCIN</b>: <!-- -->81501830<hr/></div><div><b>UPC</b>: <!-- -->083514896277<hr/></div><div><b>Origin</b>: <!-- -->made in the USA<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 2.24 inches (L), 6.0 inches (W)</div><hr/></div><div><div><b>Weight:</b> 1.6 ounces</div><hr/></div><div><div><b>Closure Type:</b> Flap</div><hr/></div><div><div><b>Package Quantity:</b> 100</div><hr/></div><div><div><b>Material:</b> Cardboard</div><hr/></div><div><b>TCIN</b>: <!-- -->81843685<hr/></div><div><b>UPC</b>: <!-- -->087547421000<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://www.target.com/p/genie-crafts-wood-quote-signs-eat-drink-love-wood-letter-signs-drawing-stencils-wall-decor/-/A-82555842\n",
"https://www.target.com/p/sparkle-and-bash-100-pack-gold-foil-initial-letter-k-white-monogram-paper-napkins-for-dinner-party-4-x-8-in/-/A-84236733\n",
"https://www.target.com/p/wooden-rectangles-for-crafts-panel-board-4-x-6-in-12-pack/-/A-82021299\n",
"https://www.target.com/p/southworth-25-cotton-10-business-envelope-ivory-24-lbs-wove-250-box-fsc-j404i10/-/A-81501830\n",
"https://www.target.com/p/universal-self-seal-catalog-envelope-6-x-9-white-100-box-42100/-/A-81843685\n"
]
}
],
"source": [
"for x in data.loc[sel, \"raw_specifications\"]:\n",
" render_html(x)\n",
"for x in data.loc[sel, \"url\"]:\n",
" print(x)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "3853dd07-1977-44c6-bc97-2ecdaa98980e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"raw_specifications\n",
"False 124\n",
"True 43\n",
"Name: count, dtype: int64"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"colname = \"raw_specifications\"\n",
"patterns = [\"Material\", \"Package Quantity\", \"Weight\", \"Dimensions\", \"TCIN\", \"Origin\"]\n",
"\n",
"(data.loc[:, colname].str.contains(\"Package Quantity\") | data.loc[:, colname].str.contains(\"Number of Pieces\")).value_counts()\n",
"\n",
"# Package Quantity and Number of Pieces are never found together. Maybe they refer to the same thing?"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "be8a7628-9199-4f25-a533-d041a7ff540c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>url</th>\n",
" <th>brand</th>\n",
" <th>main_image</th>\n",
" <th>sku</th>\n",
" <th>description</th>\n",
" <th>raw_description</th>\n",
" <th>gtin13</th>\n",
" <th>currency</th>\n",
" <th>price</th>\n",
" <th>...</th>\n",
" <th>sub_category_1</th>\n",
" <th>sub_category_2</th>\n",
" <th>sub_category_3</th>\n",
" <th>images</th>\n",
" <th>raw_specifications</th>\n",
" <th>specifications</th>\n",
" <th>highlights</th>\n",
" <th>raw_highlights</th>\n",
" <th>uniq_id</th>\n",
" <th>scraped_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>NCAA Illinois Fighting Illini Circo Cheese Cut...</td>\n",
" <td>https://www.target.com/p/ncaa-illinois-fightin...</td>\n",
" <td>NCAA</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>79646040</td>\n",
" <td>Reach out to the complex cheese lover in your ...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>99967205276</td>\n",
" <td>USD</td>\n",
" <td>58.95</td>\n",
" <td>...</td>\n",
" <td>Sports Fan Shop</td>\n",
" <td>Sports Fan Shop Home Goods</td>\n",
" <td>Sports Fan Shop Barware &amp; Drinkware</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Number of Pieces: 5 | Number of Pieces: 5 | We...</td>\n",
" <td>BEAUTY &amp; ELEGANCE - The Circo swivel-style cir...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>0c549116-75c8-56cb-8877-165380d0efd9</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Blue Panda Jumbo Dinosaur Floor Puzzle, Double...</td>\n",
" <td>https://www.target.com/p/blue-panda-jumbo-dino...</td>\n",
" <td>Blue Panda</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>80405355</td>\n",
" <td>Package Includes\\r\\nLarge Dinosaur Floor Puzzl...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>194425203808</td>\n",
" <td>USD</td>\n",
" <td>19.99</td>\n",
" <td>...</td>\n",
" <td>Puzzles</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Number of Pieces: 17 | Number of Pieces: 17 | ...</td>\n",
" <td>JUMBO DINOSAUR PUZZLE: This t-rex foam puzzle ...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>151c72b4-4856-502f-a508-961cc81fffa9</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Women's Round Aviator Sunglasses - Universal T...</td>\n",
" <td>https://www.target.com/p/women-39-s-round-avia...</td>\n",
" <td>Universal Thread</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>84201225</td>\n",
" <td>Round out your eyewear collection with the Rou...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>195995526496</td>\n",
" <td>USD</td>\n",
" <td>15.00</td>\n",
" <td>...</td>\n",
" <td>Eye Care</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Material: Metal (Frame) | Material: Metal (Fra...</td>\n",
" <td>Universal Thread round aviator sunglasses with...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>2a803c0f-00bf-50a6-a490-d381620ac3a3</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" title \\\n",
"8 NCAA Illinois Fighting Illini Circo Cheese Cut... \n",
"13 Blue Panda Jumbo Dinosaur Floor Puzzle, Double... \n",
"14 Women's Round Aviator Sunglasses - Universal T... \n",
"\n",
" url brand \\\n",
"8 https://www.target.com/p/ncaa-illinois-fightin... NCAA \n",
"13 https://www.target.com/p/blue-panda-jumbo-dino... Blue Panda \n",
"14 https://www.target.com/p/women-39-s-round-avia... Universal Thread \n",
"\n",
" main_image sku \\\n",
"8 https://target.scene7.com/is/image/Target/GUES... 79646040 \n",
"13 https://target.scene7.com/is/image/Target/GUES... 80405355 \n",
"14 https://target.scene7.com/is/image/Target/GUES... 84201225 \n",
"\n",
" description \\\n",
"8 Reach out to the complex cheese lover in your ... \n",
"13 Package Includes\\r\\nLarge Dinosaur Floor Puzzl... \n",
"14 Round out your eyewear collection with the Rou... \n",
"\n",
" raw_description gtin13 currency \\\n",
"8 <div class=\"h-margin-v-default\" data-test=\"ite... 99967205276 USD \n",
"13 <div class=\"h-margin-v-default\" data-test=\"ite... 194425203808 USD \n",
"14 <div class=\"h-margin-v-default\" data-test=\"ite... 195995526496 USD \n",
"\n",
" price ... sub_category_1 sub_category_2 \\\n",
"8 58.95 ... Sports Fan Shop Sports Fan Shop Home Goods \n",
"13 19.99 ... Puzzles NaN \n",
"14 15.00 ... Eye Care NaN \n",
"\n",
" sub_category_3 \\\n",
"8 Sports Fan Shop Barware & Drinkware \n",
"13 NaN \n",
"14 NaN \n",
"\n",
" images \\\n",
"8 https://target.scene7.com/is/image/Target/GUES... \n",
"13 https://target.scene7.com/is/image/Target/GUES... \n",
"14 https://target.scene7.com/is/image/Target/GUES... \n",
"\n",
" raw_specifications \\\n",
"8 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"13 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"14 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"\n",
" specifications \\\n",
"8 Number of Pieces: 5 | Number of Pieces: 5 | We... \n",
"13 Number of Pieces: 17 | Number of Pieces: 17 | ... \n",
"14 Material: Metal (Frame) | Material: Metal (Fra... \n",
"\n",
" highlights \\\n",
"8 BEAUTY & ELEGANCE - The Circo swivel-style cir... \n",
"13 JUMBO DINOSAUR PUZZLE: This t-rex foam puzzle ... \n",
"14 Universal Thread round aviator sunglasses with... \n",
"\n",
" raw_highlights \\\n",
"8 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"13 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"14 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"\n",
" uniq_id scraped_at \n",
"8 0c549116-75c8-56cb-8877-165380d0efd9 06/12/22 \n",
"13 151c72b4-4856-502f-a508-961cc81fffa9 06/12/22 \n",
"14 2a803c0f-00bf-50a6-a490-d381620ac3a3 06/12/22 \n",
"\n",
"[3 rows x 24 columns]"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sel = data.loc[:, \"raw_specifications\"].str.contains(\"Dimensions\")\n",
"dimensions = data.loc[sel]\n",
"sel_overall = ~dimensions.loc[sel, \"raw_specifications\"].str.contains(\"Overall\")\n",
"#for x in dimensions.loc[sel_overall, \"raw_specifications\"]:\n",
"# render_html(x)\n",
"dimensions.loc[sel_overall]\n",
"\n",
"# looks like \"dimenions\" can be \"Dimensions (Overall)\", \"Dimensions\" or other things\n",
"# like \"Assembled Dimensions\" or \"Piece X Dimensions\". But this latter two options are\n",
"# incomplete (lack the height), harder to parse and rare enough that I'll just drop them"
]
},
{
"cell_type": "code",
"execution_count": 359,
"id": "3fd14c61-c1f8-4b18-b04a-9b616e6cf9a7",
"metadata": {},
"outputs": [],
"source": [
"# There seem to be dupliucates on \"specifications\" that are not \n",
"# found on \"raw_specifications\"\n",
"# I think it's safe to just remove the duplicates\n",
"\n",
"specifications = data.loc[1,\"specifications\"]\n",
"\n",
"def parse_specs(specifications: str) -> dict[str,str]:\n",
" fields_mapping = {\n",
" \"Material\": \"materials\",\n",
" \"Package Quantity\": \"packaging\",\n",
" \"Number of Pieces\": \"packaging\",\n",
" \"Dimensions (Overall)\": \"dimensions\",\n",
" \"Dimensions\": \"dimensions\",\n",
" \"Weight\": \"weight\",\n",
" \"TCIN\": \"tcin\",\n",
" \"Origin\": \"origin\", \n",
" }\n",
" spec_dict = {}\n",
" for spec in specifications.split(\"|\"):\n",
" if \":\" in spec:\n",
" try:\n",
" field, value = spec.split(\":\")\n",
" except ValueError:\n",
" print(spec)\n",
" return {}\n",
" field = field.strip()\n",
" if field in fields_mapping:\n",
" field = fields_mapping[field]\n",
" spec_dict[field] = value.strip()\n",
" return spec_dict\n",
"\n",
"\n",
"\n",
"def iter_parse(root: ET.Element) -> dict[str,str]:\n",
" \"\"\"Recursively parse the XML tree into a dictionary\n",
" Each key/value pair is inside it's own <div> tag and\n",
" the key inside a <b> tag.\n",
" The fields that I believe are compulsory (TCIN, UPC and Origin)\n",
" are only nested one level deep, while the rest of fields seem\n",
" to be always nested two levels deep. But parsing it recursively\n",
" helps generalise both cases.\"\"\"\n",
" \n",
" spec_dict = {}\n",
" for child in root:\n",
" if child.tag == \"div\":\n",
" if \"b\" in [x.tag for x in child]:\n",
" key, *values = child.itertext()\n",
" key = key.strip(\":\")\n",
" value = \"\".join(values).strip(\":\")\n",
" spec_dict[key] = value\n",
" else:\n",
" spec_dict.update(iter_parse(child))\n",
" return spec_dict\n",
"\n",
"def parse_raw_specs(raw_specs: str) -> dict[str,str]:\n",
" \"\"\"Parse a raw specifications XML string into a dictionary\n",
" This involves first recursively parsing the XML tree and then\n",
" renaming the key values\"\"\"\n",
" \n",
" fields_mapping = {\n",
" \"Material\": \"materials\",\n",
" \"Package Quantity\": \"packaging\",\n",
" \"Number of Pieces\": \"packaging\",\n",
" \"Dimensions (Overall)\": \"dimensions\",\n",
" \"Dimensions\": \"dimensions\",\n",
" \"Weight\": \"weight\",\n",
" \"TCIN\": \"tcin\",\n",
" \"Origin\": \"origin\", \n",
" }\n",
" xml_root = ET.fromstring(raw_specs)\n",
" parsed = iter_parse(xml_root)\n",
" specs_dict = {\n",
" fields_mapping[key]: value\n",
" for key, value in parsed.items()\n",
" if key in fields_mapping\n",
" }\n",
" return specs_dict\n",
" \n",
"\n",
"def parse_value(specs: str, value: str) -> str:\n",
" return parse_raw_specs(specs).get(value)\n",
"\n",
"for x in [\"materials\", \"packaging\", \"dimensions\", \"weight\", \"tcin\", \"origin\"]:\n",
" data[x] = data[\"raw_specifications\"].apply(parse_value, value=x)\n",
"\n",
"# set(data[\"material\"])\n"
]
},
{
"cell_type": "code",
"execution_count": 305,
"id": "5902f1b3-a77e-4353-92aa-71cc729e8c87",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 None\n",
"1 {'width': 30.226000000000003}\n",
"2 None\n",
"3 None\n",
"4 {'height': 58.42, 'width': 2.54, 'depth': 58.42}\n",
" ... \n",
"162 {'width': 12.7, 'depth': 24.13}\n",
"163 {'height': 30.48, 'width': 30.48}\n",
"164 {'height': 12.065, 'width': 5.715, 'depth': 5....\n",
"165 None\n",
"166 {'height': 11.43, 'width': 31.75, 'depth': 11.43}\n",
"Name: dimensions, Length: 167, dtype: object"
]
},
"execution_count": 305,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"from typing import Optional\n",
"\n",
"dimensions = \"16 inches (H) x 23 inches (W) x 23 inches (D)\"\n",
"\n",
"def parse_dimensions_measure(dimensions: str, measure: str) -> Optional[dict[str,str]]:\n",
" expr = f\"(?P<value>\\d*[.,]?\\d*)\\s+(?P<unit>[a-zA-Z]*)\\s+\\({measure}\\)\" \n",
" if match := re.search(expr, dimensions):\n",
" return {\n",
" \"value\": float(match.group(\"value\")),\n",
" \"unit\": match.group(\"unit\").lower()\n",
" }\n",
"\n",
"def units_to_cm(value: float, unit: str) -> float:\n",
" conversions = {\n",
" \"inches\": 2.54,\n",
" \"feet\": 30.48,\n",
" \"cm\": 1\n",
" }\n",
" return value * conversions[unit]\n",
"\n",
"\n",
"def parse_dimensions(dimensions: Optional[str]) -> Optional[dict[str,float]]:\n",
" if dimensions is None:\n",
" return None\n",
" height = parse_dimensions_measure(dimensions, \"H\")\n",
" width = parse_dimensions_measure(dimensions, \"W\")\n",
" depth = parse_dimensions_measure(dimensions, \"D\")\n",
" dimensions = {\n",
" \"height\": height,\n",
" \"width\": width,\n",
" \"depth\": depth,\n",
" }\n",
" return {\n",
" key: units_to_cm(**value)\n",
" for key,value in dimensions.items()\n",
" if value is not None\n",
" }\n",
"\n",
"dimensions = data[\"dimensions\"].apply(parse_dimensions)\n",
"dimensions\n"
]
},
{
"cell_type": "code",
"execution_count": 341,
"id": "26ece1d2-c466-498c-aeb6-5610fb16c7d3",
"metadata": {},
"outputs": [],
"source": [
"def units_to_g(value: float, unit: str) -> float:\n",
" conversions = {\n",
" \"pounds\": 453.592,\n",
" \"ounces\": 28.3495,\n",
" \"g\": 1\n",
" }\n",
" return value * conversions[unit]\n",
"\n",
"def parse_weight(weight: str):\n",
" if weight is None:\n",
" return None\n",
" expr = f\"(?P<value>\\d*[.,]?\\d*)\\s+(?P<unit>[a-zA-Z]*)\"\n",
"\n",
" # strip is needed to prevent the regex from lazily\n",
" # matching just from the first whitespace separator,\n",
" # this could happen because the number part in the\n",
" # expression is technically all optional, to avoid\n",
" # an expression too complex and unreadable\n",
" if match := re.search(expr, weight.strip()):\n",
" value = float(match.group(\"value\"))\n",
" unit = match.group(\"unit\").lower()\n",
" return units_to_g(value, unit)\n",
"\n",
" return weight\n",
"\n",
"def calculate_dimensional_weight(dimensions: dict[str,float]):\n",
" \"\"\"The dimensional weight (in kg) is calculated as\n",
" Length * Height * Width (in cm) / 5000.\n",
" We'll return it in g here\"\"\"\n",
" if dimensions is None:\n",
" return None\n",
" \n",
" height = dimensions.get(\"height\")\n",
" width = dimensions.get(\"width\")\n",
" depth = dimensions.get(\"depth\")\n",
" if None in [height, width, depth]:\n",
" return None\n",
" \n",
" return height * width * depth / 5000\n",
"\n",
"weight = data[\"weight\"].apply(parse_weight)\n",
"# weight\n",
"\n",
"dimensional_weight = list(map(calculate_dimensional_weight, dimensions))\n",
"# dimensional_weight\n",
"\n",
"sel = weight.isnull()\n",
"weight[sel] = pd.Series(dimensional_weight)[sel]"
]
},
{
"cell_type": "code",
"execution_count": 404,
"id": "e6a61e8d-5c55-4312-a48a-a168817e7d77",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 None\n",
"1 [cardboard]\n",
"2 None\n",
"3 [cardboard]\n",
"4 [metal]\n",
" ... \n",
"162 [plastic]\n",
"163 None\n",
"164 [fabric]\n",
"165 None\n",
"166 [stoneware]\n",
"Name: raw_specifications, Length: 167, dtype: object"
]
},
"execution_count": 404,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"materials = data[\"raw_specifications\"].apply(parse_value, value=\"materials\")\n",
"\n",
"# scoreable materials are:\n",
"# * metal\n",
"# * wood\n",
"# * glass\n",
"# * resin\n",
"# * fabric\n",
"# * plastic\n",
"\n",
"def material_classifier(material: str) -> str:\n",
" \"\"\"I will to match materials to one of the scoreable ones:\n",
" * metal\n",
" * wood\n",
" * glass\n",
" * resin\n",
" * fabric\n",
" * plastic\n",
" I found a few, like stoneware and cardboard that I can't fit\n",
" there, they'll have to remain unscored for now\"\"\"\n",
"\n",
" mapping = {\n",
" \"polyester\": \"fabric\",\n",
" \"spandex\": \"fabric\",\n",
" \"leather\": \"fabric\",\n",
" \"cardboard\": \"carboard\",\n",
" \"crystal\": \"glass\",\n",
" \"hardwood\": \"wood\",\n",
" \"plywood\": \"wood\",\n",
" \"mdf\": \"wood\",\n",
" \"wood\": \"wood\",\n",
" \"steel\": \"metal\",\n",
" \"polycarbonate\": \"plastic\",\n",
" \"polypropylene\": \"plastic\",\n",
" \"pvc\": \"plastic\",\n",
" \"resin\": \"plastic\",\n",
" \"stoneware\": \"stoneware\",\n",
" \"cardboard\": \"cardboard\",\n",
" \"paper\": \"cardboard\",\n",
" }\n",
" for key, value in mapping.items():\n",
" if key in material:\n",
" return value\n",
" return material\n",
"\n",
"def clean_material_name(material: str) -> str:\n",
" no_paren_annotations = re.sub(\"\\(.*\\)\", \"\", material)\n",
" no_amounts = re.sub(\"\\d+%?\", \"\", no_paren_annotations)\n",
" return no_amounts.strip().lower()\n",
" \n",
"def parse_materials(materials: str):\n",
" if materials is None:\n",
" return\n",
" material_ls = [\n",
" material_classifier(clean_material_name(x))\n",
" for x in materials.split(\",\")\n",
" ]\n",
" return list(set(material_ls))\n",
"\n",
"clean_materials = materials.apply(parse_materials)\n",
"clean_materials"
]
},
{
"cell_type": "code",
"execution_count": 419,
"id": "f6fcaae9-9c24-450c-8737-8b6aa0e406a9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1\n",
"1 1\n",
"2 1\n",
"3 24\n",
"4 1\n",
" ... \n",
"162 1\n",
"163 1\n",
"164 1\n",
"165 1\n",
"166 1\n",
"Name: raw_specifications, Length: 167, dtype: object"
]
},
"execution_count": 419,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"packaging = data[\"raw_specifications\"].apply(parse_value, value=\"packaging\")\n",
"packaging[packaging.isnull()] = 1\n",
"packaging"
]
},
{
"cell_type": "code",
"execution_count": 429,
"id": "f6ed2736-26d2-403f-9ab0-a228057176e4",
"metadata": {},
"outputs": [],
"source": [
"def clean_origin_name(origin: str) -> str:\n",
" mapping = {\n",
" \"assem usa w/foreign/dom. parts\": \"mixed\",\n",
" \"imported\": \"imported\",\n",
" \"made in the usa\": \"usa\",\n",
" \"made in the usa or imported\": \"mixed\",\n",
" }\n",
" origin = origin.lower().strip()\n",
" return mapping[origin]\n",
"\n",
"origin = data[\"raw_specifications\"].apply(parse_value, value=\"origin\")\n",
"clean_origin = origin.apply(clean_origin_name)\n"
]
},
{
"cell_type": "code",
"execution_count": 453,
"id": "d2df0c7e-cd03-4628-ae29-fae4f0f37dfa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(57, 9)"
]
},
"execution_count": 453,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_data = pd.DataFrame(\n",
" data={\n",
" \"materials\": clean_materials,\n",
" \"packaging\": packaging,\n",
" \"origin\": clean_origin,\n",
" \"weight\": weight,\n",
" \"height\": dimensions.apply(get_val, i=\"height\"),\n",
" \"width\": dimensions.apply(get_val, i=\"width\"),\n",
" \"depth\": dimensions.apply(get_val, i=\"depth\"),\n",
" \"tcin\": data[\"raw_specifications\"].apply(parse_value, value=\"tcin\"),\n",
" \"primary_category\": data[\"primary_category\"]\n",
" }\n",
")\n",
"\n",
"clean_data\n",
"\n",
"clean_data[~clean_data.isnull().any(axis=1)].shape"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}