dayrize-usecase/notebooks/exploration.ipynb

1448 lines
78 KiB
Plaintext
Raw Normal View History

2023-06-21 15:46:28 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 47,
2023-06-21 15:46:28 +02:00
"id": "98ded03d-0208-4416-a5e5-720a2e0742fa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>url</th>\n",
" <th>brand</th>\n",
" <th>main_image</th>\n",
" <th>sku</th>\n",
" <th>description</th>\n",
" <th>raw_description</th>\n",
" <th>gtin13</th>\n",
" <th>currency</th>\n",
" <th>price</th>\n",
" <th>...</th>\n",
" <th>sub_category_1</th>\n",
" <th>sub_category_2</th>\n",
" <th>sub_category_3</th>\n",
" <th>images</th>\n",
" <th>raw_specifications</th>\n",
" <th>specifications</th>\n",
" <th>highlights</th>\n",
" <th>raw_highlights</th>\n",
" <th>uniq_id</th>\n",
" <th>scraped_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NMR Distribution Zoltar Fortunes Playing Cards...</td>\n",
" <td>https://www.target.com/p/nmr-distribution-zolt...</td>\n",
" <td>NMR Distribution</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>81917300</td>\n",
" <td>Zoltar the Great Gypsy can see your future… Yo...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>840391145528</td>\n",
" <td>USD</td>\n",
" <td>10.99</td>\n",
" <td>...</td>\n",
" <td>Games</td>\n",
" <td>Adult Games</td>\n",
" <td>NaN</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Suggested Age: 6 Years and Up | Suggested Age:...</td>\n",
" <td>ZOLTAR CARDS: Zoltar the Great Gypsy can see y...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>898ca3c8-8bfa-5fac-a48e-53879845cf48</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>The Gifted Stationery 2021 - 2022 Monthly Wall...</td>\n",
" <td>https://www.target.com/p/the-gifted-stationery...</td>\n",
" <td>The Gifted Stationary</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>84821007</td>\n",
" <td>16 month wall calendar provides easy planning ...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>9781801433983</td>\n",
" <td>USD</td>\n",
" <td>12.99</td>\n",
" <td>...</td>\n",
" <td>Calendars</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Dimensions (Overall): 11.9 inches (L), 11.9 in...</td>\n",
" <td>16-MONTH CALENDAR: Easy planning and goal sett...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>b7c8e8ce-55b6-529d-b52e-dbfbd70b066e</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Doctor Who: Series Three, Part Two (DVD)</td>\n",
" <td>https://www.target.com/p/doctor-who-series-thr...</td>\n",
" <td>Warner Bros.</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>15432753</td>\n",
" <td>Tenth Doctor David Tennant (Broadchurch, Harry...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>883929408115</td>\n",
" <td>USD</td>\n",
" <td>10.89</td>\n",
" <td>...</td>\n",
" <td>Movies</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Movie Category: Television | Movie Category: T...</td>\n",
" <td>Run Time: 315:00 min | Disc Count: 2 | Rating:...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>97fd83d8-e76b-5bac-999c-b734fdcafe66</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Blue Panda 75 Pieces Tie Dye Birthday Party Su...</td>\n",
" <td>https://www.target.com/p/blue-panda-75-pieces-...</td>\n",
" <td>Blue Panda</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>84199597</td>\n",
" <td>Throwing a party has never been easier, weve ...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>194425194489</td>\n",
" <td>USD</td>\n",
" <td>26.99</td>\n",
" <td>...</td>\n",
" <td>Birthday Party Supplies</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Number of Pieces: 24 | Number of Pieces: 24 | ...</td>\n",
" <td>Serves 24: Includes 1 plastic table cover, a b...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>3b4d60b0-0888-596e-babe-d9e6e4e3131c</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Sullivans Hammered Metal Wall Medallions Set o...</td>\n",
" <td>https://www.target.com/p/sullivans-hammered-me...</td>\n",
" <td>Sullivans</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>86345566</td>\n",
" <td>Bring a little charm and delight to your space...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>23271231140</td>\n",
" <td>USD</td>\n",
" <td>118.99</td>\n",
" <td>...</td>\n",
" <td>Home Decor</td>\n",
" <td>Wall Decor</td>\n",
" <td>Wall Accents</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Dimensions (Overall): 23 inches (H) x 1 inches...</td>\n",
" <td>This unique wall set will be the perfect addit...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>036188d9-4546-5952-af8f-bc234d1f8113</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" title \\\n",
"0 NMR Distribution Zoltar Fortunes Playing Cards... \n",
"1 The Gifted Stationery 2021 - 2022 Monthly Wall... \n",
"2 Doctor Who: Series Three, Part Two (DVD) \n",
"3 Blue Panda 75 Pieces Tie Dye Birthday Party Su... \n",
"4 Sullivans Hammered Metal Wall Medallions Set o... \n",
"\n",
" url brand \\\n",
"0 https://www.target.com/p/nmr-distribution-zolt... NMR Distribution \n",
"1 https://www.target.com/p/the-gifted-stationery... The Gifted Stationary \n",
"2 https://www.target.com/p/doctor-who-series-thr... Warner Bros. \n",
"3 https://www.target.com/p/blue-panda-75-pieces-... Blue Panda \n",
"4 https://www.target.com/p/sullivans-hammered-me... Sullivans \n",
"\n",
" main_image sku \\\n",
"0 https://target.scene7.com/is/image/Target/GUES... 81917300 \n",
"1 https://target.scene7.com/is/image/Target/GUES... 84821007 \n",
"2 https://target.scene7.com/is/image/Target/GUES... 15432753 \n",
"3 https://target.scene7.com/is/image/Target/GUES... 84199597 \n",
"4 https://target.scene7.com/is/image/Target/GUES... 86345566 \n",
"\n",
" description \\\n",
"0 Zoltar the Great Gypsy can see your future… Yo... \n",
"1 16 month wall calendar provides easy planning ... \n",
"2 Tenth Doctor David Tennant (Broadchurch, Harry... \n",
"3 Throwing a party has never been easier, weve ... \n",
"4 Bring a little charm and delight to your space... \n",
"\n",
" raw_description gtin13 currency \\\n",
"0 <div class=\"h-margin-v-default\" data-test=\"ite... 840391145528 USD \n",
"1 <div class=\"h-margin-v-default\" data-test=\"ite... 9781801433983 USD \n",
"2 <div class=\"h-margin-v-default\" data-test=\"ite... 883929408115 USD \n",
"3 <div class=\"h-margin-v-default\" data-test=\"ite... 194425194489 USD \n",
"4 <div class=\"h-margin-v-default\" data-test=\"ite... 23271231140 USD \n",
"\n",
" price ... sub_category_1 sub_category_2 sub_category_3 \\\n",
"0 10.99 ... Games Adult Games NaN \n",
"1 12.99 ... Calendars NaN NaN \n",
"2 10.89 ... Movies NaN NaN \n",
"3 26.99 ... Birthday Party Supplies NaN NaN \n",
"4 118.99 ... Home Decor Wall Decor Wall Accents \n",
"\n",
" images \\\n",
"0 https://target.scene7.com/is/image/Target/GUES... \n",
"1 https://target.scene7.com/is/image/Target/GUES... \n",
"2 https://target.scene7.com/is/image/Target/GUES... \n",
"3 https://target.scene7.com/is/image/Target/GUES... \n",
"4 https://target.scene7.com/is/image/Target/GUES... \n",
"\n",
" raw_specifications \\\n",
"0 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"1 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"2 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"3 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"4 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"\n",
" specifications \\\n",
"0 Suggested Age: 6 Years and Up | Suggested Age:... \n",
"1 Dimensions (Overall): 11.9 inches (L), 11.9 in... \n",
"2 Movie Category: Television | Movie Category: T... \n",
"3 Number of Pieces: 24 | Number of Pieces: 24 | ... \n",
"4 Dimensions (Overall): 23 inches (H) x 1 inches... \n",
"\n",
" highlights \\\n",
"0 ZOLTAR CARDS: Zoltar the Great Gypsy can see y... \n",
"1 16-MONTH CALENDAR: Easy planning and goal sett... \n",
"2 Run Time: 315:00 min | Disc Count: 2 | Rating:... \n",
"3 Serves 24: Includes 1 plastic table cover, a b... \n",
"4 This unique wall set will be the perfect addit... \n",
"\n",
" raw_highlights \\\n",
"0 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"1 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"2 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"3 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"4 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"\n",
" uniq_id scraped_at \n",
"0 898ca3c8-8bfa-5fac-a48e-53879845cf48 06/12/22 \n",
"1 b7c8e8ce-55b6-529d-b52e-dbfbd70b066e 06/12/22 \n",
"2 97fd83d8-e76b-5bac-999c-b734fdcafe66 06/12/22 \n",
"3 3b4d60b0-0888-596e-babe-d9e6e4e3131c 06/12/22 \n",
"4 036188d9-4546-5952-af8f-bc234d1f8113 06/12/22 \n",
"\n",
"[5 rows x 24 columns]"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
2023-06-21 15:46:28 +02:00
"source": [
"import functools\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
2023-06-21 15:46:28 +02:00
"from IPython.display import display, HTML\n",
"\n",
"in_file = \"/home/jovyan/data/large_target_store_products_dataset_sample - large_target_store_products_dataset_sample.csv\"\n",
"data = pd.read_csv(in_file)\n",
"data.head()"
2023-06-21 15:46:28 +02:00
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c20c14ea-9ef6-4d40-8b7d-4731d0866239",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['title', 'url', 'brand', 'main_image', 'sku', 'description',\n",
" 'raw_description', 'gtin13', 'currency', 'price', 'availability',\n",
" 'availableDeliveryMethod', 'available_branch', 'primary_category',\n",
" 'sub_category_1', 'sub_category_2', 'sub_category_3', 'images',\n",
" 'raw_specifications', 'specifications', 'highlights', 'raw_highlights',\n",
" 'uniq_id', 'scraped_at'],\n",
" dtype='object')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.axes[1]"
]
},
{
"cell_type": "code",
"execution_count": 3,
2023-06-21 15:46:28 +02:00
"id": "165723d1-8152-4e30-b25a-cbca7f4935f9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def render_html(html: str):\n",
"    \"\"\"Show an HTML snippet as rendered markup in the cell output.\n",
"\n",
"    The string is wrapped in ``IPython.display.HTML`` and passed to ``display``;\n",
"    nothing is returned.\n",
"    \"\"\"\n",
"    # NOTE(review): the markup is rendered as-is, so only pass content you trust.\n",
"    snippet = HTML(html)\n",
"    display(snippet)\n",
"\n",
2023-06-21 15:46:28 +02:00
"render_html(data.loc[0, 'raw_specifications'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
2023-06-21 15:46:28 +02:00
"id": "d153a732-f18c-4fb7-9282-9c02c49c3cc0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 11.9 inches (L), 11.9 inches (W)</div><hr/></div><div><div><b>Dated format:</b> Monthly</div><hr/></div><div><div><b>Calendar year:</b> 2022</div><hr/></div><div><div><b>Material:</b> Paper</div><hr/></div><div><b>TCIN</b>: <!-- -->84821007<hr/></div><div><b>UPC</b>: <!-- -->9781801433983<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def look_for_matches(data: pd.DataFrame, pattern: str, colname : str = \"raw_specifications\") -> str:\n",
"    \"\"\"Return the first value of ``data[colname]`` that contains ``pattern``.\n",
"\n",
"    Handy for pulling up one example cell (by default from ``raw_specifications``)\n",
"    that mentions a given specification name.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : frame holding the column to search.\n",
"    pattern : text to look for. It is passed to ``Series.str.contains`` with its\n",
"        default settings, so it is interpreted as a regular expression.\n",
"    colname : name of the string column to search.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The first matching cell, in the frame's row order.\n",
"\n",
"    Raises\n",
"    ------\n",
"    IndexError\n",
"        If no cell in ``colname`` matches ``pattern``.\n",
"    \"\"\"\n",
"    # na=False: missing cells count as non-matching instead of putting NaN in the\n",
"    # boolean mask, which .loc cannot index with.\n",
"    is_match = data.loc[:, colname].str.contains(pattern, na=False)\n",
"    matches = data.loc[is_match, colname]\n",
"    if matches.empty:\n",
"        # Same exception type as the bare .iloc[0] used to raise, but with a message\n",
"        # that says what was being searched for.\n",
"        raise IndexError(f\"no value in column {colname!r} contains pattern {pattern!r}\")\n",
"    return matches.iloc[0]\n",
"\n",
2023-06-21 15:46:28 +02:00
"render_html(look_for_matches(data, \"Material\"))"
]
},
{
"cell_type": "code",
"execution_count": 6,
2023-06-21 15:46:28 +02:00
"id": "b06e66af-e0e3-466c-8797-546d5e076f32",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Contains:</b> Does Not Contain Any of the 8 Major Allergens</div><hr/></div><div><div><b>Dietary Needs:</b> Gluten Free</div><hr/></div><div><div><b>Form:</b> Pieces</div><hr/></div><div><div><b>State of Readiness:</b> Ready to Eat</div><hr/></div><div><div><b>Package Quantity:</b> 1</div><hr/></div><div><div><b>Net weight:</b> 15.6 Ounces</div><hr/></div><div><b>TCIN</b>: <!-- -->54571204<hr/></div><div><b>UPC</b>: <!-- -->022000279729<hr/></div><div><b>Item Number (DPCI)</b>: <!-- -->055-02-1211<hr/></div><div><b>Origin</b>: <!-- -->Made in the USA or Imported<hr/></div><div><b>Grocery Disclaimer</b>:<!-- --> <div>Content on this site is for reference purposes only. Target does not represent or warrant that the nutrition, ingredient, allergen and other product information on our Web or Mobile sites are accurate or complete, since this information comes from the product manufacturers. On occasion, manufacturers may improve or change their product formulas and update their labels. We recommend that you do not rely solely on the information presented on our Web or Mobile sites and that you review the product's label or contact the manufacturer directly if you have specific product concerns or questions. If you have specific healthcare concerns or questions about the products displayed, please contact your licensed healthcare professional for advice or answers. Any additional pictures are suggested servings only.</div></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Package\"))"
]
},
{
"cell_type": "code",
"execution_count": 7,
2023-06-21 15:46:28 +02:00
"id": "015a0d99-b4c2-47cf-9e23-59485a450470",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 23 inches (H) x 1 inches (W) x 23 inches (D)</div><hr/></div><div><div><b>Weight:</b> 4.65 pounds</div><hr/></div><div><div><b>Art subject:</b> Geometric Shapes</div><hr/></div><div><div><b>Orientation:</b> Vertical</div><hr/></div><div><div><b>Material:</b> Metal</div><hr/></div><div><div><b>Battery:</b> No Battery Used</div><hr/></div><div><b>TCIN</b>: <!-- -->86345566<hr/></div><div><b>UPC</b>: <!-- -->023271231140<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Weight\"))"
]
},
{
"cell_type": "code",
"execution_count": 8,
2023-06-21 15:46:28 +02:00
"id": "99cbfba8-4324-4d2a-b2e2-d766a5e57964",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Dimensions (Overall):</b> 11.9 inches (L), 11.9 inches (W)</div><hr/></div><div><div><b>Dated format:</b> Monthly</div><hr/></div><div><div><b>Calendar year:</b> 2022</div><hr/></div><div><div><b>Material:</b> Paper</div><hr/></div><div><b>TCIN</b>: <!-- -->84821007<hr/></div><div><b>UPC</b>: <!-- -->9781801433983<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Dimensions\"))"
]
},
{
"cell_type": "code",
"execution_count": 9,
2023-06-21 15:46:28 +02:00
"id": "6c05e1db-7181-422e-924b-e67a28886abe",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"TCIN\"))"
]
},
{
"cell_type": "code",
"execution_count": 10,
2023-06-21 15:46:28 +02:00
"id": "bf3a07d9-11c0-48c4-be03-373a3ba5f755",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight\" data-test=\"item-details-specifications\"><h3 class=\"h-text-bs h-margin-b-tight\">Specifications</h3><div><div><b>Suggested Age:</b> 6 Years and Up</div><hr/></div><div><div><b>CPSC Choking Hazard Warnings:</b> Choking_hazard_small_parts</div><hr/></div><div><b>TCIN</b>: <!-- -->81917300<hr/></div><div><b>UPC</b>: <!-- -->840391145528<hr/></div><div><b>Origin</b>: <!-- -->imported<hr/></div><div data-test=\"itemDetailsTabMarketplaceMessage\"><p class=\"h-padding-t-x2\">The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.</p><p class=\"h-padding-t-x2\">We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.</p></div></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"render_html(look_for_matches(data, \"Origin\"))"
]
},
{
"cell_type": "code",
"execution_count": 11,
2023-06-21 15:46:28 +02:00
"id": "3853dd07-1977-44c6-bc97-2ecdaa98980e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"raw_specifications\n",
"False 124\n",
"True 43\n",
"Name: count, dtype: int64"
]
},
"execution_count": 11,
2023-06-21 15:46:28 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"colname = \"raw_specifications\"\n",
"patterns = [\"Material\", \"Package Quantity\", \"Weight\", \"Dimensions\", \"TCIN\", \"Origin\"]\n",
"\n",
"(data.loc[:, colname].str.contains(\"Package Quantity\") | data.loc[:, colname].str.contains(\"Number of Pieces\")).value_counts()\n",
"\n",
"# Package Quantity and Number of Pieces are never found together. Maybe they refer to the same thing?"
]
},
{
"cell_type": "code",
"execution_count": 12,
2023-06-21 15:46:28 +02:00
"id": "be8a7628-9199-4f25-a533-d041a7ff540c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>url</th>\n",
" <th>brand</th>\n",
" <th>main_image</th>\n",
" <th>sku</th>\n",
" <th>description</th>\n",
" <th>raw_description</th>\n",
" <th>gtin13</th>\n",
" <th>currency</th>\n",
" <th>price</th>\n",
" <th>...</th>\n",
" <th>sub_category_1</th>\n",
" <th>sub_category_2</th>\n",
" <th>sub_category_3</th>\n",
" <th>images</th>\n",
" <th>raw_specifications</th>\n",
" <th>specifications</th>\n",
" <th>highlights</th>\n",
" <th>raw_highlights</th>\n",
" <th>uniq_id</th>\n",
" <th>scraped_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>NCAA Illinois Fighting Illini Circo Cheese Cut...</td>\n",
" <td>https://www.target.com/p/ncaa-illinois-fightin...</td>\n",
" <td>NCAA</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>79646040</td>\n",
" <td>Reach out to the complex cheese lover in your ...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>99967205276</td>\n",
" <td>USD</td>\n",
" <td>58.95</td>\n",
" <td>...</td>\n",
" <td>Sports Fan Shop</td>\n",
" <td>Sports Fan Shop Home Goods</td>\n",
" <td>Sports Fan Shop Barware &amp; Drinkware</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Number of Pieces: 5 | Number of Pieces: 5 | We...</td>\n",
" <td>BEAUTY &amp; ELEGANCE - The Circo swivel-style cir...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>0c549116-75c8-56cb-8877-165380d0efd9</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Blue Panda Jumbo Dinosaur Floor Puzzle, Double...</td>\n",
" <td>https://www.target.com/p/blue-panda-jumbo-dino...</td>\n",
" <td>Blue Panda</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>80405355</td>\n",
" <td>Package Includes\\r\\nLarge Dinosaur Floor Puzzl...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>194425203808</td>\n",
" <td>USD</td>\n",
" <td>19.99</td>\n",
" <td>...</td>\n",
" <td>Puzzles</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Number of Pieces: 17 | Number of Pieces: 17 | ...</td>\n",
" <td>JUMBO DINOSAUR PUZZLE: This t-rex foam puzzle ...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>151c72b4-4856-502f-a508-961cc81fffa9</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Women's Round Aviator Sunglasses - Universal T...</td>\n",
" <td>https://www.target.com/p/women-39-s-round-avia...</td>\n",
" <td>Universal Thread</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>84201225</td>\n",
" <td>Round out your eyewear collection with the Rou...</td>\n",
" <td>&lt;div class=\"h-margin-v-default\" data-test=\"ite...</td>\n",
" <td>195995526496</td>\n",
" <td>USD</td>\n",
" <td>15.00</td>\n",
" <td>...</td>\n",
" <td>Eye Care</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>https://target.scene7.com/is/image/Target/GUES...</td>\n",
" <td>&lt;div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...</td>\n",
" <td>Material: Metal (Frame) | Material: Metal (Fra...</td>\n",
" <td>Universal Thread round aviator sunglasses with...</td>\n",
" <td>&lt;li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\"&gt;...</td>\n",
" <td>2a803c0f-00bf-50a6-a490-d381620ac3a3</td>\n",
" <td>06/12/22</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" title \\\n",
"8 NCAA Illinois Fighting Illini Circo Cheese Cut... \n",
"13 Blue Panda Jumbo Dinosaur Floor Puzzle, Double... \n",
"14 Women's Round Aviator Sunglasses - Universal T... \n",
"\n",
" url brand \\\n",
"8 https://www.target.com/p/ncaa-illinois-fightin... NCAA \n",
"13 https://www.target.com/p/blue-panda-jumbo-dino... Blue Panda \n",
"14 https://www.target.com/p/women-39-s-round-avia... Universal Thread \n",
"\n",
" main_image sku \\\n",
"8 https://target.scene7.com/is/image/Target/GUES... 79646040 \n",
"13 https://target.scene7.com/is/image/Target/GUES... 80405355 \n",
"14 https://target.scene7.com/is/image/Target/GUES... 84201225 \n",
"\n",
" description \\\n",
"8 Reach out to the complex cheese lover in your ... \n",
"13 Package Includes\\r\\nLarge Dinosaur Floor Puzzl... \n",
"14 Round out your eyewear collection with the Rou... \n",
"\n",
" raw_description gtin13 currency \\\n",
"8 <div class=\"h-margin-v-default\" data-test=\"ite... 99967205276 USD \n",
"13 <div class=\"h-margin-v-default\" data-test=\"ite... 194425203808 USD \n",
"14 <div class=\"h-margin-v-default\" data-test=\"ite... 195995526496 USD \n",
"\n",
" price ... sub_category_1 sub_category_2 \\\n",
"8 58.95 ... Sports Fan Shop Sports Fan Shop Home Goods \n",
"13 19.99 ... Puzzles NaN \n",
"14 15.00 ... Eye Care NaN \n",
"\n",
" sub_category_3 \\\n",
"8 Sports Fan Shop Barware & Drinkware \n",
"13 NaN \n",
"14 NaN \n",
"\n",
" images \\\n",
"8 https://target.scene7.com/is/image/Target/GUES... \n",
"13 https://target.scene7.com/is/image/Target/GUES... \n",
"14 https://target.scene7.com/is/image/Target/GUES... \n",
"\n",
" raw_specifications \\\n",
"8 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"13 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"14 <div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd... \n",
"\n",
" specifications \\\n",
"8 Number of Pieces: 5 | Number of Pieces: 5 | We... \n",
"13 Number of Pieces: 17 | Number of Pieces: 17 | ... \n",
"14 Material: Metal (Frame) | Material: Metal (Fra... \n",
"\n",
" highlights \\\n",
"8 BEAUTY & ELEGANCE - The Circo swivel-style cir... \n",
"13 JUMBO DINOSAUR PUZZLE: This t-rex foam puzzle ... \n",
"14 Universal Thread round aviator sunglasses with... \n",
"\n",
" raw_highlights \\\n",
"8 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"13 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"14 <li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">... \n",
"\n",
" uniq_id scraped_at \n",
"8 0c549116-75c8-56cb-8877-165380d0efd9 06/12/22 \n",
"13 151c72b4-4856-502f-a508-961cc81fffa9 06/12/22 \n",
"14 2a803c0f-00bf-50a6-a490-d381620ac3a3 06/12/22 \n",
"\n",
"[3 rows x 24 columns]"
]
},
"execution_count": 12,
2023-06-21 15:46:28 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose raw specifications mention any kind of \"Dimensions\" entry.\n",
"sel = data.loc[:, \"raw_specifications\"].str.contains(\"Dimensions\")\n",
"dimensions = data.loc[sel]\n",
"# Among those, flag the rows that do NOT use the \"Overall\" wording. The mask is built\n",
"# from `dimensions` directly: `sel` is indexed like `data`, so reusing it here only\n",
"# worked through index alignment and filtered nothing (every row already satisfies it).\n",
"sel_overall = ~dimensions.loc[:, \"raw_specifications\"].str.contains(\"Overall\")\n",
"#for x in dimensions.loc[sel_overall, \"raw_specifications\"]:\n",
"# render_html(x)\n",
"dimensions.loc[sel_overall]\n",
"\n",
"# Looks like \"dimensions\" can be \"Dimensions (Overall)\", \"Dimensions\" or other things\n",
"# like \"Assembled Dimensions\" or \"Piece X Dimensions\". But these latter two options are\n",
"# incomplete (lack the height), harder to parse and rare enough that I'll just drop them"
]
},
{
"cell_type": "code",
"execution_count": 13,
2023-06-21 15:46:28 +02:00
"id": "3fd14c61-c1f8-4b18-b04a-9b616e6cf9a7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"Dimensions (Overall): 11.9 inches (L), 11.9 inches (W) | Dimensions (Overall): 11.9 inches (L), 11.9 inches (W) | Dated format: Monthly | Dated format: Monthly | Calendar year: 2022 | Calendar year: 2022 | Material: Paper | Material: Paper | TCIN: 84821007 | UPC: 9781801433983 | Origin: imported | The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.\""
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
2023-06-21 15:46:28 +02:00
"source": [
"import xml.etree.ElementTree as ET\n",
"\n",
2023-06-21 15:46:28 +02:00
"# There seem to be dupliucates on \"specifications\" that are not \n",
"# found on \"raw_specifications\"\n",
"# I think it's safe to just remove the duplicates\n",
"\n",
"specifications = data.loc[1,\"specifications\"]\n",
"specifications\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "88894d7c-dc68-42ee-9c9d-a53474762a7a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 None\n",
"1 Paper\n",
"2 None\n",
"3 Paper\n",
"4 Metal\n",
" ... \n",
"162 Plastic\n",
"163 None\n",
"164 Fabric\n",
"165 None\n",
"166 Stoneware\n",
"Name: raw_specifications, Length: 167, dtype: object\n",
"0 None\n",
"1 None\n",
"2 None\n",
"3 24\n",
"4 None\n",
" ... \n",
"162 None\n",
"163 None\n",
"164 None\n",
"165 None\n",
"166 1\n",
"Name: raw_specifications, Length: 167, dtype: object\n",
"0 None\n",
"1 11.9 inches (L), 11.9 inches (W)\n",
"2 None\n",
"3 None\n",
"4 23 inches (H) x 1 inches (W) x 23 inches (D)\n",
" ... \n",
"162 5.0 inches (W) x 9.5 inches (D)\n",
"163 12.0 inches (H) x 12.0 inches (W)\n",
"164 4.75 inches (H) x 2.25 inches (W) x 2.25 inch...\n",
"165 None\n",
"166 4.5 inches (H) x 12.5 inches (W) x 4.5 inches...\n",
"Name: raw_specifications, Length: 167, dtype: object\n",
"0 None\n",
"1 None\n",
"2 None\n",
"3 None\n",
"4 4.65 pounds\n",
" ... \n",
"162 .28 pounds\n",
"163 32.0 pounds\n",
"164 None\n",
"165 None\n",
"166 None\n",
"Name: raw_specifications, Length: 167, dtype: object\n",
"0 81917300\n",
"1 84821007\n",
"2 15432753\n",
"3 84199597\n",
"4 86345566\n",
" ... \n",
"162 83388852\n",
"163 80836585\n",
"164 75477923\n",
"165 85634544\n",
"166 80239765\n",
"Name: raw_specifications, Length: 167, dtype: object\n",
"0 imported\n",
"1 imported\n",
"2 Made in the USA\n",
"3 imported\n",
"4 imported\n",
" ... \n",
"162 made in the USA or imported\n",
"163 made in the USA or imported\n",
"164 made in the USA or imported\n",
"165 imported\n",
"166 imported\n",
"Name: raw_specifications, Length: 167, dtype: object\n"
]
}
],
"source": [
2023-06-21 15:46:28 +02:00
"def parse_specs(specifications: str) -> dict[str,str]:\n",
" fields_mapping = {\n",
" \"Material\": \"materials\",\n",
" \"Package Quantity\": \"packaging\",\n",
" \"Number of Pieces\": \"packaging\",\n",
" \"Dimensions (Overall)\": \"dimensions\",\n",
" \"Dimensions\": \"dimensions\",\n",
" \"Weight\": \"weight\",\n",
" \"TCIN\": \"tcin\",\n",
" \"Origin\": \"origin\", \n",
" }\n",
" spec_dict = {}\n",
" for spec in specifications.split(\"|\"):\n",
" if \":\" in spec:\n",
" try:\n",
" field, value = spec.split(\":\")\n",
" except ValueError:\n",
" print(spec)\n",
" return {}\n",
" field = field.strip()\n",
" if field in fields_mapping:\n",
" field = fields_mapping[field]\n",
" spec_dict[field] = value.strip()\n",
" return spec_dict\n",
"\n",
"\n",
"\n",
"def iter_parse(root: ET.Element) -> dict[str,str]:\n",
" \"\"\"Recursively parse the XML tree into a dictionary\n",
" Each key/value pair is inside it's own <div> tag and\n",
" the key inside a <b> tag.\n",
" The fields that I believe are compulsory (TCIN, UPC and Origin)\n",
" are only nested one level deep, while the rest of fields seem\n",
" to be always nested two levels deep. But parsing it recursively\n",
" helps generalise both cases.\"\"\"\n",
" \n",
" spec_dict = {}\n",
" for child in root:\n",
" if child.tag == \"div\":\n",
" if \"b\" in [x.tag for x in child]:\n",
" key, *values = child.itertext()\n",
" key = key.strip(\":\")\n",
" value = \"\".join(values).strip(\":\")\n",
" spec_dict[key] = value\n",
" else:\n",
" spec_dict.update(iter_parse(child))\n",
" return spec_dict\n",
"\n",
"def parse_raw_specs(raw_specs: str) -> dict[str,str]:\n",
" \"\"\"Parse a raw specifications XML string into a dictionary\n",
" This involves first recursively parsing the XML tree and then\n",
" renaming the key values\"\"\"\n",
" \n",
" fields_mapping = {\n",
" \"Material\": \"materials\",\n",
" \"Package Quantity\": \"packaging\",\n",
" \"Number of Pieces\": \"packaging\",\n",
" \"Dimensions (Overall)\": \"dimensions\",\n",
" \"Dimensions\": \"dimensions\",\n",
" \"Weight\": \"weight\",\n",
" \"TCIN\": \"tcin\",\n",
" \"Origin\": \"origin\", \n",
" }\n",
" xml_root = ET.fromstring(raw_specs)\n",
" parsed = iter_parse(xml_root)\n",
" specs_dict = {\n",
" fields_mapping[key]: value\n",
" for key, value in parsed.items()\n",
" if key in fields_mapping\n",
" }\n",
" return specs_dict\n",
" \n",
"\n",
"def parse_value(specs: str, value: str) -> str:\n",
" return parse_raw_specs(specs).get(value)\n",
"\n",
"for x in [\"materials\", \"packaging\", \"dimensions\", \"weight\", \"tcin\", \"origin\"]:\n",
" print(data[\"raw_specifications\"].apply(parse_value, value=x))\n"
2023-06-21 15:46:28 +02:00
]
},
{
"cell_type": "code",
"execution_count": 17,
2023-06-21 15:46:28 +02:00
"id": "5902f1b3-a77e-4353-92aa-71cc729e8c87",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 None\n",
"1 {'width': 30.226000000000003}\n",
"2 None\n",
"3 None\n",
"4 {'height': 58.42, 'width': 2.54, 'depth': 58.42}\n",
" ... \n",
"162 {'width': 12.7, 'depth': 24.13}\n",
"163 {'height': 30.48, 'width': 30.48}\n",
"164 {'height': 12.065, 'width': 5.715, 'depth': 5....\n",
"165 None\n",
"166 {'height': 11.43, 'width': 31.75, 'depth': 11.43}\n",
"Name: raw_specifications, Length: 167, dtype: object"
2023-06-21 15:46:28 +02:00
]
},
"execution_count": 17,
2023-06-21 15:46:28 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"from typing import Optional\n",
"\n",
"def parse_dimensions_measure(dimensions: str, measure: str) -> Optional[dict[str,str]]:\n",
" expr = f\"(?P<value>\\d*[.,]?\\d*)\\s+(?P<unit>[a-zA-Z]*)\\s+\\({measure}\\)\" \n",
" if match := re.search(expr, dimensions):\n",
" return {\n",
" \"value\": float(match.group(\"value\")),\n",
" \"unit\": match.group(\"unit\").lower()\n",
" }\n",
"\n",
"def units_to_cm(value: float, unit: str) -> float:\n",
" conversions = {\n",
" \"inches\": 2.54,\n",
" \"feet\": 30.48,\n",
" \"cm\": 1\n",
" }\n",
" return value * conversions[unit]\n",
"\n",
"\n",
"def parse_dimensions(dimensions: Optional[str]) -> Optional[dict[str,float]]:\n",
" if dimensions is None:\n",
" return None\n",
" height = parse_dimensions_measure(dimensions, \"H\")\n",
" width = parse_dimensions_measure(dimensions, \"W\")\n",
" depth = parse_dimensions_measure(dimensions, \"D\")\n",
" dimensions = {\n",
" \"height\": height,\n",
" \"width\": width,\n",
" \"depth\": depth,\n",
" }\n",
" return {\n",
" key: units_to_cm(**value)\n",
" for key,value in dimensions.items()\n",
" if value is not None\n",
" }\n",
"\n",
"dimensions = data[\"raw_specifications\"].apply(parse_value, value=\"dimensions\").apply(parse_dimensions)\n",
"dimensions"
2023-06-21 15:46:28 +02:00
]
},
{
"cell_type": "code",
"execution_count": 51,
2023-06-21 15:46:28 +02:00
"id": "26ece1d2-c466-498c-aeb6-5610fb16c7d3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 NaN\n",
"1 NaN\n",
"2 NaN\n",
"3 NaN\n",
"4 2109.202800\n",
" ... \n",
"162 127.005760\n",
"163 14514.944000\n",
"164 0.078812\n",
"165 NaN\n",
"166 0.829595\n",
"Name: raw_specifications, Length: 167, dtype: float64"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
2023-06-21 15:46:28 +02:00
"source": [
"def units_to_g(value: float, unit: str) -> float:\n",
" conversions = {\n",
" \"pounds\": 453.592,\n",
" \"ounces\": 28.3495,\n",
" \"g\": 1\n",
" }\n",
" return value * conversions[unit]\n",
"\n",
"def parse_weight(weight: str):\n",
" if weight is None:\n",
" return None\n",
" expr = f\"(?P<value>\\d*[.,]?\\d*)\\s+(?P<unit>[a-zA-Z]*)\"\n",
"\n",
" # strip is needed to prevent the regex from lazily\n",
" # matching just from the first whitespace separator,\n",
" # this could happen because the number part in the\n",
" # expression is technically all optional, to avoid\n",
" # an expression too complex and unreadable\n",
" if match := re.search(expr, weight.strip()):\n",
" value = float(match.group(\"value\"))\n",
" unit = match.group(\"unit\").lower()\n",
" return units_to_g(value, unit)\n",
"\n",
" return weight\n",
"\n",
"def calculate_dimensional_weight(dimensions: dict[str,float]):\n",
" \"\"\"The dimensional weight (in kg) is calculated as\n",
" Length * Height * Width (in cm) / 5000.\n",
" We'll return it in g here\"\"\"\n",
" if dimensions is None:\n",
" return None\n",
" \n",
" height = dimensions.get(\"height\")\n",
" width = dimensions.get(\"width\")\n",
" depth = dimensions.get(\"depth\")\n",
" if None in [height, width, depth]:\n",
" return None\n",
" \n",
" return height * width * depth / 5000\n",
"\n",
"weight = data[\"raw_specifications\"].apply(parse_value, value=\"weight\").apply(parse_weight)\n",
2023-06-21 15:46:28 +02:00
"\n",
"dimensional_weight = list(map(calculate_dimensional_weight, dimensions))\n",
"\n",
"sel = weight.isnull()\n",
"weight[sel] = pd.Series(dimensional_weight)[sel]\n",
"\n",
"weight"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "49c7495f-1f36-4dc9-8686-de3af7b230a9",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAGdCAYAAACyzRGfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAxjklEQVR4nO3de3BUZZ7/8U9jQidgiFwknSwEwhojEGEwsBAUE0XCgLK6uDuOIsKOTslwN0WlDPxhcEqCFMNGFoVBkcuyiDsbUKZATKaGBBVQCKHkZsQ1QzKYmI0LCdeEy/P7w0n/aHKBTrqfpOP7VXWqPOc855zv04d+/OT0Od0OY4wRAACAJR1auwAAAPDTQvgAAABWET4AAIBVhA8AAGAV4QMAAFhF+AAAAFYRPgAAgFWEDwAAYFVQaxdwo2vXrum7775TWFiYHA5Ha5cD/CQZY3T27FlFRUWpQ4fA+BuFsQNoXd6MG20ufHz33Xfq3bt3a5cBQFJpaal69erV2mXcEsYOoG24lXGjzYWPsLAwST8W36VLl1auBvhpqq6uVu/evd3vx0DA2AG0Lm/GjTYXPuoul3bp0oUBBGhlgfTxBWMH0DbcyrgRGB/mAgCAdoPwAQAArCJ8AAAAqwgfAADAKq/Dx6lTp/Tss8+qe/fu6tSpk372s5+poKDAvd4Yo4yMDEVFRSk0NFTJyck6evSoT4sGAACBy6vwcfr0ad1///0KDg7WRx99pGPHjul3v/ud7rjjDnebJUuWaNmyZVqxYoX2798vl8ulMWPG6OzZs76uHQAABCCvHrV9/fXX1bt3b61du9a9rG/fvu7/NsYoKytLCxYs0MSJEyVJ69evV0REhDZt2qQXX3zRN1UDAICA5dWVj23btmno0KH6l3/5F/Xs2VNDhgzR22+/7V5fXFys8vJypaSkuJc5nU4lJSVpz549De6zpqZG1dXVHhMAAGi/vAof3377rVauXKnY2Fh9/PHHmjZtmmbPnq0NGzZIksrLyyVJERERHttFRES4190oMzNT4eHh7omvRwYAoH3zKnxcu3ZN9913nxYtWqQhQ4boxRdf1K9//WutXLnSo92N325mjGn0G8/S09NVVVXlnkpLS73sAgAACCRehY/IyEgNGDDAY1n//v1VUlIiSXK5XJJU7ypHRUVFvashdZxOp/vrkPlaZCDwrVy5UoMGDXK/nxMTE/XRRx81uU1+fr4SEhIUEhKifv36adWqVZaqBdAavAof999/v4qKijyWff311+rTp48kKSYmRi6XS7m5ue71tbW1ys/P18iRI31QLoC2rlevXlq8eLEOHDigAwcO6OGHH9bjjz/e6CP3xcXFGj9+vEaNGqXCwkLNnz9fs2fPVnZ2tuXKAVhjvPDFF1+YoKAg89prr5kTJ06Y//zP/zSdOnUyGzdudLdZvHixCQ8PN1u2bDGHDx82Tz/9tImMjDTV1dW3dIyqqiojyVRVVXlTGgAf8vX7sGvXruadd95pcF1aWpq55557PJa9+OKLZsSIEV4dg7EDaF3evAe9etR22LBh2rp1q9LT0/Xqq68qJiZGWVlZmjRpkrtNWlqaLl68qOnTp+v06dMaPny4cnJy/PbT3CUlJaqsrHTP9+jRQ9HR0X45FgDvXL16VX/4wx90/vx5JSYmNthm7969Hk/ISdLYsWO1Zs0aXb58WcHBwQ1uV1NTo5qaGve8L5+Uu3FckRhbAF/yKnxI0mOPPabHHnus0fUOh0MZGRnKyMhoSV23pKSkRHFx/XXp0gX3spCQTioqOs4gAbSiw4cPKzExUZcuXdLtt9+urVu31rtfrE55eXmDT8hduXJFlZWVioyMbHC7zMxMLVy40Oe1NzSuSIwtgC8F9G+7VFZW/m2A2CipQNJGXbp0od5fLADsiouL06FDh7Rv3z795je/0ZQpU3Ts2LFG2zf0hFxDy6/nryfl6o8rjC2Ar3l95aNt6i/pvtYuAsDfdOzYUX
fddZckaejQodq/f7/eeOMN/f73v6/X1uVyNfiEXFBQkLp3797oMZxOp5xOp28L98C4AvhLQF/5ABAYjDEe92dcLzEx0eMJOUnKycnR0KFDG73fA0BgI3wA8Kn58+frk08+0V/+8hcdPnxYCxYsUF5envvG9PT0dD333HPu9tOmTdPJkyeVmpqq48eP691339WaNWs0b9681uoCAD9rJx+7AGgrvv/+e02ePFllZWUKDw/XoEGDtHPnTo0ZM0aSVFZW5v5iQunH7wfasWOHXnrpJb355puKiorS8uXL9eSTT7ZWFwD4GeEDgE+tWbOmyfXr1q2rtywpKUkHDx70U0UA2ho+dgEAAFYRPgAAgFWEDwAAYBXhAwAAWEX4AAAAVhE+AACAVYQPAABgFeEDAABYRfgAAABWET4AAIBVhA8AAGAV4QMAAFhF+AAAAFYRPgAAgFWEDwAAYBXhAwAAWEX4AAAAVhE+AACAVYQPAABgFeEDAABYRfgAAABWET4AAIBVhA8AAGAV4QMAAFhF+AAAAFYRPgAAgFWEDwAAYBXhAwAAWEX4AAAAVhE+AACAVYQPAABgFeEDAABYRfgAAABWET4AAIBVhA8AAGAV4QMAAFhF+AAAAFYRPgAAgFWEDwAAYBXhAwAAWEX4AAAAVhE+AACAVYQPAABglVfhIyMjQw6Hw2NyuVzu9cYYZWRkKCoqSqGhoUpOTtbRo0d9XjQAAAhcXl/5GDhwoMrKytzT4cOH3euWLFmiZcuWacWKFdq/f79cLpfGjBmjs2fP+rRoAAAQuLwOH0FBQXK5XO7pzjvvlPTjVY+srCwtWLBAEydOVHx8vNavX68LFy5o06ZNPi8cAAAEJq/Dx4kTJxQVFaWYmBj98pe/1LfffitJKi4uVnl5uVJSUtxtnU6nkpKStGfPnkb3V1NTo+rqao8JAAC0X16Fj+HDh2vDhg36+OOP9fbbb6u8vFwjR47UDz/8oPLycklSRESExzYRERHudQ3JzMxUeHi4e+rdu3czugEAAAKFV+Fj3LhxevLJJ3XvvffqkUce0fbt2yVJ69evd7dxOBwe2xhj6i27Xnp6uqqqqtxTaWmpNyUBaGMyMzM1bNgwhYWFqWfPnnriiSdUVFTU5DZ5eXn1bmZ3OBz66quvLFUNwKYWPWrbuXNn3XvvvTpx4oT7qZcbr3JUVFTUuxpyPafTqS5dunhMAAJXfn6+ZsyYoX379ik3N1dXrlxRSkqKzp8/f9Nti4qKPG5oj42NtVAxANuCWrJxTU2Njh8/rlGjRikmJkYul0u5ubkaMmSIJKm2tlb5+fl6/fXXfVIsgLZv586dHvNr165Vz549VVBQoAcffLDJbXv27Kk77rjDj9UBaAu8uvIxb9485efnq7i4WJ9//rn++Z//WdXV1ZoyZYocDofmzp2rRYsWaevWrTpy5IimTp2qTp066ZlnnvFX/QDauKqqKklSt27dbtp2yJAhioyM1OjRo7Vr164m23KzOhC4vLry8de//lVPP/20Kisrdeedd2rEiBHat2+f+vTpI0lKS0vTxYsXNX36dJ0+fVrDhw9XTk6OwsLC/FI8gLbNGKPU1FQ98MADio+Pb7RdZGSkVq9erYSEBNXU1Og//uM/NHr0aOXl5TV6tSQzM1MLFy70V+kA/Mir8LF58+Ym1zscDmVkZCgjI6MlNQFoJ2bOnKkvv/xSn376aZPt4uLiFBcX555PTExUaWmpli5d2mj4SE9PV2pqqnu+urqap+WAAMFvuwDwi1mzZmnbtm3atWuXevXq5fX2I0aM0IkTJxpdz83qQOBq0Q2nAHAjY4xmzZqlrVu3Ki8vTzExMc3aT2FhoSIjI31cHYC2gPABwKdmzJihTZs26cMPP1RYWJj78fvw8HCFhoZK+vEjk1OnTmnDhg2SpKysLPXt21cDBw5UbW2tNm7cqOzsbGVnZ7daPwD4D+EDgE+tXLlSkpScnOyxfO3atZo6daokqaysTCUlJe
51tbW1mjdvnk6dOqXQ0FANHDhQ27dv1/jx422VDcAiwgcAnzLG3LTNunXrPObT0tKUlpbmp4oAtDXccAoAAKwifAA
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"ax = plt.subplot(1, 2, 1)\n",
"plt.hist(weight, color='blue', edgecolor='black', bins=50)\n",
"ax = plt.subplot(1, 2, 2)\n",
"plt.hist(weight[weight <= 1], color='blue', edgecolor='black', bins=50)\n",
"plt.show()"
2023-06-21 15:46:28 +02:00
]
},
{
"cell_type": "code",
"execution_count": 75,
2023-06-21 15:46:28 +02:00
"id": "e6a61e8d-5c55-4312-a48a-a168817e7d77",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 None\n",
"1 [cardboard]\n",
"2 None\n",
"3 [cardboard]\n",
"4 [metal]\n",
" ... \n",
"162 [plastic]\n",
"163 None\n",
"164 [fabric]\n",
"165 None\n",
"166 [stoneware]\n",
"Name: raw_specifications, Length: 167, dtype: object"
]
},
"execution_count": 75,
2023-06-21 15:46:28 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"materials = data[\"raw_specifications\"].apply(parse_value, value=\"materials\")\n",
"\n",
"# scoreable materials are:\n",
"# * metal\n",
"# * wood\n",
"# * glass\n",
"# * resin\n",
"# * fabric\n",
"# * plastic\n",
"\n",
"def material_classifier(material: str) -> str:\n",
" \"\"\"I will to match materials to one of the scoreable ones:\n",
" * metal\n",
" * wood\n",
" * glass\n",
" * resin\n",
" * fabric\n",
" * plastic\n",
" I found a few, like stoneware and cardboard that I can't fit\n",
" there, they'll have to remain unscored for now\"\"\"\n",
"\n",
" mapping = {\n",
" \"polyester\": \"fabric\",\n",
" \"spandex\": \"fabric\",\n",
" \"leather\": \"fabric\",\n",
" \"cardboard\": \"carboard\",\n",
" \"crystal\": \"glass\",\n",
" \"hardwood\": \"wood\",\n",
" \"plywood\": \"wood\",\n",
" \"mdf\": \"wood\",\n",
" \"wood\": \"wood\",\n",
" \"steel\": \"metal\",\n",
" \"polycarbonate\": \"plastic\",\n",
" \"polypropylene\": \"plastic\",\n",
" \"pvc\": \"plastic\",\n",
" \"resin\": \"plastic\",\n",
" \"stoneware\": \"stoneware\",\n",
" \"cardboard\": \"cardboard\",\n",
" \"paper\": \"cardboard\",\n",
" }\n",
" for key, value in mapping.items():\n",
" if key in material:\n",
" return value\n",
" return material\n",
"\n",
"def clean_material_name(material: str) -> str:\n",
" no_paren_annotations = re.sub(\"\\(.*\\)\", \"\", material)\n",
" no_amounts = re.sub(\"\\d+%?\", \"\", no_paren_annotations)\n",
" return no_amounts.strip().lower()\n",
" \n",
"def parse_materials(materials: str):\n",
" if materials is None:\n",
" return\n",
" material_ls = [\n",
" material_classifier(clean_material_name(x))\n",
" for x in materials.split(\",\")\n",
" ]\n",
" return list(set(material_ls))\n",
"\n",
"clean_materials = materials.apply(parse_materials)\n",
"clean_materials"
]
},
{
"cell_type": "code",
"execution_count": 21,
2023-06-21 15:46:28 +02:00
"id": "f6fcaae9-9c24-450c-8737-8b6aa0e406a9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1\n",
"1 1\n",
"2 1\n",
"3 24\n",
"4 1\n",
" ... \n",
"162 1\n",
"163 1\n",
"164 1\n",
"165 1\n",
"166 1\n",
"Name: raw_specifications, Length: 167, dtype: object"
]
},
"execution_count": 21,
2023-06-21 15:46:28 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"packaging = data[\"raw_specifications\"].apply(parse_value, value=\"packaging\")\n",
"packaging[packaging.isnull()] = 1\n",
"packaging"
]
},
{
"cell_type": "code",
"execution_count": 41,
2023-06-21 15:46:28 +02:00
"id": "f6ed2736-26d2-403f-9ab0-a228057176e4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{NoneType, dict}"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
2023-06-21 15:46:28 +02:00
"source": [
"def clean_origin_name(origin: str) -> str:\n",
" mapping = {\n",
" \"assem usa w/foreign/dom. parts\": \"mixed\",\n",
" \"imported\": \"imported\",\n",
" \"made in the usa\": \"usa\",\n",
" \"made in the usa or imported\": \"mixed\",\n",
" }\n",
" origin = origin.lower().strip()\n",
" return mapping[origin]\n",
"\n",
"origin = data[\"raw_specifications\"].apply(parse_value, value=\"origin\")\n",
"clean_origin = origin.apply(clean_origin_name)\n",
"clean_origin"
2023-06-21 15:46:28 +02:00
]
},
{
"cell_type": "code",
"execution_count": 46,
2023-06-21 15:46:28 +02:00
"id": "d2df0c7e-cd03-4628-ae29-fae4f0f37dfa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>materials</th>\n",
" <th>packaging</th>\n",
" <th>origin</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>width</th>\n",
" <th>depth</th>\n",
" <th>tcin</th>\n",
" <th>primary_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[metal]</td>\n",
" <td>1</td>\n",
" <td>imported</td>\n",
" <td>2109.202800</td>\n",
" <td>58.420</td>\n",
" <td>2.540</td>\n",
" <td>58.420</td>\n",
" <td>86345566</td>\n",
" <td>Home</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>[glass]</td>\n",
" <td>1</td>\n",
" <td>mixed</td>\n",
" <td>0.080297</td>\n",
" <td>20.320</td>\n",
" <td>4.445</td>\n",
" <td>4.445</td>\n",
" <td>75552641</td>\n",
" <td>Holiday Shop</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>[wood]</td>\n",
" <td>1</td>\n",
" <td>mixed</td>\n",
" <td>0.275303</td>\n",
" <td>13.335</td>\n",
" <td>10.160</td>\n",
" <td>10.160</td>\n",
" <td>81632725</td>\n",
" <td>Holiday Shop</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>[fabric]</td>\n",
" <td>1</td>\n",
" <td>mixed</td>\n",
" <td>2.726807</td>\n",
" <td>33.020</td>\n",
" <td>20.320</td>\n",
" <td>20.320</td>\n",
" <td>81726538</td>\n",
" <td>Holiday Shop</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>[metal]</td>\n",
" <td>1</td>\n",
" <td>imported</td>\n",
" <td>1360.776000</td>\n",
" <td>74.930</td>\n",
" <td>1.270</td>\n",
" <td>31.115</td>\n",
" <td>86435324</td>\n",
" <td>Home</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
2023-06-21 15:46:28 +02:00
"text/plain": [
" materials packaging origin weight height width depth \\\n",
"4 [metal] 1 imported 2109.202800 58.420 2.540 58.420 \n",
"5 [glass] 1 mixed 0.080297 20.320 4.445 4.445 \n",
"6 [wood] 1 mixed 0.275303 13.335 10.160 10.160 \n",
"11 [fabric] 1 mixed 2.726807 33.020 20.320 20.320 \n",
"15 [metal] 1 imported 1360.776000 74.930 1.270 31.115 \n",
"\n",
" tcin primary_category \n",
"4 86345566 Home \n",
"5 75552641 Holiday Shop \n",
"6 81632725 Holiday Shop \n",
"11 81726538 Holiday Shop \n",
"15 86435324 Home "
2023-06-21 15:46:28 +02:00
]
},
"execution_count": 46,
2023-06-21 15:46:28 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_val(xs, i):\n",
" if xs is None:\n",
" return None\n",
" return xs.get(i)\n",
"\n",
2023-06-21 15:46:28 +02:00
"clean_data = pd.DataFrame(\n",
" data={\n",
" \"materials\": clean_materials,\n",
" \"packaging\": packaging,\n",
" \"origin\": clean_origin,\n",
" \"weight\": weight,\n",
" \"height\": dimensions.apply(get_val, i=\"height\"),\n",
" \"width\": dimensions.apply(get_val, i=\"width\"),\n",
" \"depth\": dimensions.apply(get_val, i=\"depth\"),\n",
" \"tcin\": data[\"raw_specifications\"].apply(parse_value, value=\"tcin\"),\n",
" \"primary_category\": data[\"primary_category\"]\n",
" }\n",
")\n",
"\n",
"clean_data\n",
"\n",
"clean_data[~clean_data.isnull().any(axis=1)].head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "427c5d6a-8f8b-4441-89c5-193d647b49e2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4 23271231140\n",
"Name: gtin13, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weight = data[\"raw_specifications\"].apply(parse_value, value=\"weight\")\n",
"sel = weight == \" 4.65 pounds\"\n",
"data.loc[sel, \"gtin13\"]"
2023-06-21 15:46:28 +02:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}