diff --git a/.gitignore b/.gitignore index e69de29..763513e 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +.ipynb_checkpoints diff --git a/exploration/work/exploration.ipynb b/exploration/work/exploration.ipynb new file mode 100644 index 0000000..1aa88a8 --- /dev/null +++ b/exploration/work/exploration.ipynb @@ -0,0 +1,1064 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 410, + "id": "98ded03d-0208-4416-a5e5-720a2e0742fa", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import pandas as pd\n", + "from IPython.display import display, HTML\n", + "\n", + "in_file = \"/home/jovyan/data/large_target_store_products_dataset_sample - large_target_store_products_dataset_sample.csv\"\n", + "data = pd.read_csv(in_file)\n", + "\n", + "def look_for_matches(data: pd.DataFrame, pattern: str, colname : str = \"raw_specifications\") -> str:\n", + " \"\"\"Useful for finding cells in raw_specifications containing a given string\"\"\"\n", + " return data.loc[data.loc[:, colname].str.contains(pattern), colname].iloc[0]\n", + "\n", + "def render_html(html: str):\n", + " \"\"\"Render an html string\"\"\"\n", + " display(HTML(html))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c20c14ea-9ef6-4d40-8b7d-4731d0866239", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['title', 'url', 'brand', 'main_image', 'sku', 'description',\n", + " 'raw_description', 'gtin13', 'currency', 'price', 'availability',\n", + " 'availableDeliveryMethod', 'available_branch', 'primary_category',\n", + " 'sub_category_1', 'sub_category_2', 'sub_category_3', 'images',\n", + " 'raw_specifications', 'specifications', 'highlights', 'raw_highlights',\n", + " 'uniq_id', 'scraped_at'],\n", + " dtype='object')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.axes[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": 
"165723d1-8152-4e30-b25a-cbca7f4935f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Specifications

Suggested Age: 6 Years and Up

CPSC Choking Hazard Warnings: Choking_hazard_small_parts

TCIN: 81917300
UPC: 840391145528
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_html(data.loc[0, 'raw_specifications'])" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "d153a732-f18c-4fb7-9282-9c02c49c3cc0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Specifications

Dimensions (Overall): 11.9 inches (L), 11.9 inches (W)

Dated format: Monthly

Calendar year: 2022

Material: Paper

TCIN: 84821007
UPC: 9781801433983
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_html(look_for_matches(data, \"Material\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "b06e66af-e0e3-466c-8797-546d5e076f32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Specifications

Contains: Does Not Contain Any of the 8 Major Allergens

Dietary Needs: Gluten Free

Form: Pieces

State of Readiness: Ready to Eat

Package Quantity: 1

Net weight: 15.6 Ounces

TCIN: 54571204
UPC: 022000279729
Item Number (DPCI): 055-02-1211
Origin: Made in the USA or Imported
Grocery Disclaimer:
Content on this site is for reference purposes only. Target does not represent or warrant that the nutrition, ingredient, allergen and other product information on our Web or Mobile sites are accurate or complete, since this information comes from the product manufacturers. On occasion, manufacturers may improve or change their product formulas and update their labels. We recommend that you do not rely solely on the information presented on our Web or Mobile sites and that you review the product's label or contact the manufacturer directly if you have specific product concerns or questions. If you have specific healthcare concerns or questions about the products displayed, please contact your licensed healthcare professional for advice or answers. Any additional pictures are suggested servings only.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_html(look_for_matches(data, \"Package\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "id": "015a0d99-b4c2-47cf-9e23-59485a450470", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Specifications

Dimensions (Overall): 23 inches (H) x 1 inches (W) x 23 inches (D)

Weight: 4.65 pounds

Art subject: Geometric Shapes

Orientation: Vertical

Material: Metal

Battery: No Battery Used

TCIN: 86345566
UPC: 023271231140
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_html(look_for_matches(data, \"Weight\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "99cbfba8-4324-4d2a-b2e2-d766a5e57964", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Specifications

Dimensions (Overall): 11.9 inches (L), 11.9 inches (W)

Dated format: Monthly

Calendar year: 2022

Material: Paper

TCIN: 84821007
UPC: 9781801433983
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_html(look_for_matches(data, \"Dimensions\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "id": "6c05e1db-7181-422e-924b-e67a28886abe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Specifications

Suggested Age: 6 Years and Up

CPSC Choking Hazard Warnings: Choking_hazard_small_parts

TCIN: 81917300
UPC: 840391145528
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_html(look_for_matches(data, \"TCIN\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "bf3a07d9-11c0-48c4-be03-373a3ba5f755", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Specifications

Suggested Age: 6 Years and Up

CPSC Choking Hazard Warnings: Choking_hazard_small_parts

TCIN: 81917300
UPC: 840391145528
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render_html(look_for_matches(data, \"Origin\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "ffdb3244-a20d-427b-9c23-4d8c6cf4ce6d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Material\n", + "raw_specifications\n", + "True 105\n", + "False 62\n", + "Name: count, dtype: int64\n", + "\n", + "Package Quantity\n", + "raw_specifications\n", + "False 143\n", + "True 24\n", + "Name: count, dtype: int64\n", + "\n", + "Weight\n", + "raw_specifications\n", + "False 106\n", + "True 61\n", + "Name: count, dtype: int64\n", + "\n", + "Dimensions\n", + "raw_specifications\n", + "True 108\n", + "False 59\n", + "Name: count, dtype: int64\n", + "\n", + "TCIN\n", + "raw_specifications\n", + "True 167\n", + "Name: count, dtype: int64\n", + "\n", + "Origin\n", + "raw_specifications\n", + "True 167\n", + "Name: count, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "# let's see how many of the specifications are there\n", + "\n", + "colname = \"raw_specifications\"\n", + "patterns = [\"Material\", \"Package Quantity\", \"Weight\", \"Dimensions\", \"TCIN\", \"Origin\"]\n", + "\n", + "for pattern in patterns:\n", + " print(pattern)\n", + " print(data.loc[:, colname].str.contains(pattern).value_counts())\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dc44956b-ea8c-44ce-8bc5-e0472273e7a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "raw_specifications\n", + "False 162\n", + "True 5\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's see how many of the specifications are there\n", + "\n", + "colname = \"raw_specifications\"\n", + "patterns = [\"Material\", \"Package Quantity\", \"Weight\", \"Dimensions\", \"TCIN\", 
\"Origin\"]\n", + "\n", + "sels = [data.loc[:, colname].str.contains(pattern) for pattern in patterns]\n", + "sel = functools.reduce(lambda x, y: x & y, sels)\n", + "\n", + "sel.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "107da59a-4bd8-4b0c-9e6e-95f38788eaf3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Specifications

Dimensions (Overall): .1 inches (H) x 6.0 inches (W) x 13.5 inches (D)

Weight: .35 ounces

Package Quantity: 3

Material: Wood

TCIN: 82555842
UPC: 843128185798
Origin: imported
WARNING:⚠ This product can expose you to chemicals including Formaldehyde (gas), which is known to the State of California to cause cancer and birth defects or other reproductive harm. For more information, go to www.P65Warnings.ca.gov www.p65warnings.ca.gov

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Specifications

Dimensions (Overall): 9.02 inches (L), 4.88 inches (H) x 5.0 inches (W)

Weight: 1.63 pounds

Package Quantity: 100

Material: Paper

TCIN: 84236733
UPC: 194425198586
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Specifications

Dimensions (Overall): 1.14 inches (H) x 5.04 inches (W) x 6.3 inches (D)

Weight: .15 ounces

Package Quantity: 12

Material: Wood

TCIN: 82021299
UPC: 194425108806
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Specifications

Dimensions (Overall): 9.79 inches (L), 4.13 inches (W)

Weight: 2.61 ounces

Closure Type: Flap

Package Quantity: 250

Material: Paper

TCIN: 81501830
UPC: 083514896277
Origin: made in the USA

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Specifications

Dimensions (Overall): 2.24 inches (L), 6.0 inches (W)

Weight: 1.6 ounces

Closure Type: Flap

Package Quantity: 100

Material: Cardboard

TCIN: 81843685
UPC: 087547421000
Origin: imported

The above item details were provided by the Target Plus™ Partner. Target does not represent or warrant that this information is accurate or complete. On occasion, manufacturers may modify their items and update their labels.

We recommend that you do not rely solely on the information presented. If you have a specific question about this item, you may consult the item's label, contact the manufacturer directly or call Target Guest Services at 1-800-591-3869.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://www.target.com/p/genie-crafts-wood-quote-signs-eat-drink-love-wood-letter-signs-drawing-stencils-wall-decor/-/A-82555842\n", + "https://www.target.com/p/sparkle-and-bash-100-pack-gold-foil-initial-letter-k-white-monogram-paper-napkins-for-dinner-party-4-x-8-in/-/A-84236733\n", + "https://www.target.com/p/wooden-rectangles-for-crafts-panel-board-4-x-6-in-12-pack/-/A-82021299\n", + "https://www.target.com/p/southworth-25-cotton-10-business-envelope-ivory-24-lbs-wove-250-box-fsc-j404i10/-/A-81501830\n", + "https://www.target.com/p/universal-self-seal-catalog-envelope-6-x-9-white-100-box-42100/-/A-81843685\n" + ] + } + ], + "source": [ + "for x in data.loc[sel, \"raw_specifications\"]:\n", + " render_html(x)\n", + "for x in data.loc[sel, \"url\"]:\n", + " print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "3853dd07-1977-44c6-bc97-2ecdaa98980e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "raw_specifications\n", + "False 124\n", + "True 43\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "colname = \"raw_specifications\"\n", + "patterns = [\"Material\", \"Package Quantity\", \"Weight\", \"Dimensions\", \"TCIN\", \"Origin\"]\n", + "\n", + "(data.loc[:, colname].str.contains(\"Package Quantity\") | data.loc[:, colname].str.contains(\"Number of Pieces\")).value_counts()\n", + "\n", + "# Package Quantity and Number of Pieces are never found together. Maybe they refer to the same thing?" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "be8a7628-9199-4f25-a533-d041a7ff540c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleurlbrandmain_imageskudescriptionraw_descriptiongtin13currencyprice...sub_category_1sub_category_2sub_category_3imagesraw_specificationsspecificationshighlightsraw_highlightsuniq_idscraped_at
8NCAA Illinois Fighting Illini Circo Cheese Cut...https://www.target.com/p/ncaa-illinois-fightin...NCAAhttps://target.scene7.com/is/image/Target/GUES...79646040Reach out to the complex cheese lover in your ...<div class=\"h-margin-v-default\" data-test=\"ite...99967205276USD58.95...Sports Fan ShopSports Fan Shop Home GoodsSports Fan Shop Barware & Drinkwarehttps://target.scene7.com/is/image/Target/GUES...<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...Number of Pieces: 5 | Number of Pieces: 5 | We...BEAUTY & ELEGANCE - The Circo swivel-style cir...<li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">...0c549116-75c8-56cb-8877-165380d0efd906/12/22
13Blue Panda Jumbo Dinosaur Floor Puzzle, Double...https://www.target.com/p/blue-panda-jumbo-dino...Blue Pandahttps://target.scene7.com/is/image/Target/GUES...80405355Package Includes\\r\\nLarge Dinosaur Floor Puzzl...<div class=\"h-margin-v-default\" data-test=\"ite...194425203808USD19.99...PuzzlesNaNNaNhttps://target.scene7.com/is/image/Target/GUES...<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...Number of Pieces: 17 | Number of Pieces: 17 | ...JUMBO DINOSAUR PUZZLE: This t-rex foam puzzle ...<li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">...151c72b4-4856-502f-a508-961cc81fffa906/12/22
14Women's Round Aviator Sunglasses - Universal T...https://www.target.com/p/women-39-s-round-avia...Universal Threadhttps://target.scene7.com/is/image/Target/GUES...84201225Round out your eyewear collection with the Rou...<div class=\"h-margin-v-default\" data-test=\"ite...195995526496USD15.00...Eye CareNaNNaNhttps://target.scene7.com/is/image/Target/GUES...<div class=\"styles__StyledCol-sc-ct8kx6-0 iKGd...Material: Metal (Frame) | Material: Metal (Fra...Universal Thread round aviator sunglasses with...<li class=\"styles__Bullet-sc-6aebpn-0 eIfLaI\">...2a803c0f-00bf-50a6-a490-d381620ac3a306/12/22
\n", + "

3 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " title \\\n", + "8 NCAA Illinois Fighting Illini Circo Cheese Cut... \n", + "13 Blue Panda Jumbo Dinosaur Floor Puzzle, Double... \n", + "14 Women's Round Aviator Sunglasses - Universal T... \n", + "\n", + " url brand \\\n", + "8 https://www.target.com/p/ncaa-illinois-fightin... NCAA \n", + "13 https://www.target.com/p/blue-panda-jumbo-dino... Blue Panda \n", + "14 https://www.target.com/p/women-39-s-round-avia... Universal Thread \n", + "\n", + " main_image sku \\\n", + "8 https://target.scene7.com/is/image/Target/GUES... 79646040 \n", + "13 https://target.scene7.com/is/image/Target/GUES... 80405355 \n", + "14 https://target.scene7.com/is/image/Target/GUES... 84201225 \n", + "\n", + " description \\\n", + "8 Reach out to the complex cheese lover in your ... \n", + "13 Package Includes\\r\\nLarge Dinosaur Floor Puzzl... \n", + "14 Round out your eyewear collection with the Rou... \n", + "\n", + " raw_description gtin13 currency \\\n", + "8
... \n", + "13
  • ... \n", + "14
  • ... \n", + "\n", + " uniq_id scraped_at \n", + "8 0c549116-75c8-56cb-8877-165380d0efd9 06/12/22 \n", + "13 151c72b4-4856-502f-a508-961cc81fffa9 06/12/22 \n", + "14 2a803c0f-00bf-50a6-a490-d381620ac3a3 06/12/22 \n", + "\n", + "[3 rows x 24 columns]" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sel = data.loc[:, \"raw_specifications\"].str.contains(\"Dimensions\")\n", + "dimensions = data.loc[sel]\n", + "sel_overall = ~dimensions.loc[sel, \"raw_specifications\"].str.contains(\"Overall\")\n", + "#for x in dimensions.loc[sel_overall, \"raw_specifications\"]:\n", + "# render_html(x)\n", + "dimensions.loc[sel_overall]\n", + "\n", + "# looks like \"dimensions\" can be \"Dimensions (Overall)\", \"Dimensions\" or other things\n", + "# like \"Assembled Dimensions\" or \"Piece X Dimensions\". But these latter two options are\n", + "# incomplete (lack the height), harder to parse and rare enough that I'll just drop them" + ] + }, + { + "cell_type": "code", + "execution_count": 359, + "id": "3fd14c61-c1f8-4b18-b04a-9b616e6cf9a7", + "metadata": {}, + "outputs": [], + "source": [ + "# There seem to be duplicates on \"specifications\" that are not \n", + "# found on \"raw_specifications\"\n", + "# I think it's safe to just remove the duplicates\n", + "\n", + "specifications = data.loc[1,\"specifications\"]\n", + "\n", + "def parse_specs(specifications: str) -> dict[str,str]:\n", + " fields_mapping = {\n", + " \"Material\": \"materials\",\n", + " \"Package Quantity\": \"packaging\",\n", + " \"Number of Pieces\": \"packaging\",\n", + " \"Dimensions (Overall)\": \"dimensions\",\n", + " \"Dimensions\": \"dimensions\",\n", + " \"Weight\": \"weight\",\n", + " \"TCIN\": \"tcin\",\n", + " \"Origin\": \"origin\", \n", + " }\n", + " spec_dict = {}\n", + " for spec in specifications.split(\"|\"):\n", + " if \":\" in spec:\n", + " try:\n", + " field, value = spec.split(\":\")\n", + " except ValueError:\n", + " 
print(spec)\n", + " return {}\n", + " field = field.strip()\n", + " if field in fields_mapping:\n", + " field = fields_mapping[field]\n", + " spec_dict[field] = value.strip()\n", + " return spec_dict\n", + "\n", + "\n", + "\n", + "def iter_parse(root: ET.Element) -> dict[str,str]:\n", + " \"\"\"Recursively parse the XML tree into a dictionary\n", + " Each key/value pair is inside its own <div>
    tag and\n", + " the key inside a <b> tag.\n", + " The fields that I believe are compulsory (TCIN, UPC and Origin)\n", + " are only nested one level deep, while the rest of the fields seem\n", + " to be always nested two levels deep. But parsing it recursively\n", + " helps generalise both cases.\"\"\"\n", + " \n", + " spec_dict = {}\n", + " for child in root:\n", + " if child.tag == \"div\":\n", + " if \"b\" in [x.tag for x in child]:\n", + " key, *values = child.itertext()\n", + " key = key.strip(\":\")\n", + " value = \"\".join(values).strip(\":\")\n", + " spec_dict[key] = value\n", + " else:\n", + " spec_dict.update(iter_parse(child))\n", + " return spec_dict\n", + "\n", + "def parse_raw_specs(raw_specs: str) -> dict[str,str]:\n", + " \"\"\"Parse a raw specifications XML string into a dictionary\n", + " This involves first recursively parsing the XML tree and then\n", + " renaming the keys\"\"\"\n", + " \n", + " fields_mapping = {\n", + " \"Material\": \"materials\",\n", + " \"Package Quantity\": \"packaging\",\n", + " \"Number of Pieces\": \"packaging\",\n", + " \"Dimensions (Overall)\": \"dimensions\",\n", + " \"Dimensions\": \"dimensions\",\n", + " \"Weight\": \"weight\",\n", + " \"TCIN\": \"tcin\",\n", + " \"Origin\": \"origin\", \n", + " }\n", + " xml_root = ET.fromstring(raw_specs)\n", + " parsed = iter_parse(xml_root)\n", + " specs_dict = {\n", + " fields_mapping[key]: value\n", + " for key, value in parsed.items()\n", + " if key in fields_mapping\n", + " }\n", + " return specs_dict\n", + " \n", + "\n", + "def parse_value(specs: str, value: str) -> str:\n", + " return parse_raw_specs(specs).get(value)\n", + "\n", + "for x in [\"materials\", \"packaging\", \"dimensions\", \"weight\", \"tcin\", \"origin\"]:\n", + " data[x] = data[\"raw_specifications\"].apply(parse_value, value=x)\n", + "\n", + "# set(data[\"materials\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 305, + "id": "5902f1b3-a77e-4353-92aa-71cc729e8c87", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "0 None\n", + "1 {'width': 30.226000000000003}\n", + "2 None\n", + "3 None\n", + "4 {'height': 58.42, 'width': 2.54, 'depth': 58.42}\n", + " ... \n", + "162 {'width': 12.7, 'depth': 24.13}\n", + "163 {'height': 30.48, 'width': 30.48}\n", + "164 {'height': 12.065, 'width': 5.715, 'depth': 5....\n", + "165 None\n", + "166 {'height': 11.43, 'width': 31.75, 'depth': 11.43}\n", + "Name: dimensions, Length: 167, dtype: object" + ] + }, + "execution_count": 305, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "from typing import Optional\n", + "\n", + "dimensions = \"16 inches (H) x 23 inches (W) x 23 inches (D)\"\n", + "\n", + "def parse_dimensions_measure(dimensions: str, measure: str) -> Optional[dict[str,str]]:\n", + " expr = f\"(?P\\d*[.,]?\\d*)\\s+(?P[a-zA-Z]*)\\s+\\({measure}\\)\" \n", + " if match := re.search(expr, dimensions):\n", + " return {\n", + " \"value\": float(match.group(\"value\")),\n", + " \"unit\": match.group(\"unit\").lower()\n", + " }\n", + "\n", + "def units_to_cm(value: float, unit: str) -> float:\n", + " conversions = {\n", + " \"inches\": 2.54,\n", + " \"feet\": 30.48,\n", + " \"cm\": 1\n", + " }\n", + " return value * conversions[unit]\n", + "\n", + "\n", + "def parse_dimensions(dimensions: Optional[str]) -> Optional[dict[str,float]]:\n", + " if dimensions is None:\n", + " return None\n", + " height = parse_dimensions_measure(dimensions, \"H\")\n", + " width = parse_dimensions_measure(dimensions, \"W\")\n", + " depth = parse_dimensions_measure(dimensions, \"D\")\n", + " dimensions = {\n", + " \"height\": height,\n", + " \"width\": width,\n", + " \"depth\": depth,\n", + " }\n", + " return {\n", + " key: units_to_cm(**value)\n", + " for key,value in dimensions.items()\n", + " if value is not None\n", + " }\n", + "\n", + "dimensions = data[\"dimensions\"].apply(parse_dimensions)\n", + "dimensions\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": 341, + "id": "26ece1d2-c466-498c-aeb6-5610fb16c7d3", + "metadata": {}, + "outputs": [], + "source": [ + "def units_to_g(value: float, unit: str) -> float:\n", + " conversions = {\n", + " \"pounds\": 453.592,\n", + " \"ounces\": 28.3495,\n", + " \"g\": 1\n", + " }\n", + " return value * conversions[unit]\n", + "\n", + "def parse_weight(weight: str):\n", + " if weight is None:\n", + " return None\n", + " expr = f\"(?P\\d*[.,]?\\d*)\\s+(?P[a-zA-Z]*)\"\n", + "\n", + " # strip is needed to prevent the regex from lazily\n", + " # matching just from the first whitespace separator,\n", + " # this could happen because the number part in the\n", + " # expression is technically all optional, to avoid\n", + " # an expression too complex and unreadable\n", + " if match := re.search(expr, weight.strip()):\n", + " value = float(match.group(\"value\"))\n", + " unit = match.group(\"unit\").lower()\n", + " return units_to_g(value, unit)\n", + "\n", + " return weight\n", + "\n", + "def calculate_dimensional_weight(dimensions: dict[str,float]):\n", + " \"\"\"The dimensional weight (in kg) is calculated as\n", + " Length * Height * Width (in cm) / 5000.\n", + " We'll return it in g here\"\"\"\n", + " if dimensions is None:\n", + " return None\n", + " \n", + " height = dimensions.get(\"height\")\n", + " width = dimensions.get(\"width\")\n", + " depth = dimensions.get(\"depth\")\n", + " if None in [height, width, depth]:\n", + " return None\n", + " \n", + " return height * width * depth / 5000\n", + "\n", + "weight = data[\"weight\"].apply(parse_weight)\n", + "# weight\n", + "\n", + "dimensional_weight = list(map(calculate_dimensional_weight, dimensions))\n", + "# dimensional_weight\n", + "\n", + "sel = weight.isnull()\n", + "weight[sel] = pd.Series(dimensional_weight)[sel]" + ] + }, + { + "cell_type": "code", + "execution_count": 404, + "id": "e6a61e8d-5c55-4312-a48a-a168817e7d77", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 None\n", + "1 
[cardboard]\n", + "2 None\n", + "3 [cardboard]\n", + "4 [metal]\n", + " ... \n", + "162 [plastic]\n", + "163 None\n", + "164 [fabric]\n", + "165 None\n", + "166 [stoneware]\n", + "Name: raw_specifications, Length: 167, dtype: object" + ] + }, + "execution_count": 404, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "materials = data[\"raw_specifications\"].apply(parse_value, value=\"materials\")\n", + "\n", + "# scoreable materials are:\n", + "# * metal\n", + "# * wood\n", + "# * glass\n", + "# * resin\n", + "# * fabric\n", + "# * plastic\n", + "\n", + "def material_classifier(material: str) -> str:\n", + " \"\"\"I will try to match materials to one of the scoreable ones:\n", + " * metal\n", + " * wood\n", + " * glass\n", + " * resin\n", + " * fabric\n", + " * plastic\n", + " I found a few, like stoneware and cardboard, that I can't fit\n", + " there; they'll have to remain unscored for now\"\"\"\n", + "\n", + " mapping = {\n", + " \"polyester\": \"fabric\",\n", + " \"spandex\": \"fabric\",\n", + " \"leather\": \"fabric\",\n", + " \"cardboard\": \"carboard\",\n", + " \"crystal\": \"glass\",\n", + " \"hardwood\": \"wood\",\n", + " \"plywood\": \"wood\",\n", + " \"mdf\": \"wood\",\n", + " \"wood\": \"wood\",\n", + " \"steel\": \"metal\",\n", + " \"polycarbonate\": \"plastic\",\n", + " \"polypropylene\": \"plastic\",\n", + " \"pvc\": \"plastic\",\n", + " \"resin\": \"plastic\",\n", + " \"stoneware\": \"stoneware\",\n", + " \"cardboard\": \"cardboard\",\n", + " \"paper\": \"cardboard\",\n", + " }\n", + " for key, value in mapping.items():\n", + " if key in material:\n", + " return value\n", + " return material\n", + "\n", + "def clean_material_name(material: str) -> str:\n", + " no_paren_annotations = re.sub(\"\\(.*\\)\", \"\", material)\n", + " no_amounts = re.sub(\"\\d+%?\", \"\", no_paren_annotations)\n", + " return no_amounts.strip().lower()\n", + " \n", + "def parse_materials(materials: str):\n", + " if materials is None:\n", + " return\n", 
+ " material_ls = [\n", + " material_classifier(clean_material_name(x))\n", + " for x in materials.split(\",\")\n", + " ]\n", + " return list(set(material_ls))\n", + "\n", + "clean_materials = materials.apply(parse_materials)\n", + "clean_materials" + ] + }, + { + "cell_type": "code", + "execution_count": 419, + "id": "f6fcaae9-9c24-450c-8737-8b6aa0e406a9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 1\n", + "2 1\n", + "3 24\n", + "4 1\n", + " ... \n", + "162 1\n", + "163 1\n", + "164 1\n", + "165 1\n", + "166 1\n", + "Name: raw_specifications, Length: 167, dtype: object" + ] + }, + "execution_count": 419, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packaging = data[\"raw_specifications\"].apply(parse_value, value=\"packaging\")\n", + "packaging[packaging.isnull()] = 1\n", + "packaging" + ] + }, + { + "cell_type": "code", + "execution_count": 429, + "id": "f6ed2736-26d2-403f-9ab0-a228057176e4", + "metadata": {}, + "outputs": [], + "source": [ + "def clean_origin_name(origin: str) -> str:\n", + " mapping = {\n", + " \"assem usa w/foreign/dom. 
parts\": \"mixed\",\n", + " \"imported\": \"imported\",\n", + " \"made in the usa\": \"usa\",\n", + " \"made in the usa or imported\": \"mixed\",\n", + " }\n", + " origin = origin.lower().strip()\n", + " return mapping[origin]\n", + "\n", + "origin = data[\"raw_specifications\"].apply(parse_value, value=\"origin\")\n", + "clean_origin = origin.apply(clean_origin_name)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 453, + "id": "d2df0c7e-cd03-4628-ae29-fae4f0f37dfa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(57, 9)" + ] + }, + "execution_count": 453, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_data = pd.DataFrame(\n", + " data={\n", + " \"materials\": clean_materials,\n", + " \"packaging\": packaging,\n", + " \"origin\": clean_origin,\n", + " \"weight\": weight,\n", + " \"height\": dimensions.apply(get_val, i=\"height\"),\n", + " \"width\": dimensions.apply(get_val, i=\"width\"),\n", + " \"depth\": dimensions.apply(get_val, i=\"depth\"),\n", + " \"tcin\": data[\"raw_specifications\"].apply(parse_value, value=\"tcin\"),\n", + " \"primary_category\": data[\"primary_category\"]\n", + " }\n", + ")\n", + "\n", + "clean_data\n", + "\n", + "clean_data[~clean_data.isnull().any(axis=1)].shape" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}