In [410]:
import functools
import xml.etree.ElementTree as ET

import pandas as pd
from IPython.display import display, HTML

# Load the product sample CSV into `data`. The path is absolute (under
# /home/jovyan), so it has to be adjusted when the notebook runs elsewhere.
in_file = "/home/jovyan/data/large_target_store_products_dataset_sample - large_target_store_products_dataset_sample.csv"
data = pd.read_csv(in_file)

def look_for_matches(data: pd.DataFrame, pattern: str, colname : str = "raw_specifications") -> str:
    """Return the first cell of ``colname`` whose text matches ``pattern``.

    Useful for finding cells in raw_specifications containing a given string.

    Args:
        data: frame holding the column to search.
        pattern: regular expression passed to ``Series.str.contains``.
        colname: name of the text column to search.

    Returns:
        The content of the first matching cell.

    Raises:
        IndexError: if no cell matches ``pattern``.
    """
    column = data.loc[:, colname]
    # na=False: missing cells count as "no match"; without it the mask contains
    # NaN and pandas refuses to use it as a boolean indexer.
    matches = column.str.contains(pattern, na=False)
    return column[matches].iloc[0]

def render_html(html: str):
    """Show ``html`` as rendered HTML in the cell output."""
    rendered = HTML(html)
    display(rendered)

In [9]:
# Column labels of the loaded dataset
data.columns

Index(['title', 'url', 'brand', 'main_image', 'sku', 'description',
       'raw_description', 'gtin13', 'currency', 'price', 'availability',
       'availableDeliveryMethod', 'available_branch', 'primary_category',
       'sub_category_1', 'sub_category_2', 'sub_category_3', 'images',
       'raw_specifications', 'specifications', 'highlights', 'raw_highlights',
       'uniq_id', 'scraped_at'],
      dtype='object')

In [132]:
# Render the raw specifications HTML of row 0 to inspect its structure
render_html(data.loc[0, 'raw_specifications'])

In [134]:
# First raw specification that mentions "Material"
render_html(look_for_matches(data, "Material"))

In [24]:
# First raw specification that mentions "Package"
render_html(look_for_matches(data, "Package"))

In [137]:
# First raw specification that mentions "Weight"
render_html(look_for_matches(data, "Weight"))

In [138]:
# First raw specification that mentions "Dimensions"
render_html(look_for_matches(data, "Dimensions"))

In [139]:
# First raw specification that mentions "TCIN"
render_html(look_for_matches(data, "TCIN"))

In [140]:
# First raw specification that mentions "Origin"
render_html(look_for_matches(data, "Origin"))

In [31]:
# For each specification field, count the rows that do / do not mention it

colname = "raw_specifications"
patterns = ["Material", "Package Quantity", "Weight", "Dimensions", "TCIN", "Origin"]

for pattern in patterns:
    # One print call per field: label, counts, then a blank separator line
    print(pattern, data[colname].str.contains(pattern).value_counts(), "", sep="\n")

Material
raw_specifications
True     105
False     62
Name: count, dtype: int64

Package Quantity
raw_specifications
False    143
True      24
Name: count, dtype: int64

Weight
raw_specifications
False    106
True      61
Name: count, dtype: int64

Dimensions
raw_specifications
True     108
False     59
Name: count, dtype: int64

TCIN
raw_specifications
True    167
Name: count, dtype: int64

Origin
raw_specifications
True    167
Name: count, dtype: int64



In [18]:
# Count the rows whose raw specifications mention every field at once

colname = "raw_specifications"
patterns = ["Material", "Package Quantity", "Weight", "Dimensions", "TCIN", "Origin"]

# One boolean mask per field, AND-ed together into a single mask
sels = [data[colname].str.contains(needle) for needle in patterns]
sel = functools.reduce(pd.Series.__and__, sels)

sel.value_counts()

raw_specifications
False    162
True       5
Name: count, dtype: int64

In [29]:
# Render the specifications of every row selected by `sel`, then list the
# URLs of those same rows.
for x in data.loc[sel, "raw_specifications"]:
    render_html(x)
for x in data.loc[sel, "url"]:
    print(x)

https://www.target.com/p/genie-crafts-wood-quote-signs-eat-drink-love-wood-letter-signs-drawing-stencils-wall-decor/-/A-82555842
https://www.target.com/p/sparkle-and-bash-100-pack-gold-foil-initial-letter-k-white-monogram-paper-napkins-for-dinner-party-4-x-8-in/-/A-84236733
https://www.target.com/p/wooden-rectangles-for-crafts-panel-board-4-x-6-in-12-pack/-/A-82021299
https://www.target.com/p/southworth-25-cotton-10-business-envelope-ivory-24-lbs-wove-250-box-fsc-j404i10/-/A-81501830
https://www.target.com/p/universal-self-seal-catalog-envelope-6-x-9-white-100-box-42100/-/A-81843685


In [67]:
# Count the rows that mention "Package Quantity" or "Number of Pieces"
colname = "raw_specifications"
patterns = ["Material", "Package Quantity", "Weight", "Dimensions", "TCIN", "Origin"]

(data.loc[:, colname].str.contains("Package Quantity") | data.loc[:, colname].str.contains("Number of Pieces")).value_counts()

# "Package Quantity" and "Number of Pieces" appear never to be found together.
# Maybe they refer to the same thing?
# NOTE(review): the OR count above does not prove this by itself; an `&` of the
# two masks with zero True values would confirm it.

raw_specifications
False    124
True      43
Name: count, dtype: int64

In [86]:
# Rows that mention "Dimensions" but not the "Overall" variant
sel = data["raw_specifications"].str.contains("Dimensions")
dimensions = data.loc[sel]
# Every row of `dimensions` already satisfies `sel`, so no re-masking is needed
sel_overall = ~dimensions["raw_specifications"].str.contains("Overall")
#for x in dimensions.loc[sel_overall, "raw_specifications"]:
#    render_html(x)
dimensions.loc[sel_overall]

# Looks like "dimensions" can be "Dimensions (Overall)", "Dimensions" or other things
# like "Assembled Dimensions" or "Piece X Dimensions". But these latter two options are
# incomplete (lack the height), harder to parse and rare enough that I'll just drop them

Unnamed: 0,title,url,brand,main_image,sku,description,raw_description,gtin13,currency,price,...,sub_category_1,sub_category_2,sub_category_3,images,raw_specifications,specifications,highlights,raw_highlights,uniq_id,scraped_at
8,NCAA Illinois Fighting Illini Circo Cheese Cut...,https://www.target.com/p/ncaa-illinois-fightin...,NCAA,https://target.scene7.com/is/image/Target/GUES...,79646040,Reach out to the complex cheese lover in your ...,"<div class=""h-margin-v-default"" data-test=""ite...",99967205276,USD,58.95,...,Sports Fan Shop,Sports Fan Shop Home Goods,Sports Fan Shop Barware & Drinkware,https://target.scene7.com/is/image/Target/GUES...,"<div class=""styles__StyledCol-sc-ct8kx6-0 iKGd...",Number of Pieces: 5 | Number of Pieces: 5 | We...,BEAUTY & ELEGANCE - The Circo swivel-style cir...,"<li class=""styles__Bullet-sc-6aebpn-0 eIfLaI"">...",0c549116-75c8-56cb-8877-165380d0efd9,06/12/22
13,"Blue Panda Jumbo Dinosaur Floor Puzzle, Double...",https://www.target.com/p/blue-panda-jumbo-dino...,Blue Panda,https://target.scene7.com/is/image/Target/GUES...,80405355,Package Includes\r\nLarge Dinosaur Floor Puzzl...,"<div class=""h-margin-v-default"" data-test=""ite...",194425203808,USD,19.99,...,Puzzles,,,https://target.scene7.com/is/image/Target/GUES...,"<div class=""styles__StyledCol-sc-ct8kx6-0 iKGd...",Number of Pieces: 17 | Number of Pieces: 17 | ...,JUMBO DINOSAUR PUZZLE: This t-rex foam puzzle ...,"<li class=""styles__Bullet-sc-6aebpn-0 eIfLaI"">...",151c72b4-4856-502f-a508-961cc81fffa9,06/12/22
14,Women's Round Aviator Sunglasses - Universal T...,https://www.target.com/p/women-39-s-round-avia...,Universal Thread,https://target.scene7.com/is/image/Target/GUES...,84201225,Round out your eyewear collection with the Rou...,"<div class=""h-margin-v-default"" data-test=""ite...",195995526496,USD,15.0,...,Eye Care,,,https://target.scene7.com/is/image/Target/GUES...,"<div class=""styles__StyledCol-sc-ct8kx6-0 iKGd...",Material: Metal (Frame) | Material: Metal (Fra...,Universal Thread round aviator sunglasses with...,"<li class=""styles__Bullet-sc-6aebpn-0 eIfLaI"">...",2a803c0f-00bf-50a6-a490-d381620ac3a3,06/12/22


In [359]:
# There seem to be duplicates in "specifications" that are not
# found in "raw_specifications".
# I think it's safe to just remove the duplicates

# Sample value (row 1) of the pre-flattened "specifications" column
specifications = data.loc[1,"specifications"]

def parse_specs(specifications: str) -> dict[str,str]:
    """Parse a "Field: value | Field: value" string into a normalised dict.

    Only the fields listed in ``fields_mapping`` are kept, renamed to their
    normalised key. Repeated fields collapse into one entry (the last one
    wins), which also applies to source fields sharing a normalised key.

    Args:
        specifications: "|"-separated list of "Field: value" items.

    Returns:
        Mapping of normalised field name to stripped value. Empty when the
        input is not a string or holds no known field.
    """
    fields_mapping = {
        "Material": "materials",
        "Package Quantity": "packaging",
        "Number of Pieces": "packaging",
        "Dimensions (Overall)": "dimensions",
        "Dimensions": "dimensions",
        "Weight": "weight",
        "TCIN": "tcin",
        "Origin": "origin",
    }
    spec_dict = {}
    if not isinstance(specifications, str):
        # Missing cells (None / NaN) carry no specifications
        return spec_dict
    for spec in specifications.split("|"):
        # Split on the FIRST colon only: values may contain colons themselves
        # (e.g. "Ratio 16:9") and must not invalidate the whole row.
        field, separator, value = spec.partition(":")
        if not separator:
            continue
        target = fields_mapping.get(field.strip())
        if target is not None:
            spec_dict[target] = value.strip()
    return spec_dict



def _strip_spec_text(text: str) -> str:
    """Remove surrounding whitespace and the ":" separator from a text fragment."""
    return text.strip().strip(":").strip()


def iter_parse(root: ET.Element) -> dict[str,str]:
    """Recursively parse the XML tree into a dictionary.

    Each key/value pair is inside its own <div> tag, with the key inside a
    <b> tag. A <div> that has no direct <b> child is treated as a container
    and parsed recursively, so pairs are found at any nesting depth.

    Args:
        root: element whose <div> children are inspected.

    Returns:
        Mapping of key text to value text, both stripped of surrounding
        whitespace and of the ":" separator.
    """
    spec_dict = {}
    for child in root:
        if child.tag != "div":
            continue
        if all(grandchild.tag != "b" for grandchild in child):
            # Container <div>: the pairs live further down
            spec_dict.update(iter_parse(child))
            continue
        fragments = list(child.itertext())
        # Skip leading whitespace-only fragments (e.g. indentation before <b>)
        while fragments and not fragments[0].strip():
            fragments.pop(0)
        if not fragments:
            # A <b> without any text gives nothing to use as a key
            continue
        key, *values = fragments
        spec_dict[_strip_spec_text(key)] = _strip_spec_text("".join(values))
    return spec_dict

def parse_raw_specs(raw_specs: str) -> dict[str,str]:
    """Parse a raw specifications XML string into a dictionary.

    The XML tree is first parsed recursively with ``iter_parse``; the keys
    found are then renamed through ``fields_mapping`` and any key not listed
    there is dropped. Source fields sharing a normalised key overwrite each
    other (the last one parsed wins).

    Args:
        raw_specs: markup of the specifications block.

    Returns:
        Mapping of normalised field name to value. Empty when ``raw_specs``
        is missing (not a string) or blank.

    Raises:
        xml.etree.ElementTree.ParseError: if the markup is not well-formed XML.
    """
    fields_mapping = {
        "Material": "materials",
        "Package Quantity": "packaging",
        "Number of Pieces": "packaging",
        "Dimensions (Overall)": "dimensions",
        "Dimensions": "dimensions",
        "Weight": "weight",
        "TCIN": "tcin",
        "Origin": "origin",
    }
    # Missing cells arrive as None / NaN from pandas: nothing to parse
    if not isinstance(raw_specs, str) or not raw_specs.strip():
        return {}
    xml_root = ET.fromstring(raw_specs)
    parsed = iter_parse(xml_root)
    return {
        fields_mapping[key]: value
        for key, value in parsed.items()
        if key in fields_mapping
    }
        

def parse_value(specs: str, value: str) -> str:
    """Return the normalised field named ``value`` from raw specifications.

    ``specs`` is parsed with ``parse_raw_specs``; the result is None when the
    field is absent.
    """
    parsed_specs = parse_raw_specs(specs)
    return parsed_specs.get(value)

# Add one column per normalised field to `data` (mutates `data` in place).
# Each pass parses every row's raw_specifications again; rows lacking the
# field get None.
for x in ["materials", "packaging", "dimensions", "weight", "tcin", "origin"]:
    data[x] = data["raw_specifications"].apply(parse_value, value=x)

# set(data["material"])


In [305]:
import re
from typing import Optional

# Example of the dimensions text format; `dimensions` is rebound to the parsed
# Series at the end of this cell.
dimensions = "16 inches (H) x 23 inches (W) x 23 inches (D)"

def parse_dimensions_measure(dimensions: str, measure: str) -> Optional[dict[str, object]]:
    """Extract the value and unit tagged with ``(measure)`` from a dimensions string.

    Args:
        dimensions: text such as "16 inches (H) x 23 inches (W)".
        measure: tag to look for inside the parentheses, e.g. "H". It is
            inserted into the regular expression as-is.

    Returns:
        ``{"value": float, "unit": lower-cased unit}`` for the first
        occurrence, or None when the tag is not found.

    Raises:
        ValueError: if the number uses a comma (e.g. "1,5"); commas are
            matched so that such values fail loudly instead of being
            silently truncated.
    """
    # Raw string: "\d", "\s" and "\(" are regex escapes, not string escapes.
    # The number needs at least one digit so float() never receives "".
    expr = rf"(?P<value>\d+[.,]?\d*|[.,]\d+)\s+(?P<unit>[a-zA-Z]*)\s+\({measure}\)"
    match = re.search(expr, dimensions)
    if match is None:
        return None
    return {
        "value": float(match.group("value")),
        "unit": match.group("unit").lower(),
    }

def units_to_cm(value: float, unit: str) -> float:
    """Convert ``value`` expressed in ``unit`` to centimetres.

    Supported units are "inches", "feet" and "cm"; any other unit raises
    KeyError.
    """
    cm_per_unit = dict(inches=2.54, feet=30.48, cm=1)
    factor = cm_per_unit[unit]
    return value * factor


def parse_dimensions(dimensions: Optional[str]) -> Optional[dict[str,float]]:
    """Parse a dimensions string into height / width / depth in centimetres.

    The "(H)", "(W)" and "(D)" measures are looked up independently; a
    measure that is not found is left out of the result. None input gives
    None.
    """
    if dimensions is None:
        return None
    tags = (("height", "H"), ("width", "W"), ("depth", "D"))
    # Parse all measures first, then convert the ones that were found
    found = {
        label: parse_dimensions_measure(dimensions, tag)
        for label, tag in tags
    }
    return {
        label: units_to_cm(**measure)
        for label, measure in found.items()
        if measure is not None
    }

# Parse every row's dimensions text; `dimensions` now holds one dict (or None) per row
dimensions = data["dimensions"].apply(parse_dimensions)
dimensions


0                                                   None
1                          {'width': 30.226000000000003}
2                                                   None
3                                                   None
4       {'height': 58.42, 'width': 2.54, 'depth': 58.42}
                             ...                        
162                      {'width': 12.7, 'depth': 24.13}
163                    {'height': 30.48, 'width': 30.48}
164    {'height': 12.065, 'width': 5.715, 'depth': 5....
165                                                 None
166    {'height': 11.43, 'width': 31.75, 'depth': 11.43}
Name: dimensions, Length: 167, dtype: object

In [341]:
def units_to_g(value: float, unit: str) -> float:
    """Convert ``value`` expressed in ``unit`` to grams.

    Supported units are "pounds", "ounces" and "g"; any other unit raises
    KeyError.
    """
    grams_per_unit = dict(pounds=453.592, ounces=28.3495, g=1)
    factor = grams_per_unit[unit]
    return value * factor

def parse_weight(weight: str):
    """Parse a weight such as "2.5 Pounds" into grams.

    Args:
        weight: text holding a number, whitespace and a unit name.

    Returns:
        The weight in grams, or None when ``weight`` is missing (not a
        string) or holds no "<number> <unit>" pair. Returning None rather
        than the unparsed text keeps the result numeric.

    Raises:
        KeyError: if the unit is not known to ``units_to_g``.
        ValueError: if the number uses a comma (e.g. "1,5").
    """
    if not isinstance(weight, str):
        return None
    # Raw string so "\d" / "\s" are regex escapes. The number requires at
    # least one digit, so the match can no longer start at a bare whitespace
    # separator with an empty value.
    expr = r"(?P<value>\d+[.,]?\d*|[.,]\d+)\s+(?P<unit>[a-zA-Z]*)"
    match = re.search(expr, weight)
    if match is None:
        return None
    value = float(match.group("value"))
    unit = match.group("unit").lower()
    return units_to_g(value, unit)

def calculate_dimensional_weight(dimensions: dict[str,float], divisor: float = 5000):
    """Return the dimensional weight, in grams, of a box.

    The dimensional weight in kg is Length * Height * Width (in cm) divided
    by ``divisor``; the result is converted to grams before being returned.

    Args:
        dimensions: mapping with "height", "width" and "depth" in cm, or None.
        divisor: volumetric divisor in cm^3 per kg.

    Returns:
        The dimensional weight in grams, or None when ``dimensions`` is None
        or any of the three measures is missing.
    """
    grams_per_kg = 1000
    if dimensions is None:
        return None

    sides = [dimensions.get(name) for name in ("height", "width", "depth")]
    if any(side is None for side in sides):
        return None

    height, width, depth = sides
    # The formula yields kg; scale to grams as promised
    return height * width * depth / divisor * grams_per_kg

# Parse the declared weight of every row
weight = data["weight"].apply(parse_weight)
# weight

# Dimensional weight for every row (None where the dimensions are incomplete)
dimensional_weight = list(map(calculate_dimensional_weight, dimensions))
# dimensional_weight

# Where no weight was parsed, fall back to the dimensional weight.
# pd.Series(dimensional_weight) gets a default 0..n-1 index, so this
# presumably relies on `data` having the default index too; verify if rows
# are ever filtered before this cell.
sel = weight.isnull()
weight[sel] = pd.Series(dimensional_weight)[sel]

In [404]:
# Extract the "materials" field from every row's raw specifications
materials = data["raw_specifications"].apply(parse_value, value="materials")

# scoreable materials are:
# * metal
# * wood
# * glass
# * resin
# * fabric
# * plastic

def material_classifier(material: str) -> str:
    """Map a cleaned material name to one of the scoreable materials.

    The scoreable materials are:
      * metal
      * wood
      * glass
      * resin
      * fabric
      * plastic
    A few materials, like stoneware and cardboard, do not fit any of those
    and keep their own label; they have to remain unscored for now.

    The first keyword of ``mapping`` contained in ``material`` decides the
    category; when no keyword is found ``material`` is returned unchanged.
    """
    # Insertion order matters: keywords are tried top to bottom.
    # NOTE(review): "resin" is listed as scoreable on its own above but is
    # mapped to "plastic" here - confirm which one is intended.
    mapping = {
        "polyester": "fabric",
        "spandex": "fabric",
        "leather": "fabric",
        "cardboard": "cardboard",
        "crystal": "glass",
        "hardwood": "wood",
        "plywood": "wood",
        "mdf": "wood",
        "wood": "wood",
        "steel": "metal",
        "polycarbonate": "plastic",
        "polypropylene": "plastic",
        "pvc": "plastic",
        "resin": "plastic",
        "stoneware": "stoneware",
        "paper": "cardboard",
    }
    for keyword, category in mapping.items():
        if keyword in material:
            return category
    return material

def clean_material_name(material: str) -> str:
    """Normalise a single material name.

    Removes the parenthesised annotation (from the first "(" to the last
    ")"), removes amounts such as "100%" or "99.5%", then strips surrounding
    whitespace and lower-cases the result.
    """
    # Raw strings: "\(" and "\d" are regex escapes, not string escapes
    no_paren_annotations = re.sub(r"\(.*\)", "", material)
    # The optional decimal part keeps "99.5%" from leaving a stray "."
    no_amounts = re.sub(r"\d+(?:\.\d+)?%?", "", no_paren_annotations)
    return no_amounts.strip().lower()
    
def parse_materials(materials: str):
    """Turn a comma-separated materials string into a list of categories.

    Each item is cleaned with ``clean_material_name`` and classified with
    ``material_classifier``. Items that are empty after cleaning are skipped
    and duplicates are removed, keeping the order of first appearance so the
    result is the same on every run.

    Returns:
        The list of distinct categories, or None when ``materials`` is
        missing (not a string).
    """
    if not isinstance(materials, str):
        return None
    cleaned_names = (clean_material_name(item) for item in materials.split(","))
    categories = (material_classifier(name) for name in cleaned_names if name)
    # dict.fromkeys de-duplicates while preserving insertion order
    return list(dict.fromkeys(categories))

# Classified materials per row (None where the row declares no material)
clean_materials = materials.apply(parse_materials)
clean_materials

0             None
1      [cardboard]
2             None
3      [cardboard]
4          [metal]
          ...     
162      [plastic]
163           None
164       [fabric]
165           None
166    [stoneware]
Name: raw_specifications, Length: 167, dtype: object

In [419]:
# Extract the "packaging" field; rows without it default to 1.
# The parsed values come straight from the specifications text while the
# default is the integer 1, hence the object dtype shown below.
packaging = data["raw_specifications"].apply(parse_value, value="packaging")
packaging[packaging.isnull()] = 1
packaging

0        1
1        1
2        1
3       24
4        1
      ... 
162      1
163      1
164      1
165      1
166      1
Name: raw_specifications, Length: 167, dtype: object

In [429]:
def clean_origin_name(origin: str) -> Optional[str]:
    """Map an origin description to "usa", "imported" or "mixed".

    The text is lower-cased and stripped before the lookup.

    Returns:
        The normalised origin, or None when ``origin`` is missing (not a
        string), consistent with the other field parsers.

    Raises:
        KeyError: if the description is not one of the known wordings, so
            that new wordings are noticed instead of silently passing.
    """
    mapping = {
        "assem usa w/foreign/dom. parts": "mixed",
        "imported": "imported",
        "made in the usa": "usa",
        "made in the usa or imported": "mixed",
    }
    if not isinstance(origin, str):
        return None
    return mapping[origin.lower().strip()]

# Extract the "origin" field of every row and normalise it
origin = data["raw_specifications"].apply(parse_value, value="origin")
clean_origin = origin.apply(clean_origin_name)


In [453]:
# Assemble the cleaned fields into a single frame, one column per field.
# NOTE(review): `get_val` is not defined anywhere in the visible notebook;
# presumably it returns the entry `i` of a dimensions dict (None-safe).
# Confirm it is defined before this cell, otherwise a fresh kernel raises
# NameError here.
clean_data = pd.DataFrame(
    data={
        "materials": clean_materials,
        "packaging": packaging,
        "origin": clean_origin,
        "weight": weight,
        "height": dimensions.apply(get_val, i="height"),
        "width": dimensions.apply(get_val, i="width"),
        "depth": dimensions.apply(get_val, i="depth"),
        "tcin": data["raw_specifications"].apply(parse_value, value="tcin"),
        "primary_category": data["primary_category"]
    }
)

# Not the last expression of the cell, so IPython does not display it
clean_data

# Shape of the subset of rows that have no missing value in any column
clean_data[~clean_data.isnull().any(axis=1)].shape

(57, 9)