feat: added analysis notebook

main
Ricard Illa 2023-06-25 23:24:54 +02:00
parent 900955d92d
commit 15178b0b3c
No known key found for this signature in database
GPG Key ID: F69A672B72E54902
1 changed files with 391 additions and 0 deletions

391
notebooks/analysis.ipynb Normal file
View File

@ -0,0 +1,391 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"id": "a0dcf44b-c609-4701-8007-b270cf8c3d35",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tcin</th>\n",
" <th>gtin13</th>\n",
" <th>ingestion_time</th>\n",
" <th>primary_category</th>\n",
" <th>materials</th>\n",
" <th>packaging</th>\n",
" <th>origin</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>width</th>\n",
" <th>depth</th>\n",
" <th>ingestion_time</th>\n",
" <th>material_score</th>\n",
" <th>weight_score</th>\n",
" <th>packaging_score</th>\n",
" <th>origin_score</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>81917300</td>\n",
" <td>840391145528</td>\n",
" <td>2023-06-25 20:31:00.725924</td>\n",
" <td>Toys</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>imported</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2023-06-25 20:31:00.725924</td>\n",
" <td>0.625000</td>\n",
" <td>NaN</td>\n",
" <td>0.6</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>84821007</td>\n",
" <td>9781801433983</td>\n",
" <td>2023-06-25 20:31:00.736690</td>\n",
" <td>School &amp; Office Supplies</td>\n",
" <td>[cardboard]</td>\n",
" <td>1</td>\n",
" <td>imported</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>30.23</td>\n",
" <td>NaN</td>\n",
" <td>2023-06-25 20:31:00.736690</td>\n",
" <td>0.253333</td>\n",
" <td>NaN</td>\n",
" <td>0.6</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>15432753</td>\n",
" <td>883929408115</td>\n",
" <td>2023-06-25 20:31:00.742077</td>\n",
" <td>Movies, Music &amp; Books</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>usa</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2023-06-25 20:31:00.742077</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.6</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>84199597</td>\n",
" <td>194425194489</td>\n",
" <td>2023-06-25 20:31:00.746501</td>\n",
" <td>Party Supplies</td>\n",
" <td>[cardboard]</td>\n",
" <td>24</td>\n",
" <td>imported</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2023-06-25 20:31:00.746501</td>\n",
" <td>0.625000</td>\n",
" <td>NaN</td>\n",
" <td>14.4</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>86345566</td>\n",
" <td>23271231140</td>\n",
" <td>2023-06-25 20:31:00.751118</td>\n",
" <td>Home</td>\n",
" <td>[metal]</td>\n",
" <td>1</td>\n",
" <td>imported</td>\n",
" <td>2109.20</td>\n",
" <td>58.42</td>\n",
" <td>2.54</td>\n",
" <td>58.42</td>\n",
" <td>2023-06-25 20:31:00.751118</td>\n",
" <td>0.353333</td>\n",
" <td>1581.9000</td>\n",
" <td>0.6</td>\n",
" <td>0.0</td>\n",
" <td>1582.853333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>83388852</td>\n",
" <td>4717592035292</td>\n",
" <td>2023-06-25 20:31:01.380622</td>\n",
" <td>Sports &amp; Outdoors</td>\n",
" <td>[plastic]</td>\n",
" <td>1</td>\n",
" <td>mixed</td>\n",
" <td>127.01</td>\n",
" <td>NaN</td>\n",
" <td>12.70</td>\n",
" <td>24.13</td>\n",
" <td>2023-06-25 20:31:01.380622</td>\n",
" <td>0.366667</td>\n",
" <td>95.2575</td>\n",
" <td>0.6</td>\n",
" <td>0.5</td>\n",
" <td>96.724167</td>\n",
" </tr>\n",
" <tr>\n",
" <th>163</th>\n",
" <td>80836585</td>\n",
" <td>841821016982</td>\n",
" <td>2023-06-25 20:31:01.384865</td>\n",
" <td>Patio &amp; Garden</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>mixed</td>\n",
" <td>14514.94</td>\n",
" <td>30.48</td>\n",
" <td>30.48</td>\n",
" <td>NaN</td>\n",
" <td>2023-06-25 20:31:01.384865</td>\n",
" <td>0.112500</td>\n",
" <td>10886.2050</td>\n",
" <td>0.6</td>\n",
" <td>0.5</td>\n",
" <td>10887.417500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>164</th>\n",
" <td>75477923</td>\n",
" <td>93422863070</td>\n",
" <td>2023-06-25 20:31:01.388505</td>\n",
" <td>Holiday Shop</td>\n",
" <td>[fabric]</td>\n",
" <td>1</td>\n",
" <td>mixed</td>\n",
" <td>78.64</td>\n",
" <td>12.06</td>\n",
" <td>5.71</td>\n",
" <td>5.71</td>\n",
" <td>2023-06-25 20:31:01.388505</td>\n",
" <td>0.403571</td>\n",
" <td>58.9800</td>\n",
" <td>0.6</td>\n",
" <td>0.5</td>\n",
" <td>60.483571</td>\n",
" </tr>\n",
" <tr>\n",
" <th>165</th>\n",
" <td>85634544</td>\n",
" <td>194425213968</td>\n",
" <td>2023-06-25 20:31:01.391389</td>\n",
" <td>Household Essentials</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>imported</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2023-06-25 20:31:01.391389</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.6</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>80239765</td>\n",
" <td>724235717129</td>\n",
" <td>2023-06-25 20:31:01.394481</td>\n",
" <td>Kitchen &amp; Dining</td>\n",
" <td>[stoneware]</td>\n",
" <td>1</td>\n",
" <td>imported</td>\n",
" <td>829.60</td>\n",
" <td>11.43</td>\n",
" <td>31.75</td>\n",
" <td>11.43</td>\n",
" <td>2023-06-25 20:31:01.394481</td>\n",
" <td>NaN</td>\n",
" <td>622.2000</td>\n",
" <td>0.6</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>167 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" tcin gtin13 ingestion_time \\\n",
"0 81917300 840391145528 2023-06-25 20:31:00.725924 \n",
"1 84821007 9781801433983 2023-06-25 20:31:00.736690 \n",
"2 15432753 883929408115 2023-06-25 20:31:00.742077 \n",
"3 84199597 194425194489 2023-06-25 20:31:00.746501 \n",
"4 86345566 23271231140 2023-06-25 20:31:00.751118 \n",
".. ... ... ... \n",
"162 83388852 4717592035292 2023-06-25 20:31:01.380622 \n",
"163 80836585 841821016982 2023-06-25 20:31:01.384865 \n",
"164 75477923 93422863070 2023-06-25 20:31:01.388505 \n",
"165 85634544 194425213968 2023-06-25 20:31:01.391389 \n",
"166 80239765 724235717129 2023-06-25 20:31:01.394481 \n",
"\n",
" primary_category materials packaging origin weight \\\n",
"0 Toys None 1 imported NaN \n",
"1 School & Office Supplies [cardboard] 1 imported NaN \n",
"2 Movies, Music & Books None 1 usa NaN \n",
"3 Party Supplies [cardboard] 24 imported NaN \n",
"4 Home [metal] 1 imported 2109.20 \n",
".. ... ... ... ... ... \n",
"162 Sports & Outdoors [plastic] 1 mixed 127.01 \n",
"163 Patio & Garden None 1 mixed 14514.94 \n",
"164 Holiday Shop [fabric] 1 mixed 78.64 \n",
"165 Household Essentials None 1 imported NaN \n",
"166 Kitchen & Dining [stoneware] 1 imported 829.60 \n",
"\n",
" height width depth ingestion_time material_score \\\n",
"0 NaN NaN NaN 2023-06-25 20:31:00.725924 0.625000 \n",
"1 NaN 30.23 NaN 2023-06-25 20:31:00.736690 0.253333 \n",
"2 NaN NaN NaN 2023-06-25 20:31:00.742077 NaN \n",
"3 NaN NaN NaN 2023-06-25 20:31:00.746501 0.625000 \n",
"4 58.42 2.54 58.42 2023-06-25 20:31:00.751118 0.353333 \n",
".. ... ... ... ... ... \n",
"162 NaN 12.70 24.13 2023-06-25 20:31:01.380622 0.366667 \n",
"163 30.48 30.48 NaN 2023-06-25 20:31:01.384865 0.112500 \n",
"164 12.06 5.71 5.71 2023-06-25 20:31:01.388505 0.403571 \n",
"165 NaN NaN NaN 2023-06-25 20:31:01.391389 NaN \n",
"166 11.43 31.75 11.43 2023-06-25 20:31:01.394481 NaN \n",
"\n",
" weight_score packaging_score origin_score score \n",
"0 NaN 0.6 0.0 NaN \n",
"1 NaN 0.6 0.0 NaN \n",
"2 NaN 0.6 1.0 NaN \n",
"3 NaN 14.4 0.0 NaN \n",
"4 1581.9000 0.6 0.0 1582.853333 \n",
".. ... ... ... ... \n",
"162 95.2575 0.6 0.5 96.724167 \n",
"163 10886.2050 0.6 0.5 10887.417500 \n",
"164 58.9800 0.6 0.5 60.483571 \n",
"165 NaN 0.6 0.0 NaN \n",
"166 622.2000 0.6 0.0 NaN \n",
"\n",
"[167 rows x 17 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from sqlalchemy import create_engine\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Connection string: overridable through the environment so that credentials do\n",
"# not have to be edited into (and committed with) the notebook. The fallback is\n",
"# the DSN this notebook originally hard-coded.\n",
"DATABASE_URL = os.environ.get(\n",
"    'SUSTAINABILITY_SCORE_DB_URL',\n",
"    'postgresql://sustainability_score:sustainability_score@postgres:5432/sustainability_score',\n",
")\n",
"engine = create_engine(DATABASE_URL)\n",
"\n",
"# Join every product row with its computed scores on the shared tcin key.\n",
"# NOTE: SELECT * over this join returns an ingestion_time column from each table,\n",
"# so the resulting frame carries two columns with that name.\n",
"query = \"\"\"\n",
" SELECT *\n",
" FROM sustainability_score.products AS products\n",
" JOIN sustainability_score.scored_products AS scores\n",
" USING (tcin);\n",
"\"\"\"\n",
"\n",
"products = pd.read_sql_query(query, engine)\n",
"products"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f00acc1-4dec-45f9-9e38-dcae2b7a271d",
"metadata": {},
"outputs": [],
"source": [
"# `weight` was never defined anywhere in this notebook, so this cell raised a\n",
"# NameError on a fresh kernel (Restart & Run All).\n",
"# NOTE(review): assumes the products['weight'] column was intended - confirm.\n",
"# Missing weights (NaN) are dropped so they do not enter the histograms.\n",
"weight = products['weight'].dropna()\n",
"\n",
"# Two side-by-side panels: the full distribution and the weight <= 1 subset.\n",
"fig, (ax_all, ax_small) = plt.subplots(1, 2)\n",
"\n",
"ax_all.hist(weight, color='blue', edgecolor='black', bins=50)\n",
"ax_all.set(title='All weights', xlabel='weight', ylabel='products')\n",
"\n",
"ax_small.hist(weight[weight <= 1], color='blue', edgecolor='black', bins=50)\n",
"ax_small.set(title='weight <= 1', xlabel='weight', ylabel='products')\n",
"\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}