Delete node2vecTA.ipynb

parent b7411b5c
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = pd.read_csv(\"C:/Users/Agusti Frananda/Documents/PROYEK/myntra-mens-product-dataset/men-products.csv\", delimiter=',', index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 61456 entries, 1 to 61503\n",
"Data columns (total 10 columns):\n",
"NAME 61455 non-null object\n",
"CATEGORY 61456 non-null object\n",
"DESCRIPTION & COLOR 61456 non-null object\n",
"FABRIC 56623 non-null object\n",
"IMAGE 61456 non-null object\n",
"SIZE 57618 non-null object\n",
"PRICE 61456 non-null object\n",
"PRODUCT ID 61456 non-null int64\n",
"WEBSITE 61456 non-null object\n",
"PRODUCT URL 61456 non-null object\n",
"dtypes: int64(1), object(9)\n",
"memory usage: 5.2+ MB\n"
]
}
],
"source": [
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>total_missing</th>\n",
" <th>percent_missing</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>NAME</td>\n",
" <td>1</td>\n",
" <td>0.005</td>\n",
" </tr>\n",
" <tr>\n",
" <td>CATEGORY</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>DESCRIPTION &amp; COLOR</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>FABRIC</td>\n",
" <td>4833</td>\n",
" <td>24.165</td>\n",
" </tr>\n",
" <tr>\n",
" <td>IMAGE</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>SIZE</td>\n",
" <td>3838</td>\n",
" <td>19.190</td>\n",
" </tr>\n",
" <tr>\n",
" <td>PRICE</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>PRODUCT ID</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>WEBSITE</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>PRODUCT URL</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" total_missing percent_missing\n",
"NAME 1 0.005\n",
"CATEGORY 0 0.000\n",
"DESCRIPTION & COLOR 0 0.000\n",
"FABRIC 4833 24.165\n",
"IMAGE 0 0.000\n",
"SIZE 3838 19.190\n",
"PRICE 0 0.000\n",
"PRODUCT ID 0 0.000\n",
"WEBSITE 0 0.000\n",
"PRODUCT URL 0 0.000"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"missing_data = pd.DataFrame({'total_missing': data.isnull().sum(), 'percent_missing': (data.isnull().sum()/20000)*100})\n",
"missing_data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NAME</th>\n",
" <th>CATEGORY</th>\n",
" <th>DESCRIPTION &amp; COLOR</th>\n",
" <th>FABRIC</th>\n",
" <th>IMAGE</th>\n",
" <th>SIZE</th>\n",
" <th>PRICE</th>\n",
" <th>PRODUCT ID</th>\n",
" <th>WEBSITE</th>\n",
" <th>PRODUCT URL</th>\n",
" </tr>\n",
" <tr>\n",
" <th>SERIAL NO</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>U.S. Polo Assn. Men Brown Genuine Leather Two ...</td>\n",
" <td>accessories</td>\n",
" <td>U.S. Polo Assn. Men Brown Genuine Leather Two ...</td>\n",
" <td>Genuine leather</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Height: 11.5 cm</td>\n",
" <td>809</td>\n",
" <td>1943420</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/wallets/us-polo-assn/us...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>Baggit Men Black Solid Two Fold Wallet</td>\n",
" <td>accessories</td>\n",
" <td>Baggit Men Black Solid Two Fold Wallet, Baggi...</td>\n",
" <td>PU</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Height:</td>\n",
" <td>720</td>\n",
" <td>4608404</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/wallets/baggit/baggit-m...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>HRX by Hrithik Roshan Men Grey Solid Baseball Cap</td>\n",
" <td>accessories</td>\n",
" <td>HRX By Hrithik Roshan Men Grey Solid Baseball ...</td>\n",
" <td>NaN</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>NaN</td>\n",
" <td>279</td>\n",
" <td>2178513</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/caps/hrx-by-hrithik-ros...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>Puma Unisex Grey Style Military Solid Baseball...</td>\n",
" <td>accessories</td>\n",
" <td>Puma Unisex Grey Style Military Solid Baseball...</td>\n",
" <td>NaN</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>NaN</td>\n",
" <td>499</td>\n",
" <td>6699035</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/caps/puma/puma-unisex-g...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>FabSeasons Beige Solid Scarf</td>\n",
" <td>accessories</td>\n",
" <td>FabSeasons Beige Solid Scarf, FabSeasons, Scar...</td>\n",
" <td>Acrylic</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Length:0.9 m</td>\n",
" <td>449</td>\n",
" <td>2439658</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/scarves/fabseasons/fabs...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>Ed Hardy Men Black Embellished Belt</td>\n",
" <td>accessories</td>\n",
" <td>Ed Hardy Men Black Embellished Belt, Ed Hardy...</td>\n",
" <td>Leather</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Width: 3.7 cm</td>\n",
" <td>1199</td>\n",
" <td>2238752</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/belts/ed-hardy/ed-hardy...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>Roadster Men Tan Brown Leather Belt</td>\n",
" <td>accessories</td>\n",
" <td>Roadster Men Tan Brown Leather Belt, Roadster,...</td>\n",
" <td>Leather</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Width: 4 cm</td>\n",
" <td>419</td>\n",
" <td>2975974</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/belts/roadster/roadster...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>Peora Silver-Toned Rhodium-Plated Stone-Studde...</td>\n",
" <td>accessories</td>\n",
" <td>Peora Silver Toned Rhodium Plated Stone Studde...</td>\n",
" <td>NaN</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>NaN</td>\n",
" <td>551</td>\n",
" <td>3006095</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/ring/peora/peora-silver...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>Royal Enfield Unisex White Urban Trooper Helme...</td>\n",
" <td>accessories</td>\n",
" <td>Royal Enfield Unisex White Urban Trooper Helme...</td>\n",
" <td>NaN</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>NaN</td>\n",
" <td>3500</td>\n",
" <td>2242802</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/helmets/royal-enfield/r...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>BuckleUp Men Black Leather Belt</td>\n",
" <td>accessories</td>\n",
" <td>BuckleUp Men Black Leather Belt, BuckleUp, Bel...</td>\n",
" <td>Leather</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Width: 3.5 cm</td>\n",
" <td>517</td>\n",
" <td>1734718</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/belts/buckleup/buckleup...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NAME CATEGORY \\\n",
"SERIAL NO \n",
"1 U.S. Polo Assn. Men Brown Genuine Leather Two ... accessories \n",
"2 Baggit Men Black Solid Two Fold Wallet accessories \n",
"3 HRX by Hrithik Roshan Men Grey Solid Baseball Cap accessories \n",
"4 Puma Unisex Grey Style Military Solid Baseball... accessories \n",
"5 FabSeasons Beige Solid Scarf accessories \n",
"6 Ed Hardy Men Black Embellished Belt accessories \n",
"7 Roadster Men Tan Brown Leather Belt accessories \n",
"8 Peora Silver-Toned Rhodium-Plated Stone-Studde... accessories \n",
"9 Royal Enfield Unisex White Urban Trooper Helme... accessories \n",
"10 BuckleUp Men Black Leather Belt accessories \n",
"\n",
" DESCRIPTION & COLOR \\\n",
"SERIAL NO \n",
"1 U.S. Polo Assn. Men Brown Genuine Leather Two ... \n",
"2 Baggit Men Black Solid Two Fold Wallet, Baggi... \n",
"3 HRX By Hrithik Roshan Men Grey Solid Baseball ... \n",
"4 Puma Unisex Grey Style Military Solid Baseball... \n",
"5 FabSeasons Beige Solid Scarf, FabSeasons, Scar... \n",
"6 Ed Hardy Men Black Embellished Belt, Ed Hardy... \n",
"7 Roadster Men Tan Brown Leather Belt, Roadster,... \n",
"8 Peora Silver Toned Rhodium Plated Stone Studde... \n",
"9 Royal Enfield Unisex White Urban Trooper Helme... \n",
"10 BuckleUp Men Black Leather Belt, BuckleUp, Bel... \n",
"\n",
" FABRIC \\\n",
"SERIAL NO \n",
"1 Genuine leather \n",
"2 PU \n",
"3 NaN \n",
"4 NaN \n",
"5 Acrylic \n",
"6 Leather \n",
"7 Leather \n",
"8 NaN \n",
"9 NaN \n",
"10 Leather \n",
"\n",
" IMAGE SIZE \\\n",
"SERIAL NO \n",
"1 https://assets.myntassets.com/h_1440,q_100,w_1... Height: 11.5 cm \n",
"2 https://assets.myntassets.com/h_1440,q_100,w_1... Height: \n",
"3 https://assets.myntassets.com/h_1440,q_100,w_1... NaN \n",
"4 https://assets.myntassets.com/h_1440,q_100,w_1... NaN \n",
"5 https://assets.myntassets.com/h_1440,q_100,w_1... Length:0.9 m \n",
"6 https://assets.myntassets.com/h_1440,q_100,w_1... Width: 3.7 cm \n",
"7 https://assets.myntassets.com/h_1440,q_100,w_1... Width: 4 cm \n",
"8 https://assets.myntassets.com/h_1440,q_100,w_1... NaN \n",
"9 https://assets.myntassets.com/h_1440,q_100,w_1... NaN \n",
"10 https://assets.myntassets.com/h_1440,q_100,w_1... Width: 3.5 cm \n",
"\n",
" PRICE PRODUCT ID WEBSITE \\\n",
"SERIAL NO \n",
"1 809 1943420 Myntra \n",
"2 720 4608404 Myntra \n",
"3 279 2178513 Myntra \n",
"4 499 6699035 Myntra \n",
"5 449 2439658 Myntra \n",
"6 1199 2238752 Myntra \n",
"7 419 2975974 Myntra \n",
"8 551 3006095 Myntra \n",
"9 3500 2242802 Myntra \n",
"10 517 1734718 Myntra \n",
"\n",
" PRODUCT URL \n",
"SERIAL NO \n",
"1 https://www.myntra.com/wallets/us-polo-assn/us... \n",
"2 https://www.myntra.com/wallets/baggit/baggit-m... \n",
"3 https://www.myntra.com/caps/hrx-by-hrithik-ros... \n",
"4 https://www.myntra.com/caps/puma/puma-unisex-g... \n",
"5 https://www.myntra.com/scarves/fabseasons/fabs... \n",
"6 https://www.myntra.com/belts/ed-hardy/ed-hardy... \n",
"7 https://www.myntra.com/belts/roadster/roadster... \n",
"8 https://www.myntra.com/ring/peora/peora-silver... \n",
"9 https://www.myntra.com/helmets/royal-enfield/r... \n",
"10 https://www.myntra.com/belts/buckleup/buckleup... "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"del data['FABRIC']\n",
"del data['IMAGE']\n",
"del data['SIZE']\n",
"del data['WEBSITE']\n",
"del data['PRODUCT URL']\n",
"del data['PRICE']\n",
"del data['PRODUCT ID']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data.rename(columns = {'DESCRIPTION & COLOR':'DESCRIPTION'}, inplace = True) "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"data1 = data.sample(10000, random_state=1).copy()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Rename kategori produk\n",
"data1.replace({'CATEGORY': \n",
" {'accessories': 'Accesories', \n",
" 'casual-shirts': 'Casual Shirts',\n",
" 'Men-Casual-Trousers': 'Men Casual Trousers',\n",
" 'formal-shirts': 'Formal Shirts',\n",
" 'Men-Formal-Trousers': 'Men Formal Trousers',\n",
" 'men-jackets-coats': 'Men Jackets Coats',\n",
" 'men-swimwear': 'Men Swimwear',\n",
" 'men-suits': 'Men Suits'}}, \n",
" inplace= True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NAME</th>\n",
" <th>CATEGORY</th>\n",
" <th>DESCRIPTION</th>\n",
" </tr>\n",
" <tr>\n",
" <th>SERIAL NO</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>25553</td>\n",
" <td>Fort Collins Men Red Solid Padded Jacket</td>\n",
" <td>Men Jackets Coats</td>\n",
" <td>Fort Collins Men Red Solid Padded Jacket, For...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18640</td>\n",
" <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
" <td>Men Formal Trousers</td>\n",
" <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18543</td>\n",
" <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
" <td>Men Formal Trousers</td>\n",
" <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>21475</td>\n",
" <td>Hanes Charcoal Grey Thermal T-Shirt</td>\n",
" <td>Innerwear &amp; Sleapwear</td>\n",
" <td>Hanes Charcoal Grey Thermal T Shirt, Hanes, T...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14859</td>\n",
" <td>Hancock Men Blue Regular Fit Striped Formal Shirt</td>\n",
" <td>Formal Shirts</td>\n",
" <td>Hancock Men Blue Regular Fit Striped Formal Sh...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NAME \\\n",
"SERIAL NO \n",
"25553 Fort Collins Men Red Solid Padded Jacket \n",
"18640 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... \n",
"18543 Arrow Men Navy Blue Tapered Fit Checked Formal... \n",
"21475 Hanes Charcoal Grey Thermal T-Shirt \n",
"14859 Hancock Men Blue Regular Fit Striped Formal Shirt \n",
"\n",
" CATEGORY \\\n",
"SERIAL NO \n",
"25553 Men Jackets Coats \n",
"18640 Men Formal Trousers \n",
"18543 Men Formal Trousers \n",
"21475 Innerwear & Sleapwear \n",
"14859 Formal Shirts \n",
"\n",
" DESCRIPTION \n",
"SERIAL NO \n",
"25553 Fort Collins Men Red Solid Padded Jacket, For... \n",
"18640 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... \n",
"18543 Arrow Men Navy Blue Tapered Fit Checked Formal... \n",
"21475 Hanes Charcoal Grey Thermal T Shirt, Hanes, T... \n",
"14859 Hancock Men Blue Regular Fit Striped Formal Sh... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data1.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"text_total = data1.DESCRIPTION \n",
"text_total = text_total.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 Fort Collins Men Red Solid Padded Jacket, For...\n",
"1 MANGO MAN Men Navy Blue Tailored Slim Fit Soli...\n",
"2 Arrow Men Navy Blue Tapered Fit Checked Formal...\n",
"3 Hanes Charcoal Grey Thermal T Shirt, Hanes, T...\n",
"4 Hancock Men Blue Regular Fit Striped Formal Sh...\n",
"Name: DESCRIPTION, dtype: object\n"
]
}
],
"source": [
"print(text_total.head())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:17: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
" from collections import Mapping, defaultdict\n"
]
}
],
"source": [
"import re\n",
"from nltk.corpus import stopwords\n",
"import pandas as pd\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import sent_tokenize, word_tokenize\n",
"\n",
"def review_to_words(raw_text):\n",
"\n",
" # keep only words\n",
" letters_only_text = re.sub(\"[^a-zA-Z]\", \" \", raw_text)\n",
"\n",
" # convert to lower case and split \n",
" words = letters_only_text.lower().split()\n",
"\n",
" # remove stopwords\n",
" stopword_set = set(stopwords.words(\"english\"))\n",
" meaningful_words = [w for w in words if w not in stopword_set]\n",
" \n",
" #stemmed words\n",
" ps = PorterStemmer()\n",
" stemmed_words = [ps.stem(word) for word in meaningful_words]\n",
" \n",
" #join the cleaned words in a list\n",
" cleaned_word_list = \" \".join(stemmed_words)\n",
" \n",
" return cleaned_word_list"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['fort collin men red solid pad jacket fort collin jacket topwear apparel apparel men buy fort collin men red solid pad jacket onlin india buy jacket best price', 'mango man men navi blue tailor slim fit solid formal trouser mango man trouser bottomwear apparel apparel men buy mango man men navi blue tailor slim fit solid formal trouser onlin india buy trouser best price', 'arrow men navi blue taper fit check formal trouser arrow trouser bottomwear apparel apparel men buy arrow men navi blue taper fit check formal trouser onlin india buy trouser best price', 'hane charcoal grey thermal shirt hane thermal top innerwear apparel apparel men buy hane charcoal grey thermal shirt onlin india buy thermal top best price', 'hancock men blue regular fit stripe formal shirt hancock shirt topwear apparel apparel men buy hancock men blue regular fit stripe formal shirt onlin india buy shirt best price']\n"
]
}
],
"source": [
"# apply it to our text data \n",
"# dataset is named wine_data and the text are in the column \"wmn\"\n",
"processed_wmn = [review_to_words(str(text)) for text in text_total]\n",
"print(processed_wmn[:5])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"processed_words = \" \".join(processed_wmn).split()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def transform_sentences_to_pair_of_words(sentences):\n",
" list_of_pair_of_words = []\n",
" for i in range(len(sentences)):\n",
" buffer_pair_of_words = (sentences[i-1],sentences[i])\n",
" list_of_pair_of_words.append(buffer_pair_of_words)\n",
" del list_of_pair_of_words[0]\n",
" return list_of_pair_of_words"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"sentences_to_pair = transform_sentences_to_pair_of_words(processed_words)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('fort', 'collin'),\n",
" ('collin', 'men'),\n",
" ('men', 'red'),\n",
" ('red', 'solid'),\n",
" ('solid', 'pad'),\n",
" ('pad', 'jacket'),\n",
" ('jacket', 'fort'),\n",
" ('fort', 'collin'),\n",
" ('collin', 'jacket'),\n",
" ('jacket', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'fort'),\n",
" ('fort', 'collin'),\n",
" ('collin', 'men'),\n",
" ('men', 'red'),\n",
" ('red', 'solid'),\n",
" ('solid', 'pad'),\n",
" ('pad', 'jacket'),\n",
" ('jacket', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jacket'),\n",
" ('jacket', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'mango'),\n",
" ('mango', 'man'),\n",
" ('man', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'tailor'),\n",
" ('tailor', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'formal'),\n",
" ('formal', 'trouser'),\n",
" ('trouser', 'mango'),\n",
" ('mango', 'man'),\n",
" ('man', 'trouser'),\n",
" ('trouser', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'mango'),\n",
" ('mango', 'man'),\n",
" ('man', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'tailor'),\n",
" ('tailor', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'formal'),\n",
" ('formal', 'trouser'),\n",
" ('trouser', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'trouser'),\n",
" ('trouser', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'arrow'),\n",
" ('arrow', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'taper'),\n",
" ('taper', 'fit'),\n",
" ('fit', 'check'),\n",
" ('check', 'formal'),\n",
" ('formal', 'trouser'),\n",
" ('trouser', 'arrow'),\n",
" ('arrow', 'trouser'),\n",
" ('trouser', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'arrow'),\n",
" ('arrow', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'taper'),\n",
" ('taper', 'fit'),\n",
" ('fit', 'check'),\n",
" ('check', 'formal'),\n",
" ('formal', 'trouser'),\n",
" ('trouser', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'trouser'),\n",
" ('trouser', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'hane'),\n",
" ('hane', 'charcoal'),\n",
" ('charcoal', 'grey'),\n",
" ('grey', 'thermal'),\n",
" ('thermal', 'shirt'),\n",
" ('shirt', 'hane'),\n",
" ('hane', 'thermal'),\n",
" ('thermal', 'top'),\n",
" ('top', 'innerwear'),\n",
" ('innerwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'hane'),\n",
" ('hane', 'charcoal'),\n",
" ('charcoal', 'grey'),\n",
" ('grey', 'thermal'),\n",
" ('thermal', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'thermal'),\n",
" ('thermal', 'top'),\n",
" ('top', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'hancock'),\n",
" ('hancock', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'stripe'),\n",
" ('stripe', 'formal'),\n",
" ('formal', 'shirt'),\n",
" ('shirt', 'hancock'),\n",
" ('hancock', 'shirt'),\n",
" ('shirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'hancock'),\n",
" ('hancock', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'stripe'),\n",
" ('stripe', 'formal'),\n",
" ('formal', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'shirt'),\n",
" ('shirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'tantra'),\n",
" ('tantra', 'men'),\n",
" ('men', 'black'),\n",
" ('black', 'print'),\n",
" ('print', 'round'),\n",
" ('round', 'neck'),\n",
" ('neck', 'shirt'),\n",
" ('shirt', 'tantra'),\n",
" ('tantra', 'tshirt'),\n",
" ('tshirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'tantra'),\n",
" ('tantra', 'men'),\n",
" ('men', 'black'),\n",
" ('black', 'print'),\n",
" ('print', 'round'),\n",
" ('round', 'neck'),\n",
" ('neck', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'tshirt'),\n",
" ('tshirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'aeropostal'),\n",
" ('aeropostal', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'mildli'),\n",
" ('mildli', 'distress'),\n",
" ('distress', 'jean'),\n",
" ('jean', 'aeropostal'),\n",
" ('aeropostal', 'jean'),\n",
" ('jean', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'aeropostal'),\n",
" ('aeropostal', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'mildli'),\n",
" ('mildli', 'distress'),\n",
" ('distress', 'jean'),\n",
" ('jean', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jean'),\n",
" ('jean', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'ether'),\n",
" ('ether', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'anti'),\n",
" ('anti', 'microbi'),\n",
" ('microbi', 'cotton'),\n",
" ('cotton', 'linen'),\n",
" ('linen', 'shirt'),\n",
" ('shirt', 'ether'),\n",
" ('ether', 'shirt'),\n",
" ('shirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'ether'),\n",
" ('ether', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'anti'),\n",
" ('anti', 'microbi'),\n",
" ('microbi', 'cotton'),\n",
" ('cotton', 'linen'),\n",
" ('linen', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'shirt'),\n",
" ('shirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'roadster'),\n",
" ('roadster', 'men'),\n",
" ('men', 'white'),\n",
" ('white', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'clean'),\n",
" ('clean', 'look'),\n",
" ('look', 'jean'),\n",
" ('jean', 'roadster'),\n",
" ('roadster', 'jean'),\n",
" ('jean', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'roadster'),\n",
" ('roadster', 'men'),\n",
" ('men', 'white'),\n",
" ('white', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'clean'),\n",
" ('clean', 'look'),\n",
" ('look', 'jean'),\n",
" ('jean', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jean'),\n",
" ('jean', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'dollar'),\n",
" ('dollar', 'bigboss'),\n",
" ('bigboss', 'pack'),\n",
" ('pack', 'trunk'),\n",
" ('trunk', 'mdtr'),\n",
" ('mdtr', 'po'),\n",
" ('po', 'dollar'),\n",
" ('dollar', 'bigboss'),\n",
" ('bigboss', 'trunk'),\n",
" ('trunk', 'innerwear'),\n",
" ('innerwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'dollar'),\n",
" ('dollar', 'bigboss'),\n",
" ('bigboss', 'pack'),\n",
" ('pack', 'trunk'),\n",
" ('trunk', 'mdtr'),\n",
" ('mdtr', 'po'),\n",
" ('po', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'trunk'),\n",
" ('trunk', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'moda'),\n",
" ('moda', 'rapido'),\n",
" ('rapido', 'men'),\n",
" ('men', 'black'),\n",
" ('black', 'print'),\n",
" ('print', 'polo'),\n",
" ('polo', 'collar'),\n",
" ('collar', 'shirt'),\n",
" ('shirt', 'moda'),\n",
" ('moda', 'rapido'),\n",
" ('rapido', 'tshirt'),\n",
" ('tshirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'moda'),\n",
" ('moda', 'rapido'),\n",
" ('rapido', 'men'),\n",
" ('men', 'black'),\n",
" ('black', 'print'),\n",
" ('print', 'polo'),\n",
" ('polo', 'collar'),\n",
" ('collar', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'tshirt'),\n",
" ('tshirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'loui'),\n",
" ('loui', 'philipp'),\n",
" ('philipp', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'self'),\n",
" ('self', 'design'),\n",
" ('design', 'formal'),\n",
" ('formal', 'shirt'),\n",
" ('shirt', 'loui'),\n",
" ('loui', 'philipp'),\n",
" ('philipp', 'shirt'),\n",
" ('shirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'loui'),\n",
" ('loui', 'philipp'),\n",
" ('philipp', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'self'),\n",
" ('self', 'design'),\n",
" ('design', 'formal'),\n",
" ('formal', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'shirt'),\n",
" ('shirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'light'),\n",
" ('light', 'blue'),\n",
" ('blue', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'skinni'),\n",
" ('skinni', 'fit'),\n",
" ('fit', 'jean'),\n",
" ('jean', 'calvin'),\n",
" ('calvin', 'klein'),\n",
" ('klein', 'jean'),\n",
" ('jean', 'jean'),\n",
" ('jean', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'light'),\n",
" ('light', 'blue'),\n",
" ('blue', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'skinni'),\n",
" ('skinni', 'fit'),\n",
" ('fit', 'jean'),\n",
" ('jean', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jean'),\n",
" ('jean', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'highland'),\n",
" ('highland', 'men'),\n",
" ('men', 'oliv'),\n",
" ('oliv', 'green'),\n",
" ('green', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'camouflag'),\n",
" ('camouflag', 'print'),\n",
" ('print', 'casual'),\n",
" ('casual', 'shirt'),\n",
" ('shirt', 'highland'),\n",
" ('highland', 'shirt'),\n",
" ('shirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'highland'),\n",
" ('highland', 'men'),\n",
" ('men', 'oliv'),\n",
" ('oliv', 'green'),\n",
" ('green', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'camouflag'),\n",
" ('camouflag', 'print'),\n",
" ('print', 'casual'),\n",
" ('casual', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'shirt'),\n",
" ('shirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'u'),\n",
" ('u', 'polo'),\n",
" ('polo', 'assn'),\n",
" ('assn', 'denim'),\n",
" ('denim', 'co'),\n",
" ('co', 'men'),\n",
" ('men', 'white'),\n",
" ('white', 'blue'),\n",
" ('blue', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'print'),\n",
" ('print', 'casual'),\n",
" ('casual', 'shirt'),\n",
" ('shirt', 'u'),\n",
" ('u', 'polo'),\n",
" ('polo', 'assn'),\n",
" ('assn', 'denim'),\n",
" ('denim', 'co'),\n",
" ('co', 'shirt'),\n",
" ('shirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'u'),\n",
" ('u', 'polo'),\n",
" ('polo', 'assn'),\n",
" ('assn', 'denim'),\n",
" ('denim', 'co'),\n",
" ('co', 'men'),\n",
" ('men', 'white'),\n",
" ('white', 'blue'),\n",
" ('blue', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'print'),\n",
" ('print', 'casual'),\n",
" ('casual', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'shirt'),\n",
" ('shirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'levi'),\n",
" ('levi', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'casual'),\n",
" ('casual', 'shirt'),\n",
" ('shirt', 'levi'),\n",
" ('levi', 'shirt'),\n",
" ('shirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'levi'),\n",
" ('levi', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'casual'),\n",
" ('casual', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'shirt'),\n",
" ('shirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'loui'),\n",
" ('loui', 'philipp'),\n",
" ('philipp', 'sport'),\n",
" ('sport', 'men'),\n",
" ('men', 'charcoal'),\n",
" ('charcoal', 'grey'),\n",
" ('grey', 'solid'),\n",
" ('solid', 'tailor'),\n",
" ('tailor', 'jacket'),\n",
" ('jacket', 'loui'),\n",
" ('loui', 'philipp'),\n",
" ('philipp', 'sport'),\n",
" ('sport', 'jacket'),\n",
" ('jacket', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'loui'),\n",
" ('loui', 'philipp'),\n",
" ('philipp', 'sport'),\n",
" ('sport', 'men'),\n",
" ('men', 'charcoal'),\n",
" ('charcoal', 'grey'),\n",
" ('grey', 'solid'),\n",
" ('solid', 'tailor'),\n",
" ('tailor', 'jacket'),\n",
" ('jacket', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jacket'),\n",
" ('jacket', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'killer'),\n",
" ('killer', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'clean'),\n",
" ('clean', 'look'),\n",
" ('look', 'jean'),\n",
" ('jean', 'killer'),\n",
" ('killer', 'jean'),\n",
" ('jean', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'killer'),\n",
" ('killer', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'clean'),\n",
" ('clean', 'look'),\n",
" ('look', 'jean'),\n",
" ('jean', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jean'),\n",
" ('jean', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'peter'),\n",
" ('peter', 'england'),\n",
" ('england', 'casual'),\n",
" ('casual', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'regular'),\n",
" ('regular', 'trouser'),\n",
" ('trouser', 'peter'),\n",
" ('peter', 'england'),\n",
" ('england', 'casual'),\n",
" ('casual', 'trouser'),\n",
" ('trouser', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'peter'),\n",
" ('peter', 'england'),\n",
" ('england', 'casual'),\n",
" ('casual', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'regular'),\n",
" ('regular', 'trouser'),\n",
" ('trouser', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'trouser'),\n",
" ('trouser', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'arrow'),\n",
" ('arrow', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'taper'),\n",
" ('taper', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'formal'),\n",
" ('formal', 'trouser'),\n",
" ('trouser', 'arrow'),\n",
" ('arrow', 'trouser'),\n",
" ('trouser', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'arrow'),\n",
" ('arrow', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'taper'),\n",
" ('taper', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'formal'),\n",
" ('formal', 'trouser'),\n",
" ('trouser', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'trouser'),\n",
" ('trouser', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'v'),\n",
" ('v', 'dot'),\n",
" ('dot', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'self'),\n",
" ('self', 'design'),\n",
" ('design', 'formal'),\n",
" ('formal', 'trouser'),\n",
" ('trouser', 'v'),\n",
" ('v', 'dot'),\n",
" ('dot', 'trouser'),\n",
" ('trouser', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'v'),\n",
" ('v', 'dot'),\n",
" ('dot', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'self'),\n",
" ('self', 'design'),\n",
" ('design', 'formal'),\n",
" ('formal', 'trouser'),\n",
" ('trouser', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'trouser'),\n",
" ('trouser', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'gespo'),\n",
" ('gespo', 'men'),\n",
" ('men', 'white'),\n",
" ('white', 'print'),\n",
" ('print', 'round'),\n",
" ('round', 'neck'),\n",
" ('neck', 'shirt'),\n",
" ('shirt', 'gespo'),\n",
" ('gespo', 'tshirt'),\n",
" ('tshirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'gespo'),\n",
" ('gespo', 'men'),\n",
" ('men', 'white'),\n",
" ('white', 'print'),\n",
" ('print', 'round'),\n",
" ('round', 'neck'),\n",
" ('neck', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'tshirt'),\n",
" ('tshirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'smag'),\n",
" ('smag', 'men'),\n",
" ('men', 'mustard'),\n",
" ('mustard', 'solid'),\n",
" ('solid', 'lightweight'),\n",
" ('lightweight', 'tailor'),\n",
" ('tailor', 'jacket'),\n",
" ('jacket', 'smag'),\n",
" ('smag', 'jacket'),\n",
" ('jacket', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'smag'),\n",
" ('smag', 'men'),\n",
" ('men', 'mustard'),\n",
" ('mustard', 'solid'),\n",
" ('solid', 'lightweight'),\n",
" ('lightweight', 'tailor'),\n",
" ('tailor', 'jacket'),\n",
" ('jacket', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jacket'),\n",
" ('jacket', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'jack'),\n",
" ('jack', 'jone'),\n",
" ('jone', 'men'),\n",
" ('men', 'black'),\n",
" ('black', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'regular'),\n",
" ('regular', 'trouser'),\n",
" ('trouser', 'jack'),\n",
" ('jack', 'jone'),\n",
" ('jone', 'trouser'),\n",
" ('trouser', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'jack'),\n",
" ('jack', 'jone'),\n",
" ('jone', 'men'),\n",
" ('men', 'black'),\n",
" ('black', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'regular'),\n",
" ('regular', 'trouser'),\n",
" ('trouser', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'trouser'),\n",
" ('trouser', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'van'),\n",
" ('van', 'heusen'),\n",
" ('heusen', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'formal'),\n",
" ('formal', 'shirt'),\n",
" ('shirt', 'van'),\n",
" ('van', 'heusen'),\n",
" ('heusen', 'shirt'),\n",
" ('shirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'van'),\n",
" ('van', 'heusen'),\n",
" ('heusen', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'regular'),\n",
" ('regular', 'fit'),\n",
" ('fit', 'solid'),\n",
" ('solid', 'formal'),\n",
" ('formal', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'shirt'),\n",
" ('shirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'maniac'),\n",
" ('maniac', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'solid'),\n",
" ('solid', 'v'),\n",
" ('v', 'neck'),\n",
" ('neck', 'shirt'),\n",
" ('shirt', 'maniac'),\n",
" ('maniac', 'tshirt'),\n",
" ('tshirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'maniac'),\n",
" ('maniac', 'men'),\n",
" ('men', 'grey'),\n",
" ('grey', 'solid'),\n",
" ('solid', 'v'),\n",
" ('v', 'neck'),\n",
" ('neck', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'tshirt'),\n",
" ('tshirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'clean'),\n",
" ('clean', 'look'),\n",
" ('look', 'stretchabl'),\n",
" ('stretchabl', 'crop'),\n",
" ('crop', 'jean'),\n",
" ('jean', 'jean'),\n",
" ('jean', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'men'),\n",
" ('men', 'blue'),\n",
" ('blue', 'slim'),\n",
" ('slim', 'fit'),\n",
" ('fit', 'mid'),\n",
" ('mid', 'rise'),\n",
" ('rise', 'clean'),\n",
" ('clean', 'look'),\n",
" ('look', 'stretchabl'),\n",
" ('stretchabl', 'crop'),\n",
" ('crop', 'jean'),\n",
" ('jean', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jean'),\n",
" ('jean', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'blackberri'),\n",
" ('blackberri', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'print'),\n",
" ('print', 'casual'),\n",
" ('casual', 'trouser'),\n",
" ('trouser', 'blackberri'),\n",
" ('blackberri', 'trouser'),\n",
" ('trouser', 'bottomwear'),\n",
" ('bottomwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'blackberri'),\n",
" ('blackberri', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'blue'),\n",
" ('blue', 'print'),\n",
" ('print', 'casual'),\n",
" ('casual', 'trouser'),\n",
" ('trouser', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'trouser'),\n",
" ('trouser', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'moda'),\n",
" ('moda', 'rapido'),\n",
" ('rapido', 'men'),\n",
" ('men', 'white'),\n",
" ('white', 'print'),\n",
" ('print', 'round'),\n",
" ('round', 'neck'),\n",
" ('neck', 'longlin'),\n",
" ('longlin', 'shirt'),\n",
" ('shirt', 'moda'),\n",
" ('moda', 'rapido'),\n",
" ('rapido', 'tshirt'),\n",
" ('tshirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'moda'),\n",
" ('moda', 'rapido'),\n",
" ('rapido', 'men'),\n",
" ('men', 'white'),\n",
" ('white', 'print'),\n",
" ('print', 'round'),\n",
" ('round', 'neck'),\n",
" ('neck', 'longlin'),\n",
" ('longlin', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'tshirt'),\n",
" ('tshirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'fort'),\n",
" ('fort', 'collin'),\n",
" ('collin', 'men'),\n",
" ('men', 'tan'),\n",
" ('tan', 'brown'),\n",
" ('brown', 'solid'),\n",
" ('solid', 'biker'),\n",
" ('biker', 'jacket'),\n",
" ('jacket', 'fort'),\n",
" ('fort', 'collin'),\n",
" ('collin', 'jacket'),\n",
" ('jacket', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'fort'),\n",
" ('fort', 'collin'),\n",
" ('collin', 'men'),\n",
" ('men', 'tan'),\n",
" ('tan', 'brown'),\n",
" ('brown', 'solid'),\n",
" ('solid', 'biker'),\n",
" ('biker', 'jacket'),\n",
" ('jacket', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jacket'),\n",
" ('jacket', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'fort'),\n",
" ('fort', 'collin'),\n",
" ('collin', 'men'),\n",
" ('men', 'rust'),\n",
" ('rust', 'brown'),\n",
" ('brown', 'solid'),\n",
" ('solid', 'biker'),\n",
" ('biker', 'jacket'),\n",
" ('jacket', 'fort'),\n",
" ('fort', 'collin'),\n",
" ('collin', 'jacket'),\n",
" ('jacket', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'fort'),\n",
" ('fort', 'collin'),\n",
" ('collin', 'men'),\n",
" ('men', 'rust'),\n",
" ('rust', 'brown'),\n",
" ('brown', 'solid'),\n",
" ('solid', 'biker'),\n",
" ('biker', 'jacket'),\n",
" ('jacket', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'jacket'),\n",
" ('jacket', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'esprit'),\n",
" ('esprit', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'white'),\n",
" ('white', 'stripe'),\n",
" ('stripe', 'round'),\n",
" ('round', 'neck'),\n",
" ('neck', 'shirt'),\n",
" ('shirt', 'esprit'),\n",
" ('esprit', 'tshirt'),\n",
" ('tshirt', 'topwear'),\n",
" ('topwear', 'apparel'),\n",
" ('apparel', 'apparel'),\n",
" ('apparel', 'men'),\n",
" ('men', 'buy'),\n",
" ('buy', 'esprit'),\n",
" ('esprit', 'men'),\n",
" ('men', 'navi'),\n",
" ('navi', 'white'),\n",
" ('white', 'stripe'),\n",
" ('stripe', 'round'),\n",
" ('round', 'neck'),\n",
" ('neck', 'shirt'),\n",
" ('shirt', 'onlin'),\n",
" ('onlin', 'india'),\n",
" ('india', 'buy'),\n",
" ('buy', 'tshirt'),\n",
" ('tshirt', 'best'),\n",
" ('best', 'price'),\n",
" ('price', 'u'),\n",
" ('u', 'polo'),\n",
" ('polo', 'assn'),\n",
" ('assn', 'men'),\n",
" ('men', 'oliv'),\n",
" ('oliv', 'green'),\n",
" ('green', 'regular'),\n",
" ...]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentences_to_pair"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx\n",
"\n",
"G = nx.DiGraph()\n",
"G.add_edges_from(sentences_to_pair)\n",
"G = G.to_undirected()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'networkx.classes.graph.Graph'>\n"
]
}
],
"source": [
"print(type(G))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Computing transition probabilities: 100%|██████████| 1508/1508 [00:08<00:00, 183.21it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Saved\n"
]
}
],
"source": [
"from node2vec import Node2Vec\n",
"node2vec = Node2Vec(G, dimensions=20, walk_length=16, num_walks=100, workers=2) \n",
"model = node2vec.fit(window=10, min_count=1) \n",
"print('Model Saved')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"assassin\n",
"junior\n",
"retro\n",
"rubber\n",
"digit\n",
"transpar\n",
"caffein\n",
"collect\n",
"creed\n"
]
}
],
"source": [
"for node, _ in model.most_similar('black'):\n",
" if len(node) > 3:\n",
" print(node)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tshirt\n",
"signatur\n",
"stardust\n",
"homm\n",
"price\n",
"tantra\n",
"naresh\n",
"statement\n"
]
}
],
"source": [
"for node, _ in model.most_similar('men'):\n",
" if len(node) > 3:\n",
" print(node)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"price\n",
"ivoc\n",
"guess\n",
"excalibur\n",
"dazzio\n",
"oxolloxo\n",
"dilling\n",
"rasm\n",
"dennison\n"
]
}
],
"source": [
"for node, _ in model.most_similar('buy'):\n",
" # Show only players\n",
" if len(node) > 3:\n",
" print(node)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bottomwear\n",
"loungewear\n",
"enfield\n",
"apparel\n",
"royal\n",
"mumf\n",
"alvaro\n",
"underwear\n",
"robe\n"
]
}
],
"source": [
"w1 = \"best\"\n",
"for node, _ in model.wv.most_similar (positive=w1, topn=10):\n",
" # Show only players\n",
" if len(node) > 3:\n",
" print(node)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\gensim\\models\\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n",
" vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"black\n"
]
}
],
"source": [
"print(model.wv.doesnt_match(\"men black jeans\".split()))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('pro', 1.9804141521453857),\n",
" ('nsw', 1.8536335229873657),\n",
" ('brt', 1.6897633075714111),\n",
" ('thoroughbr', 1.641608715057373),\n",
" ('fcb', 1.6144654750823975),\n",
" ('scoop', 1.5819679498672485),\n",
" ('cncpt', 1.5131385326385498),\n",
" ('spanish', 1.4589388370513916),\n",
" ('caffein', 1.4484915733337402),\n",
" ('stad', 1.440345287322998)]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.most_similar_cosmul(positive=['woman', 'black'], negative=['man'])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.99999994"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.similarity(w1=\"black\", w2=\"black\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.33650184"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.similarity(w1=\"black\", w2=\"brown\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'white'"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.doesnt_match([\"brown\", \"white\", \"black\"])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.32085222"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.similarity(w1=\"black\", w2=\"white\")"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.27853268"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.similarity(w1=\"white\", w2=\"brown\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"bottomwear\n",
"loungewear\n",
"enfield\n",
"apparel\n",
"royal\n",
"mumf\n",
"alvaro\n",
"underwear\n",
"robe\n"
]
}
],
"source": [
"for node, _ in model.most_similar('best'):\n",
" if len(node) > 3:\n",
" print(node)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8185"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"descriptions = data1[\"DESCRIPTION\"].unique().tolist()\n",
"len(descriptions)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"# shuffle customer ID's\n",
"random.shuffle(descriptions)\n",
"\n",
"# extract 90% of customer ID's\n",
"descriptions_train = [descriptions[i] for i in range(round(0.9*len(descriptions)))]\n",
"\n",
"# split data into train and validation set\n",
"train_df = data1[data1['DESCRIPTION'].isin(descriptions_train)]\n",
"validation_df = data1[~data1['DESCRIPTION'].isin(descriptions_train)]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 7366/7366 [00:16<00:00, 449.58it/s]\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"\n",
"# list to capture purchase history of the customers\n",
"products_train = []\n",
"\n",
"# populate the list with the product codes\n",
"for i in tqdm(descriptions_train):\n",
" temp = train_df[train_df[\"DESCRIPTION\"] == i][\"CATEGORY\"].tolist()\n",
" products_train.append(temp)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 819/819 [00:00<00:00, 893.95it/s]\n"
]
}
],
"source": [
"products_val = []\n",
"\n",
"# populate the list with the product codes\n",
"for i in tqdm(validation_df['DESCRIPTION'].unique()):\n",
" temp = validation_df[validation_df[\"DESCRIPTION\"] == i][\"CATEGORY\"].tolist()\n",
" products_val.append(temp)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" after removing the cwd from sys.path.\n"
]
}
],
"source": [
"products = data1[[\"CATEGORY\", \"NAME\"]]\n",
"\n",
"# remove duplicates\n",
"products.drop_duplicates(inplace=True, subset='CATEGORY', keep=\"last\")\n",
"\n",
"# create product-ID and product-description dictionary\n",
"products_dict = products.groupby('CATEGORY')['NAME'].apply(list).to_dict()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Navy Blue Checked Casual Jacket']"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# test the dictionary\n",
"products_dict['Men Jackets Coats']"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Wills Lifestyle Men Grey Regular Fit Woollen Solid Formal Trousers']"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products_dict['Men Formal Trousers']"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"def similar_products(v, n = 6):\n",
" \n",
" # extract most similar products for the input vector\n",
" ms = model.similar_by_vector(v, topn= n+1)[1:]\n",
" \n",
" # extract name and similarity score of the similar products\n",
" new_ms = []\n",
" for j in ms:\n",
" pair = (products_dict[j[0]][0], j[1])\n",
" new_ms.append(pair)\n",
" \n",
" return new_ms"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: DeprecationWarning: Call to deprecated `similar_by_vector` (Method will be removed in 4.0.0, use self.wv.similar_by_vector() instead).\n",
" after removing the cwd from sys.path.\n"
]
},
{
"ename": "KeyError",
"evalue": "'smag'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-44-b12c7e3e20d0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0msimilar_products\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'fort'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m<ipython-input-42-19f474f97eb7>\u001b[0m in \u001b[0;36msimilar_products\u001b[1;34m(v, n)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mnew_ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mms\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mpair\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mproducts_dict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mj\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mj\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[0mnew_ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpair\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mKeyError\u001b[0m: 'smag'"
]
}
],
"source": [
"similar_products(model['fort'])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"def aggregate_vectors(products):\n",
" product_vec = []\n",
" for i in products:\n",
" try:\n",
" product_vec.append(model[i])\n",
" except KeyError:\n",
" continue\n",
" \n",
" return np.mean(product_vec, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(products_val[0])"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:5: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" \"\"\"\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\numpy\\core\\fromnumeric.py:3118: RuntimeWarning: Mean of empty slice.\n",
" out=out, **kwargs)\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\numpy\\core\\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars\n",
" ret = ret.dtype.type(ret / rcount)\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: DeprecationWarning: Call to deprecated `similar_by_vector` (Method will be removed in 4.0.0, use self.wv.similar_by_vector() instead).\n",
" after removing the cwd from sys.path.\n"
]
},
{
"ename": "TypeError",
"evalue": "cannot unpack non-iterable numpy.float64 object",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-48-0643cd97cf20>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0msimilar_products\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maggregate_vectors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mproducts_val\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m<ipython-input-42-19f474f97eb7>\u001b[0m in \u001b[0;36msimilar_products\u001b[1;34m(v, n)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# extract most similar products for the input vector\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msimilar_by_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtopn\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mn\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;31m# extract name and similarity score of the similar products\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\gensim\\utils.py\u001b[0m in \u001b[0;36mnew_func1\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 1445\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1446\u001b[0m )\n\u001b[1;32m-> 1447\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1448\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1449\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mnew_func1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\gensim\\models\\base_any2vec.py\u001b[0m in \u001b[0;36msimilar_by_vector\u001b[1;34m(self, vector, topn, restrict_vocab)\u001b[0m\n\u001b[0;32m 1432\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1433\u001b[0m \"\"\"\n\u001b[1;32m-> 1434\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msimilar_by_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvector\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtopn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrestrict_vocab\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1435\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1436\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mdeprecated\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Method will be removed in 4.0.0, use self.wv.doesnt_match() instead\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\gensim\\models\\keyedvectors.py\u001b[0m in \u001b[0;36msimilar_by_vector\u001b[1;34m(self, vector, topn, restrict_vocab)\u001b[0m\n\u001b[0;32m 620\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 621\u001b[0m \"\"\"\n\u001b[1;32m--> 622\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmost_similar\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpositive\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mvector\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtopn\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtopn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrestrict_vocab\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrestrict_vocab\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 623\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 624\u001b[0m @deprecated(\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\gensim\\models\\keyedvectors.py\u001b[0m in \u001b[0;36mmost_similar\u001b[1;34m(self, positive, negative, topn, restrict_vocab, indexer)\u001b[0m\n\u001b[0;32m 547\u001b[0m \u001b[1;31m# compute the weighted average of all words\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 548\u001b[0m \u001b[0mall_words\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmean\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 549\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mword\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mweight\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mpositive\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mnegative\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 550\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mndarray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 551\u001b[0m \u001b[0mmean\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mweight\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: cannot unpack non-iterable numpy.float64 object"
]
}
],
"source": [
"similar_products(aggregate_vectors(products_val[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment