" <td>Hanes Charcoal Grey Thermal T Shirt, Hanes, T...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14859</td>\n",
" <td>Hancock Men Blue Regular Fit Striped Formal Shirt</td>\n",
" <td>Formal Shirts</td>\n",
" <td>Hancock Men Blue Regular Fit Striped Formal Sh...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NAME \\\n",
"SERIAL NO \n",
"25553 Fort Collins Men Red Solid Padded Jacket \n",
"18640 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... \n",
"18543 Arrow Men Navy Blue Tapered Fit Checked Formal... \n",
"21475 Hanes Charcoal Grey Thermal T-Shirt \n",
"14859 Hancock Men Blue Regular Fit Striped Formal Shirt \n",
"\n",
" CATEGORY \\\n",
"SERIAL NO \n",
"25553 Men Jackets Coats \n",
"18640 Men Formal Trousers \n",
"18543 Men Formal Trousers \n",
"21475 Innerwear & Sleapwear \n",
"14859 Formal Shirts \n",
"\n",
" DESCRIPTION \n",
"SERIAL NO \n",
"25553 Fort Collins Men Red Solid Padded Jacket, For... \n",
"18640 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... \n",
"18543 Arrow Men Navy Blue Tapered Fit Checked Formal... \n",
"21475 Hanes Charcoal Grey Thermal T Shirt, Hanes, T... \n",
"14859 Hancock Men Blue Regular Fit Striped Formal Sh... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data1.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the DESCRIPTION column and renumber it from 0, discarding the original index.\n",
"text_total = data1[\"DESCRIPTION\"].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 Fort Collins Men Red Solid Padded Jacket, For...\n",
"1 MANGO MAN Men Navy Blue Tailored Slim Fit Soli...\n",
"2 Arrow Men Navy Blue Tapered Fit Checked Formal...\n",
"3 Hanes Charcoal Grey Thermal T Shirt, Hanes, T...\n",
"4 Hancock Men Blue Regular Fit Striped Formal Sh...\n",
"Name: DESCRIPTION, dtype: object\n"
]
}
],
"source": [
"print(text_total.head())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:17: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
"['fort collin men red solid pad jacket fort collin jacket topwear apparel apparel men buy fort collin men red solid pad jacket onlin india buy jacket best price', 'mango man men navi blue tailor slim fit solid formal trouser mango man trouser bottomwear apparel apparel men buy mango man men navi blue tailor slim fit solid formal trouser onlin india buy trouser best price', 'arrow men navi blue taper fit check formal trouser arrow trouser bottomwear apparel apparel men buy arrow men navi blue taper fit check formal trouser onlin india buy trouser best price', 'hane charcoal grey thermal shirt hane thermal top innerwear apparel apparel men buy hane charcoal grey thermal shirt onlin india buy thermal top best price', 'hancock men blue regular fit stripe formal shirt hancock shirt topwear apparel apparel men buy hancock men blue regular fit stripe formal shirt onlin india buy shirt best price']\n"
]
}
],
"source": [
"# apply it to our text data \n",
"# the texts are the product descriptions held in text_total (the DESCRIPTION column of data1)\n",
"processed_wmn = [review_to_words(str(text)) for text in text_total]\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"assassin\n",
"junior\n",
"retro\n",
"rubber\n",
"digit\n",
"transpar\n",
"caffein\n",
"collect\n",
"creed\n"
]
}
],
"source": [
"# Print the nearest neighbours of 'black' whose token is longer than three characters.\n",
"# Query through model.wv: most_similar on the model itself is deprecated (removed in gensim 4.0.0).\n",
"for node, _ in model.wv.most_similar('black'):\n",
"    if len(node) > 3:\n",
"        print(node)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tshirt\n",
"signatur\n",
"stardust\n",
"homm\n",
"price\n",
"tantra\n",
"naresh\n",
"statement\n"
]
}
],
"source": [
"# Print the nearest neighbours of 'men' whose token is longer than three characters.\n",
"# Query through model.wv: most_similar on the model itself is deprecated (removed in gensim 4.0.0).\n",
"for node, _ in model.wv.most_similar('men'):\n",
"    if len(node) > 3:\n",
"        print(node)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"price\n",
"ivoc\n",
"guess\n",
"excalibur\n",
"dazzio\n",
"oxolloxo\n",
"dilling\n",
"rasm\n",
"dennison\n"
]
}
],
"source": [
"# Print the nearest neighbours of 'buy' whose token is longer than three characters.\n",
"# Query through model.wv: most_similar on the model itself is deprecated (removed in gensim 4.0.0).\n",
"for node, _ in model.wv.most_similar('buy'):\n",
"    if len(node) > 3:\n",
"        print(node)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bottomwear\n",
"loungewear\n",
"enfield\n",
"apparel\n",
"royal\n",
"mumf\n",
"alvaro\n",
"underwear\n",
"robe\n"
]
}
],
"source": [
"w1 = \"best\"\n",
"# Walk the ten nearest neighbours of w1 and print those longer than three characters.\n",
"for node, _ in model.wv.most_similar(positive=w1, topn=10):\n",
"    if len(node) <= 3:\n",
"        continue\n",
"    print(node)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\gensim\\models\\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n",
" vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"black\n"
]
}
],
"source": [
"print(model.wv.doesnt_match(\"men black jeans\".split()))"
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
"['Wills Lifestyle Men Grey Regular Fit Woollen Solid Formal Trousers']"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products_dict['Men Formal Trousers']"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"def similar_products(v, n = 6):\n",
"    \"\"\"Return product names and similarity scores for the neighbours of a vector.\n",
"\n",
"    v: query vector handed to the model's similar_by_vector lookup.\n",
"    n: maximum number of neighbours to return (default 6).\n",
"\n",
"    Returns a list of (name, similarity) tuples, where name is the first entry\n",
"    stored in products_dict under the neighbour's key. A neighbour key that is\n",
"    missing from products_dict raises KeyError (assuming products_dict is a\n",
"    plain dict).\n",
"    \"\"\"\n",
"    # Query through model.wv: calling similar_by_vector on the model itself is\n",
"    # deprecated and is removed in gensim 4.0.0.\n",
"    # One extra neighbour is requested and the top hit discarded; presumably the\n",
"    # top hit is the query item itself -- TODO confirm for aggregated vectors.\n",
"    neighbours = model.wv.similar_by_vector(v, topn=n + 1)[1:]\n",
"\n",
"    # products_dict values are lists; only the first entry of each is reported.\n",
"    return [(products_dict[key][0], score) for key, score in neighbours]"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" \"\"\"Entry point for launching an IPython kernel.\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: DeprecationWarning: Call to deprecated `similar_by_vector` (Method will be removed in 4.0.0, use self.wv.similar_by_vector() instead).\n",
"\u001b[1;32m<ipython-input-44-b12c7e3e20d0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0msimilar_products\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'fort'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:5: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" \"\"\"\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\numpy\\core\\fromnumeric.py:3118: RuntimeWarning: Mean of empty slice.\n",
" out=out, **kwargs)\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\numpy\\core\\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars\n",
" ret = ret.dtype.type(ret / rcount)\n",
"C:\\Users\\Agusti Frananda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: DeprecationWarning: Call to deprecated `similar_by_vector` (Method will be removed in 4.0.0, use self.wv.similar_by_vector() instead).\n",
"\u001b[1;32m<ipython-input-48-0643cd97cf20>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0msimilar_products\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maggregate_vectors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mproducts_val\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m<ipython-input-42-19f474f97eb7>\u001b[0m in \u001b[0;36msimilar_products\u001b[1;34m(v, n)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# extract most similar products for the input vector\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msimilar_by_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtopn\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mn\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;31m# extract name and similarity score of the similar products\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",