{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Kelompok 3 | Search Engine with Inverted Index Simulator Based on Billboard Songs Collection\n",
    "    - 12S16003 Maria H. Siallagan\n",
    "    - 12S16026 Yolanda Nainggolan\n",
    "    - 12S16036 Prima Hutapea\n",
    "    - 12S16049 Rosa Delima Mendrofa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "import re\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import xml.dom.minidom as minidom\n",
    "dcmnt_xml = minidom.parse(\"dataset_STBI.xml\")"
   ]
  },
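  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check on the parse: print the root tag and the record count. This assumes `dataset_STBI.xml` wraps each song in a record carrying `DOCNO`, `SONG`, `ARTIST`, and `LYRICS` tags, as the extraction cell below expects."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: inspect the parsed DOM (the tag layout is an assumption)\n",
    "print(dcmnt_xml.documentElement.tagName)\n",
    "print(len(dcmnt_xml.getElementsByTagName('DOCNO')), 'documents')"
   ]
  },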
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')\n",
    "all_profile = dcmnt_xml.getElementsByTagName('SONG')\n",
    "all_date = dcmnt_xml.getElementsByTagName('ARTIST')\n",
    "all_text = dcmnt_xml.getElementsByTagName('LYRICS')\n",
    "all_pub = dcmnt_xml.getElementsByTagName('PUB')\n",
    "all_page = dcmnt_xml.getElementsByTagName('PAGE')\n",
    "\n",
    "N_DOC_sample = len(all_doc_no)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_sentence_doc_sample = []\n",
    "for i in range(N_DOC_sample):\n",
    "    sentence_doc_sample = ' '+ all_text[i].firstChild.data\n",
    "    all_sentence_doc_sample.append(sentence_doc_sample)"
   ]
  },
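  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a spot check, preview the first document's number and the opening of its lyrics (the 80-character cut-off is arbitrary):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first record before any preprocessing\n",
    "print(all_doc_no[0].firstChild.data)\n",
    "print(all_sentence_doc[0][:80], '...')"
   ]
  },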
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocessing "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens_doc = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_punc_tokenize(sentence):\n",
    "    tokens = []\n",
    "    for punctuation in string.punctuation:\n",
    "        sentence = sentence.replace(punctuation,\" \")\n",
    "    \n",
    "    sentence = re.sub(r'^https?:\\/\\/.*[\\r\\n]*', '', sentence, flags=re.MULTILINE)\n",
    "    for w in CountVectorizer().build_tokenizer()(sentence):\n",
    "        tokens.append(w)\n",
    "    return tokens"
   ]
  },
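  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the tokenizer on a made-up lyric line (illustrative input, not from the dataset): the URL is dropped, punctuation becomes whitespace, and scikit-learn's default pattern keeps only tokens of two or more word characters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative input, not from the dataset\n",
    "sample = \"Hey, hey! Baby don't go -- see https://example.com/lyrics\"\n",
    "print(remove_punc_tokenize(sample))\n",
    "# expected: ['Hey', 'hey', 'Baby', 'don', 'go', 'see']"
   ]
  },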
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(N_DOC):\n",
    "    tokens_doc.append(remove_punc_tokenize(all_sentence_doc_sample[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words('english'))\n",
    "def stop_word_token(tokens):\n",
    "    tokens = [w for w in tokens if not w in stop_words]\n",
    "    return tokens\n",
    "\n",
    "for i in range(N_DOC):\n",
    "    tokens_doc[i] = stop_word_token(tokens_doc[i])"
   ]
  },
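  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For example, NLTK's English stop list contains function words such as 'the', 'we', and 'a', so only the content words survive:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative token list, not from the dataset\n",
    "print(stop_word_token(['the', 'night', 'we', 'met', 'a', 'love', 'song']))\n",
    "# expected: ['night', 'met', 'love', 'song']"
   ]
  },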
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(N_DOC):\n",
    "    tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])"
   ]
  },
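  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The digit filter removes any token containing a digit outright, e.g.:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokens such as 'track01' or '1999' are removed entirely\n",
    "print([w for w in ['love', 'track01', '1999', 'baby'] if not any(c.isdigit() for c in w)])\n",
    "# expected: ['love', 'baby']"
   ]
  },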
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import PorterStemmer\n",
    "stemmer = PorterStemmer()\n",
    "def stemming(tokens):\n",
    "    for i in range(0, len(tokens)):\n",
    "        if (tokens[i] != stemmer.stem(tokens[i])):\n",
    "            tokens[i] = stemmer.stem(tokens[i])\n",
    "    return tokens\n",
    "\n",
    "\n",
    "for i in range(N_DOC):\n",
    "    tokens_doc[i] = stemming(tokens_doc[i])"
   ]
  },
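  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Porter stemming conflates inflected forms; note that stems need not be dictionary words:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative words, not from the dataset\n",
    "for w in ['loving', 'loved', 'dances', 'dancing']:\n",
    "    print(w, '->', stemmer.stem(w))\n",
    "# loving -> love, loved -> love, dances -> danc, dancing -> danc"
   ]
  },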
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_tokens = []\n",
    "for i in range(N_DOC):\n",
    "    for w in tokens_doc[i]:\n",
    "        all_tokens.append(w)\n",
    "\n",
    "new_sentence = ' '.join([w for w in all_tokens])\n",
    "\n",
    "for w in CountVectorizer().build_tokenizer()(new_sentence):\n",
    "    all_tokens.append(w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_tokens = set(all_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import count\n",
    "try: \n",
    "    from itertools import izip as zip\n",
    "except ImportError:\n",
    "    pass\n",
    "proximity_index = {}\n",
    "for token in all_tokens:\n",
    "    dict_doc_position = {}\n",
    "    for n in range(N_DOC):\n",
    "        if(token in tokens_doc[n]):\n",
    "            dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]\n",
    "    proximity_index[token] = dict_doc_position"
   ]
  },
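  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each entry maps a term to the documents containing it and the 1-based positions where it occurs, e.g. `proximity_index['love']` might look like `{'D1': [3, 17], 'D4': [5]}` (illustrative values). A small helper (our own addition, not part of the assignment spec) stems the query term before lookup so queries match the stemmed vocabulary:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical helper: fetch the posting list for a raw query term\n",
    "def lookup(term):\n",
    "    return proximity_index.get(stemmer.stem(term.lower()), {})\n",
    "\n",
    "print(lookup('love'))  # {} if the term is absent from the collection"
   ]
  },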
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "proximity_index = collections.OrderedDict(sorted(proximity_index.items()))\n",
    "for key, value in proximity_index.items():\n",
    "    print (key, value)"
   ]
  },
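  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The positional information is what enables phrase queries: two terms form a phrase wherever one occurs at the position immediately after the other in the same document. A minimal sketch, assuming the `lookup` helper above (the function name and example query are our own). Note that stop-word removal shifts positions, so adjacency here means adjacency in the filtered token stream, not in the raw lyrics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical phrase search over the positional index:\n",
    "# a document matches if term2 occurs at p + 1 for some position p of term1\n",
    "def phrase_search(term1, term2):\n",
    "    p1, p2 = lookup(term1), lookup(term2)\n",
    "    hits = {}\n",
    "    for doc in p1.keys() & p2.keys():\n",
    "        starts = [p for p in p1[doc] if p + 1 in p2[doc]]\n",
    "        if starts:\n",
    "            hits[doc] = starts\n",
    "    return hits\n",
    "\n",
    "print(phrase_search('broken', 'heart'))"
   ]
  }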
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}