{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Kelompok 3 | Search Engine with Inverted Index Simulator Based on Billboard Songs Collection\n",
    " - 12S16003 Maria H. Siallagan\n",
    " - 12S16026 Yolanda Nainggolan\n",
    " - 12S16036 Prima Hutapea\n",
    " - 12S16049 Rosa Delima Mendrofa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import re\n",
    "import string\n",
    "import xml.dom.minidom as minidom\n",
    "\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import PorterStemmer\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path of the XML song collection, relative to the notebook.\n",
    "DATASET_PATH = \"dataset_STBI.xml\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dcmnt_xml = minidom.parse(DATASET_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')\n",
    "# NOTE: the next two names are kept unchanged for compatibility;\n",
    "# all_profile holds the SONG nodes and all_date holds the ARTIST nodes.\n",
    "all_profile = dcmnt_xml.getElementsByTagName('SONG')\n",
    "all_date = dcmnt_xml.getElementsByTagName('ARTIST')\n",
    "all_text = dcmnt_xml.getElementsByTagName('LYRICS')\n",
    "all_pub = dcmnt_xml.getElementsByTagName('PUB')\n",
    "all_page = dcmnt_xml.getElementsByTagName('PAGE')\n",
    "\n",
    "N_DOC_sample = len(all_doc_no)\n",
    "# The later cells used to loop over `N_DOC`, which was never defined and\n",
    "# raised NameError on a fresh kernel. Expose it as an alias so both names work.\n",
    "N_DOC = N_DOC_sample\n",
    "\n",
    "# Documents are paired with their lyrics by position, so every DOCNO\n",
    "# needs a LYRICS element at the same index.\n",
    "assert len(all_text) >= N_DOC, 'fewer LYRICS elements than DOCNO elements'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def node_text(node):\n",
    "    \"\"\"Return the data of an XML element's first child, or '' if the element has no children.\"\"\"\n",
    "    child = node.firstChild\n",
    "    if child is None:\n",
    "        return ''\n",
    "    return child.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Document identifiers, in collection order (used as keys of the index).\n",
    "doc_ids = [all_doc_no[i].firstChild.data for i in range(N_DOC)]\n",
    "\n",
    "# One lyrics string per document; an empty <LYRICS/> element yields ' '\n",
    "# instead of raising AttributeError.\n",
    "all_sentence_doc_sample = [' ' + node_text(all_text[i]) for i in range(N_DOC)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lines that start with an http(s) URL are dropped entirely.\n",
    "URL_LINE_PATTERN = re.compile(r'^https?:\\/\\/.*[\\r\\n]*', flags=re.MULTILINE)\n",
    "# Maps every ASCII punctuation character to a single space.\n",
    "PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))\n",
    "# scikit-learn's default word tokenizer, built once and reused.\n",
    "WORD_TOKENIZER = CountVectorizer().build_tokenizer()\n",
    "\n",
    "\n",
    "def remove_punc_tokenize(sentence):\n",
    "    \"\"\"Strip URL lines and punctuation from `sentence` and split it into word tokens.\n",
    "\n",
    "    URL lines are removed *before* punctuation: once ':' and '/' have been\n",
    "    replaced by spaces the URL pattern can no longer match.\n",
    "\n",
    "    Returns a list of tokens in their original order and original case.\n",
    "    \"\"\"\n",
    "    sentence = URL_LINE_PATTERN.sub('', sentence)\n",
    "    sentence = sentence.translate(PUNCTUATION_TO_SPACE)\n",
    "    return list(WORD_TOKENIZER(sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens_raw = [remove_punc_tokenize(sentence) for sentence in all_sentence_doc_sample]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "\n",
    "def stop_word_token(tokens):\n",
    "    \"\"\"Return `tokens` without English stop words.\n",
    "\n",
    "    The comparison is case-insensitive: the tokenizer keeps the original\n",
    "    case, so a capitalised stop word such as 'The' would otherwise survive.\n",
    "    \"\"\"\n",
    "    return [w for w in tokens if w.lower() not in stop_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens_no_stop = [stop_word_token(tokens) for tokens in tokens_raw]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discard every token that contains at least one digit.\n",
    "tokens_no_digit = [\n",
    "    [w for w in tokens if not any(ch.isdigit() for ch in w)]\n",
    "    for tokens in tokens_no_stop\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stemmer = PorterStemmer()\n",
    "\n",
    "\n",
    "def stemming(tokens):\n",
    "    \"\"\"Return a new list in which every token is replaced by its Porter stem.\n",
    "\n",
    "    The input list is left untouched and each token is stemmed exactly once.\n",
    "    \"\"\"\n",
    "    return [stemmer.stem(w) for w in tokens]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final token list per document; positions in the index refer to these lists.\n",
    "tokens_doc = [stemming(tokens) for tokens in tokens_no_digit]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vocabulary: every distinct token that occurs in any document.\n",
    "all_tokens = {token for tokens in tokens_doc for token in tokens}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Positional (proximity) index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the index in a single pass over the corpus instead of rescanning\n",
    "# every document for every vocabulary token.\n",
    "# Structure: {token: {DOCNO: [1-based positions of the token in that document]}}\n",
    "proximity_index = {}\n",
    "for doc_id, tokens in zip(doc_ids, tokens_doc):\n",
    "    for position, token in enumerate(tokens, start=1):\n",
    "        proximity_index.setdefault(token, {}).setdefault(doc_id, []).append(position)\n",
    "\n",
    "assert set(proximity_index) == all_tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort the index alphabetically by token and display it.\n",
    "proximity_index = collections.OrderedDict(sorted(proximity_index.items()))\n",
    "for key, value in proximity_index.items():\n",
    "    print(key, value)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}