{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# <center> PRAPROSES DAN POS-TAGGING HMM-VITERBI </center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Import Library NLTK & String"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import string"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Import Dataset Pernyataan Kebutuhan Perangkat Lunak"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sisfo ibadah haji\n",
      "Requirements/sisfo ibadah haji.txt\n"
     ]
    }
   ],
   "source": [
    "# Ask for the dataset name (the file name without \".txt\") inside Requirements/.\n",
    "# A prompt tells the user what is expected, and surrounding whitespace is\n",
    "# stripped so a stray space cannot end up in the file path.\n",
    "inputname = input(\"Dataset name (without .txt): \").strip()\n",
    "nameFile = \"Requirements/\"+inputname+\".txt\"\n",
    "print(nameFile)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Sentence Splitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def openFile(nameFile):\n",
    "    \"\"\"Read a UTF-8 text file and return its lines as a list.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    nameFile : str\n",
    "        Path of the file to read.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        One entry per line, in file order; each entry keeps its line ending.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    OSError\n",
    "        If the file cannot be opened.\n",
    "    \"\"\"\n",
    "    # The context manager closes the file even when reading fails part-way.\n",
    "    with open(nameFile, encoding=\"utf-8\") as req:\n",
    "        return req.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "ename": "IndexError",
     "evalue": "list index out of range",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-4-0af78ebdee88>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mall_req\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopenFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnameFile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mall_req\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m35\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mIndexError\u001b[0m: list index out of range"
     ]
    }
   ],
   "source": [
    "# Load the requirement statements: one list entry per line of the chosen file.\n",
    "all_req = openFile(nameFile)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Tokenisasi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenise every requirement line with NLTK; one token list per line.\n",
    "tokenizationpreProcess = [nltk.word_tokenize(kalimat) for kalimat in all_req]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Remove Punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "punct = ['.','!','?',':']\n",
    "# Drop the listed punctuation tokens; all other tokens keep their order.\n",
    "tokenization = [\n",
    "    [token for token in requirements if token not in punct]\n",
    "    for requirements in tokenizationpreProcess\n",
    "]\n",
    "\n",
    "# Preview one tokenised sentence. The index is clamped to the last sentence so\n",
    "# that an input with fewer than 36 lines does not raise IndexError.\n",
    "previewIndex = 35\n",
    "if tokenization:\n",
    "    print(tokenization[min(previewIndex, len(tokenization)-1)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Karakteristik Dataset (opsional)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bagofWords(req, wordCount):\n",
    "    \"\"\"Record one occurrence of ``req`` in ``wordCount`` and return the dict.\n",
    "\n",
    "    ``wordCount`` maps a word to its running count and is updated in place;\n",
    "    a word seen for the first time starts at 1.\n",
    "    \"\"\"\n",
    "    wordCount[req] = wordCount.get(req, 0) + 1\n",
    "    return wordCount"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word-frequency table and total token count of the cleaned dataset.\n",
    "wordCount = {}\n",
    "# Start at zero so the total equals the number of tokens counted below.\n",
    "totalWord = 0\n",
    "for reqs in tokenization:\n",
    "    for req in reqs:\n",
    "        bagofWords(req,wordCount)\n",
    "        totalWord +=1\n",
    "print(wordCount)\n",
    "print(totalWord)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. POS-TAGGING HMM-VITERBI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def frekuensi(korpus):\n",
    "    \"\"\"Count tags, (word, tag) pairs and tag transitions in an annotated corpus.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    korpus : list of str\n",
    "        Corpus lines. A line starting with '<kalimat' opens a sentence, a line\n",
    "        starting with '</kalimat' closes it, and any other line is read as\n",
    "        tab-separated fields whose first two are the word and its tag.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of (dict, dict, dict)\n",
    "        tagCount maps tag -> occurrences ('<start>' counts opened sentences),\n",
    "        wordTag maps (word, tag) -> occurrences, and tagTrans maps\n",
    "        (previous_tag, tag) -> occurrences, with '<start>' as the previous tag\n",
    "        of the first word of a sentence.\n",
    "    \"\"\"\n",
    "    tagCount = {'<start>': 0}\n",
    "    wordTag = {}\n",
    "    tagTrans = {}\n",
    "    prevTag = '<start>'\n",
    "\n",
    "    # A single pass over the lines cannot run past the end of the list, so a\n",
    "    # corpus whose last sentence lacks its closing line is still counted.\n",
    "    for line in korpus:\n",
    "        if line.startswith('</kalimat'):\n",
    "            prevTag = '<start>'\n",
    "            continue\n",
    "        if line.startswith('<kalimat'):\n",
    "            tagCount['<start>'] += 1\n",
    "            prevTag = '<start>'\n",
    "            continue\n",
    "\n",
    "        korpusPart = line.split('\\t')\n",
    "        if len(korpusPart) < 2:\n",
    "            # Blank or malformed line: there is no tag to count.\n",
    "            continue\n",
    "        word, tag = korpusPart[0], korpusPart[1]\n",
    "\n",
    "        tagCount[tag] = tagCount.get(tag, 0) + 1\n",
    "        wordTag[(word, tag)] = wordTag.get((word, tag), 0) + 1\n",
    "        tagTrans[(prevTag, tag)] = tagTrans.get((prevTag, tag), 0) + 1\n",
    "        prevTag = tag\n",
    "\n",
    "    return tagCount, wordTag, tagTrans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dictProbTransition(tagTrans, tagCount):\n",
    "    \"\"\"Turn tag-transition counts into relative frequencies.\n",
    "\n",
    "    Every (previous_tag, tag) count in ``tagTrans`` is divided by\n",
    "    ``tagCount[previous_tag]``; the result is keyed by the same pairs.\n",
    "    \"\"\"\n",
    "    return {\n",
    "        (pair[0], pair[1]): count / tagCount[pair[0]]\n",
    "        for pair, count in tagTrans.items()\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dictProbEmission(wordTag, tagCount):\n",
    "    \"\"\"Turn (word, tag) counts into relative frequencies.\n",
    "\n",
    "    Every (word, tag) count in ``wordTag`` is divided by ``tagCount[tag]``;\n",
    "    the result is keyed by the same pairs.\n",
    "    \"\"\"\n",
    "    probabilityEmission = {}\n",
    "    for pair, count in wordTag.items():\n",
    "        probabilityEmission[(pair[0], pair[1])] = count / tagCount[pair[1]]\n",
    "    return probabilityEmission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('Hasil Tag\\Tag Korpus anotasi.txt', encoding='utf-8') as file:\n",
    "    korpus = file.readlines()\n",
    "korpus = [req.strip() for req in korpus]\n",
    "tagCount, wordTag, tagTrans = frekuensi(korpus)\n",
    "print(wordTag)\n",
    "# # print(tagCount)\n",
    "# # print(tagTrans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the raw corpus counts into the transition and emission tables.\n",
    "probabilityTrans = dictProbTransition(tagTrans,tagCount)\n",
    "probabilityEmission = dictProbEmission(wordTag, tagCount)\n",
    "\n",
    "print(probabilityTrans)\n",
    "# print(probabilityEmission)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count, for every tag, how many distinct (word, tag) pairs the corpus holds.\n",
    "modelTag = {}\n",
    "for reqs in wordTag:\n",
    "    modelTag[reqs[1]] = modelTag.get(reqs[1], 0) + 1\n",
    "\n",
    "# Vocabulary in first-seen order. dict.fromkeys removes duplicates in linear\n",
    "# time instead of scanning the growing list once per pair.\n",
    "modelKata = list(dict.fromkeys(reqs[0] for reqs in wordTag))\n",
    "\n",
    "# Tag paired with the largest number of distinct words.\n",
    "tagMax = max(modelTag, key=modelTag.get)\n",
    "print(modelKata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def findTag(probabilityEmission, reqWords):\n",
    "    \"\"\"Return the tags paired with the word ``reqWords``.\n",
    "\n",
    "    The keys of ``probabilityEmission`` are (word, tag) pairs; the tag of every\n",
    "    key whose word equals ``reqWords`` is collected in key order.\n",
    "    \"\"\"\n",
    "    return [pair[1] for pair in probabilityEmission if reqWords == pair[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def viterbi(probabilityTransition, probabilityEmission, reqWords, tagCount, tagMax, modelKata):\n",
    "    \"\"\"Return the most probable tag sequence for one tokenised sentence (Viterbi decoding).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    probabilityTransition : dict\n",
    "        Maps (previous_tag, tag) to a transition probability; '<start>' is the\n",
    "        previous tag of the first word. Missing pairs count as probability 0.\n",
    "    probabilityEmission : dict\n",
    "        Maps (word, tag) to an emission probability. Missing pairs count as 0.\n",
    "        Updated in place: every word not in modelKata gets an entry for tagMax.\n",
    "    reqWords : list of str\n",
    "        Tokens of the sentence to tag.\n",
    "    tagCount : dict\n",
    "        Maps tag to its corpus frequency; only tagCount[tagMax] is read.\n",
    "    tagMax : str\n",
    "        Fallback tag given to words that are not in modelKata.\n",
    "    modelKata : list of str\n",
    "        Known vocabulary. Updated in place: unknown words are appended.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        One tag per token of reqWords; an empty list for an empty sentence.\n",
    "    \"\"\"\n",
    "    if not reqWords:\n",
    "        return []\n",
    "\n",
    "    # Unknown words can only be emitted by the fallback tag, with weight 1/count(tagMax).\n",
    "    for word in reqWords:\n",
    "        if word not in modelKata:\n",
    "            modelKata.append(word)\n",
    "            probabilityEmission[(word, tagMax)] = 1/tagCount[tagMax]\n",
    "\n",
    "    # Candidate tags: every tag seen with any word of the sentence. Sorting makes\n",
    "    # tie-breaking reproducible (a plain set has no stable order between runs).\n",
    "    sentenceWords = set(reqWords)\n",
    "    tagArray = sorted({pair[1] for pair in probabilityEmission if pair[0] in sentenceWords})\n",
    "    if not tagArray:\n",
    "        # Words listed in modelKata but absent from probabilityEmission.\n",
    "        return [tagMax] * len(reqWords)\n",
    "\n",
    "    # Both lookups read the tables passed as arguments, never notebook globals.\n",
    "    def transition(prevTag, tag):\n",
    "        return probabilityTransition.get((prevTag, tag), 0)\n",
    "\n",
    "    def emission(word, tag):\n",
    "        return probabilityEmission.get((word, tag), 0)\n",
    "\n",
    "    # Columns are indexed by word position, not by the word itself, so a word that\n",
    "    # occurs more than once in the sentence keeps separate scores and back-pointers.\n",
    "    scores = [{tag: emission(reqWords[0], tag) * transition('<start>', tag) for tag in tagArray}]\n",
    "    backPointers = [{}]\n",
    "    for i in range(1, len(reqWords)):\n",
    "        previous = scores[i-1]\n",
    "        column = {}\n",
    "        pointers = {}\n",
    "        for tag in tagArray:\n",
    "            bestPrev = max(tagArray, key=lambda prevTag: previous[prevTag] * transition(prevTag, tag))\n",
    "            column[tag] = previous[bestPrev] * transition(bestPrev, tag) * emission(reqWords[i], tag)\n",
    "            pointers[tag] = bestPrev\n",
    "        scores.append(column)\n",
    "        backPointers.append(pointers)\n",
    "\n",
    "    # Start from the best final tag and follow the back-pointers to the first word.\n",
    "    tags = [max(tagArray, key=lambda tag: scores[-1][tag])]\n",
    "    for i in range(len(reqWords)-1, 0, -1):\n",
    "        tags.insert(0, backPointers[i][tags[0]])\n",
    "    return tags"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Hasil Pemberian Tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every tokenised requirement; results keep the order of `tokenization`.\n",
    "viterbiResult = [\n",
    "    viterbi(probabilityTrans, probabilityEmission, kalimat, tagCount, tagMax, modelKata)\n",
    "    for kalimat in tokenization\n",
    "]\n",
    "\n",
    "# Show each sentence number next to its tag sequence.\n",
    "for nomor, hasil in enumerate(viterbiResult):\n",
    "    print(nomor, hasil)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Transformasi Data Hasil Tag ke Dokumen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the tagged sentences in the corpus layout: a <kalimat id=N> line, one\n",
    "# word<TAB>tag line per token, then a </kalimat id=N> line.\n",
    "saveFile=\"Hasil Tag/Tag \"+inputname+\".txt\"\n",
    "# The with-block closes the file on exit, so no explicit close() is needed;\n",
    "# plain write mode is enough because the file is never read back here.\n",
    "with open(saveFile,\"w\",encoding='utf-8') as file:\n",
    "    for i, (words, tags) in enumerate(zip(tokenization, viterbiResult)):\n",
    "        file.write(\"<kalimat id=\"+str(i)+\">\"+\"\\n\")\n",
    "        for word, tag in zip(words, tags):\n",
    "            file.write(word+\"\\t\"+tag+\"\\n\")\n",
    "        file.write(\"</kalimat id=\"+str(i)+\">\"+\"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}