{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#
PRAPROSES DAN POS-TAGGING HMM-VITERBI
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Import Library NLTK & String" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "import string" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Import Dataset Pernyataan Kebutuhan Perangkat Lunak" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sisfo ibadah haji\n", "Requirements/sisfo ibadah haji.txt\n" ] } ], "source": [ "inputname = input()\n", "nameFile = \"Requirements/\"+inputname+\".txt\"\n", "print(nameFile)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Sentence Splitting" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def openFile(nameFile):\n", " all_req = []\n", " req = open(nameFile, encoding=\"utf-8\")\n", " data_req = req.readlines()\n", " for i in range(len(data_req)):\n", " all_req.append(data_req[i])\n", " req.close()\n", " return all_req" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "IndexError", "evalue": "list index out of range", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mall_req\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopenFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnameFile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mall_req\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m35\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mIndexError\u001b[0m: list index out of range" ] } ], "source": [ 
"all_req = openFile(nameFile)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Tokenisasi" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizationpreProcess = []\n", "for token in all_req:\n", " tokenizationpreProcess.append(nltk.word_tokenize(token))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Remove Punctuation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenization = []\n", "punct = ['.','!','?',':']\n", "for requirements in tokenizationpreProcess:\n", " tokenizationKalimat1=[req for req in requirements if req not in punct]\n", " tokenization.append(tokenizationKalimat1)\n", "print(tokenization[35])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Karakteristik Dataset (opsional)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def bagofWords(req, wordCount):\n", " if req in wordCount:\n", " wordCount[req] += 1\n", " else:\n", " wordCount[req] = 1\n", " return wordCount" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wordCount = {}\n", "totalWord = 1\n", "for reqs in tokenization:\n", " for req in reqs:\n", " bagofWords(req,wordCount)\n", " totalWord +=1\n", "print(wordCount)\n", "print(totalWord)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. 
POSTAGGING HMM-VITERBI" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def frekuensi(korpus):\n", " tagCount = {}\n", " tagCount[''] = 0\n", " wordTag={}\n", " tagTrans={}\n", " idxLine=0\n", " firstWord=0\n", " \n", " while idxLine < len(korpus):\n", " prevTag = ''\n", " while not korpus[idxLine].startswith('',korpusPart[1])\n", " firstWord = 0\n", " else:\n", " currentTagTrans=(prevTag,korpusPart[1])\n", " \n", " if currentTagTrans in tagTrans:\n", " tagTrans[currentTagTrans] += 1\n", " else:\n", " tagTrans[currentTagTrans] = 1\n", " prevTag = korpusPart[1]\n", " \n", " else:\n", " tagCount[''] += 1\n", " firstWord = 1\n", " \n", " idxLine = idxLine+1\n", " \n", " idxLine = idxLine + 1\n", " \n", " return tagCount, wordTag, tagTrans" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def dictProbTransition(tagTrans, tagCount):\n",
  "    \"\"\"Turn tag-transition counts into transition probabilities.\n",
  "\n",
  "    Parameters:\n",
  "        tagTrans: dict {(previousTag, tag): count}.\n",
  "        tagCount: dict {tag: count}; must hold every previousTag.\n",
  "\n",
  "    Returns:\n",
  "        dict {(previousTag, tag): count / tagCount[previousTag]}.\n",
  "    \"\"\"\n",
  "    return {\n",
  "        (prevTag, tag): count / tagCount[prevTag]\n",
  "        for (prevTag, tag), count in tagTrans.items()\n",
  "    }" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def dictProbEmission(wordTag, tagCount):\n",
  "    \"\"\"Turn word/tag counts into emission probabilities.\n",
  "\n",
  "    Parameters:\n",
  "        wordTag: dict {(word, tag): count}.\n",
  "        tagCount: dict {tag: count}; must hold every tag of wordTag.\n",
  "\n",
  "    Returns:\n",
  "        dict {(word, tag): count / tagCount[tag]}.\n",
  "    \"\"\"\n",
  "    return {\n",
  "        (word, tag): count / tagCount[tag]\n",
  "        for (word, tag), count in wordTag.items()\n",
  "    }" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Forward slash: portable, and consistent with the path used when saving results.\n",
  "with open('Hasil Tag/Tag Korpus anotasi.txt', encoding='utf-8') as file:\n",
  "    korpus = file.readlines()\n",
  "korpus = [req.strip() for req in korpus]\n",
  "tagCount, wordTag, tagTrans = frekuensi(korpus)\n",
  "print(wordTag)\n",
  "# # print(tagCount)\n",
  "# # print(tagTrans)" ] },
 { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "probabilityTrans = dictProbTransition(tagTrans,tagCount)\n", "probabilityEmission = dictProbEmission(wordTag, tagCount)\n", "\n", "print(probabilityTrans)\n", "# print(probabilityEmission)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modelTag = {}\n", "modelKata = []\n", "for reqs in wordTag:\n", " if reqs[1] in modelTag:\n", " modelTag[reqs[1]] += 1\n", " else:\n", " modelTag[reqs[1]] = 1\n", " if reqs[0] not in modelKata:\n", " modelKata.append(reqs[0])\n", "tagMax = max(modelTag, key=lambda reqs:modelTag[reqs])\n", "print(modelKata)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def findTag(probabilityEmission, reqWords):\n", " tags = []\n", " for word in probabilityEmission:\n", " if reqWords == word[0]:\n", " tags.append(word[1])\n", " return tags" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def viterbi(probabilityTransition, probabilityEmission, reqWords, tagCount, tagMax, modelKata):\n", " tagSequence = {}\n", " viterbiMat = {}\n", " emissionTable = {}\n", " transitionTable = {}\n", " tagArray = []\n", " temp = {}\n", " \n", " keysEmission = probabilityEmission.keys()\n", " keysTrans = probabilityTransition.keys()\n", " keysCount = tagCount.keys()\n", " \n", " for i in range(len(reqWords)):\n", " if reqWords[i] not in modelKata:\n", " modelKata.append(reqWords[i])\n", " probabilityEmission[(reqWords[i],tagMax)] = 1/tagCount[tagMax]\n", " \n", " tagArray = list(set(tagArray+findTag(probabilityEmission,reqWords[i])))\n", "\n", " for tag in tagArray:\n", " if ('',tag) in keysTrans:\n", " transitionTable[('',tag)] = probabilityTrans[('',tag)]\n", " else:\n", " transitionTable[('',tag)] = 0\n", " \n", " for tag1 in tagArray:\n", " for tag2 in tagArray: \n", " if (tag1, tag2) in keysTrans:\n", " transitionTable[(tag1,tag2)] = 
probabilityTrans[(tag1,tag2)]\n", " else:\n", " transitionTable[(tag1,tag2)] = 0\n", " \n", " for req in reqWords:\n", " for tag in tagArray:\n", " if (req,tag) in probabilityEmission:\n", " emissionTable[(req,tag)] = probabilityEmission[(req,tag)]\n", " else:\n", " emissionTable[(req,tag)] = 0\n", " \n", " for i in range(0,len(reqWords)):\n", " if i==0:\n", " for tag in tagArray:\n", " viterbiMat[(reqWords[i],tag)] = emissionTable[(reqWords[i],tag)]*transitionTable[('',tag)]\n", " tagSequence[(reqWords[i],tag)] = tag\n", " else:\n", " for tag1 in tagArray:\n", " newScore = {}\n", " for tag2 in tagArray:\n", " newScore[(reqWords[i-1],tag2)] = viterbiMat[(reqWords[i-1],tag2)]*transitionTable[(tag2,tag1)]\n", " max_path = max(newScore, key=lambda k: newScore[k])\n", " viterbiMat[(reqWords[i],tag1)] = newScore[max_path] * emissionTable[(reqWords[i],tag1)]\n", " tagSequence[(reqWords[i],tag1)] = max_path\n", " \n", " last_word = reqWords[len(reqWords)-1]\n", " tagg = tagArray[0]\n", " tags = [tagg]\n", " maximum = viterbiMat[(last_word,tagg)]\n", " for i in range(1,len(tagArray)):\n", " if viterbiMat[(last_word,tagArray[i])] > maximum:\n", " maximum = viterbiMat[(last_word,tagArray[i])]\n", " tags = [tagArray[i]]\n", " for i in range(len(reqWords)-2,-1,-1):\n", " tags.insert(0,tagSequence[(reqWords[i+1],tags[0])][1])\n", " \n", " return tags " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Hasil Pemberian Tag" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "viterbiResult = []\n", "for reqWords in tokenization:\n", " viterbiReqs = viterbi(probabilityTrans,probabilityEmission,reqWords, tagCount, tagMax, modelKata)\n", " viterbiResult.append(viterbiReqs)\n", "\n", "for i in range(len(viterbiResult)):\n", " print(i,viterbiResult[i])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. 
Transformasi Data Hasil Tag ke Dokumen" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "saveFile=\"Hasil Tag/Tag \"+inputname+\".txt\"\n",
  "# The with-block closes the file on exit, so no explicit close() is needed.\n",
  "with open(saveFile,\"w+\",encoding='utf-8') as file:\n",
  "    for reqWords, reqTags in zip(tokenization, viterbiResult):\n",
  "        file.write(\"\"+\"\\n\")\n",
  "        # One token and its tag per line, separated by a tab.\n",
  "        for word, tag in zip(reqWords, reqTags):\n",
  "            file.write(word+\"\\t\"+tag)\n",
  "            file.write(\"\\n\")\n",
  "        file.write(\"\"+\"\\n\")" ] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 2 }