{ "cells": [ { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "#data input output\n", "import pandas as pd\n", "import nltk\n", "from nltk import word_tokenize, pos_tag, pos_tag_sents\n", "\n", "from pandas import ExcelWriter\n", "import numpy as np\n", "from pprint import pprint\n", "from sklearn.svm import LinearSVC, SVC\n", "from sklearn.pipeline import Pipeline\n", "from seqeval.metrics import classification_report as seq_met\n", "from sklearn.metrics import classification_report as skl_met\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction import DictVectorizer\n", "\n", "from sklearn.naive_bayes import MultinomialNB" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('testone.csv')" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "df.to_csv('output.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChapterVerseTokenPOSLabel
011InINO
111theDTO
211nameNNO
311ofINO
411AllahNNPO
\n", "
" ], "text/plain": [ " Chapter Verse Token POS Label\n", "0 1 1 In IN O\n", "1 1 1 the DT O\n", "2 1 1 name NN O\n", "3 1 1 of IN O\n", "4 1 1 Allah NNP O" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "493" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "#drop kolom surah\n", "df.drop('Surah',axis=1,inplace=True) \n", "\n", "#punctuation\n", "df['Text'] = df['Text'].str.replace('[^\\w\\s]','')\n", "\n", "#tokenization\n", "df['Text'] = df.apply(lambda row: nltk.word_tokenize(row['Text']), axis=1)\n", " \n", "#post_tag\n", "a = df['Text']\n", "df['Text'] = list(map(nltk.pos_tag,a))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "493" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "bag1 = df['Text']\n", "bag2 =df['Verse']\n", "bag3 = df['Chapter']\n", "\n", "jadi = []\n", "for i in range(len(bag2)):\n", " for token in bag1[i]:\n", " jadi.append([bag3[i],bag2[i], token[0],token[1]])\n", "\n", "#[[c, i, x[0], x[1]] for c in bag3 for i in bag2 for token in bag1 for x in token]" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "dfs = pd.DataFrame(jadi)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "dfs.to_csv('output.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# START HERE" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "import re\n", "from nltk.util import ngrams\n", "\n", "# s = s.lower()\n", "# s = re.sub(r'[^a-zA-Z0-9\\s]', ' ', s)\n", "# tokens = [token for token in s.split(\" \") if token != \"\"]\n", "# output = list(ngrams(tokens, 2))" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('dataset fix.csv')" ] }, { "cell_type": "code", "execution_count": 190, "metadata": {}, "outputs": [], "source": [ "class SentenceGetter(object):\n", " def __init__(self, data):\n", " self.n_sent= 1\n", " self.data = data\n", " self.empty = False\n", " \n", " agg_func = lambda s: [(s.name, t, p, e) for t, p, e in zip(s['Token'].values.tolist(), \n", " s['POS'].values.tolist(),\n", " s['Label'].values.tolist())]\n", " \n", " self.grouped = self.data.groupby('Verse').apply(agg_func)\n", " \n", " self.sentences = [s for s in self.grouped]\n", " \n", "getter = SentenceGetter(df)\n", "sentences = getter.sentences" ] }, { "cell_type": "code", "execution_count": 191, "metadata": {}, "outputs": [], "source": [ "def word2features(sent, i):\n", " \n", " word = sent[i][1]\n", " postag = sent[i][2]\n", "\n", " features = {\n", " 'bias': 1.0, \n", " 'word.lower()': word.lower(), \n", " 'word': word,\n", " 'word[-3:]': word[-3:],\n", " 'word[-2:]': word[-2:],\n", " 'word[-1:]': word[-1:],\n", " 'word[:3]': word[:3],\n", " 'word[:2]': word[:2],\n", " 'word[:1]': word[:1],\n", " \n", " 'word.isupper()': word.isupper(),\n", " 'word.islower()': word.islower(),\n", " \n", " 'word.istitle()': word.istitle(),\n", " \n", " 'word.isdigit()': word.isdigit(),\n", " 'postag': postag,\n", 
" \n", " 'word_pls_1.lower()': '', \n", " 'word_pls_1': '',\n", " 'word_pls_1[-3:]': '',\n", " 'word_pls_1[-2:]': '',\n", " 'word_pls_1[-1:]': '',\n", " 'word_pls_1[:3]': '',\n", " 'word_pls_1[:2]': '',\n", " 'word_pls_1[:1]': '',\n", " 'word_pls_1.isupper()': False,\n", " 'word_pls_1.islower()': False,\n", " 'word_pls_1.istitle()': False,\n", " 'word_pls_1.isdigit()': False,\n", " \n", " 'postag_pls_1': '',\n", " \n", " 'word_min_1.lower()': '', \n", " 'word_min_1': '',\n", " 'word_min_1[-3:]': '',\n", " 'word_min_1[-2:]': '',\n", " 'word_min_1[-1:]': '',\n", " 'word_min_1[:3]': '',\n", " 'word_min_1[:2]': '',\n", " 'word_min_1[:1]': '',\n", " 'word_min_1.isupper()': False,\n", " 'word_min_1.islower()': False,\n", " 'word_min_1.istitle()': False,\n", " 'word_min_1.isdigit()': False,\n", " 'postag_min_1': '',\n", " \n", " 'BOS': True if i == 0 else False,\n", " 'EOS': True if i == len(sent)-1 else False,\n", " }\n", " \n", " \n", " if i < len(sent)-1:\n", " word_pls_1 = sent[i+1][1]\n", "# word_pls_2 = sent[i+2][1]\n", " postag_pls_1 = sent[i+1][2]\n", " features.update({\n", " 'word_pls_1.lower()': word_pls_1.lower(), \n", " 'word_pls_1': word_pls_1,\n", "# 'word_pls_2.lower()': word_pls_2.lower(), \n", "# 'word_pls_2': word_pls_2,\n", " 'word_pls_1[-3:]': word_pls_1[-3:],\n", " 'word_pls_1[-2:]': word_pls_1[-2:],\n", " 'word_pls_1[-1:]': word_pls_1[-1:],\n", " 'word_pls_1[:3]': word_pls_1[:3],\n", " 'word_pls_1[:2]': word_pls_1[:2],\n", " 'word_pls_1[:1]': word_pls_1[:1],\n", " 'word_pls_1.isupper()': word_pls_1.isupper(),\n", " 'word_pls_1.islower()': word_pls_1.islower(),\n", " 'word_pls_1.istitle()': word_pls_1.istitle(),\n", " 'word_pls_1.isdigit()': word_pls_1.isdigit(),\n", " 'postag_pls_1': postag_pls_1\n", " })\n", " if i > 0:\n", " word_min_1 = sent[i-1][1]\n", " postag_min_1 = sent[i-1][2]\n", " features.update({\n", " 'word_min_1.lower()': word_min_1.lower(), \n", " 'word_min_1': word_min_1,\n", " \n", " 'word_min_1[-3:]': word_min_1[-3:],\n", " 'word_min_1[-2:]': word_min_1[-2:],\n", " 'word_min_1[-1:]': word_min_1[-1:],\n", " \n", " 'word_min_1[:3]': word_min_1[:3],\n", " 'word_min_1[:2]': word_min_1[:2],\n", " 'word_min_1[:1]': word_min_1[:1],\n", " \n", " 'word_min_1.isupper()': word_min_1.isupper(),\n", " 'word_min_1.islower()': word_min_1.islower(), \n", " 'word_min_1.istitle()': word_min_1.istitle(),\n", " 'word_min_1.isdigit()': word_min_1.isdigit(),\n", " 'postag_min_1': postag_min_1\n", " })\n", " \n", " return features\n", "\n", "def sent2features(sent): \n", " return [word2features(sent, i) for i in range(len(sent))]\n", "def sent2labels(sent):\n", " return [label for verse, token, postag, label in sent]\n", "\n", "#def sent2labels(sent):\n", "# return [label for title, token, postag, label, pred in sent]\n", "#def sent2labelspred(sent):\n", "# return [pred for title, token, postag, label, pred in sent]" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [], "source": [ "X = [x for s in sentences for x in sent2features(s)]\n", "y = [x for s in sentences for x in sent2labels(s)]\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)" ] }, { "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [], "source": [ "# nb = MultinomialNB(alpha=0.01)\n", "# nb.partial_fit(X_train, y_train)#, classes)" ] }, { "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(memory=None,\n", " steps=[('vectorizer', DictVectorizer(dtype=, 
{ "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(memory=None,\n", "     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,\n", "        sparse=True)), ('classifier', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", "     intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", "     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", "     verbose=0))])" ] }, "execution_count": 194, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# one-hot encode the feature dicts, then fit a linear SVM (one-vs-rest)\n", "clf = Pipeline([\n", "    ('vectorizer', DictVectorizer(sparse=True)),\n", "    ('classifier', LinearSVC(multi_class='ovr'))])\n", "\n", "clf.fit(X_train, y_train)" ] },
{ "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of Training:\n", "0.987289767621004\n", "Accuracy of Validation:\n", "0.9560975609756097\n", "              precision    recall  f1-score   support\n", "\n", "       B-PER       0.85      0.83      0.84       150\n", "       I-PER       0.64      0.63      0.63       190\n", "\n", "   micro avg       0.73      0.72      0.73       340\n", "   macro avg       0.75      0.73      0.74       340\n", "weighted avg       0.73      0.72      0.73       340\n", "\n" ] } ], "source": [ "# bigram run: features include the previous and next token\n", "print(\"Accuracy of Training:\")\n", "print(clf.score(X_train, y_train))\n", "print(\"Accuracy of Validation:\")\n", "print(clf.score(X_test, y_test))\n", "\n", "# report only the entity classes; overall accuracy is dominated by 'O'\n", "new_classes = ['B-PER', 'I-PER']\n", "\n", "print(skl_met(y_test, clf.predict(X_test), labels=new_classes))" ] },
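{ "cell_type": "markdown", "metadata": {}, "source": [ "A minimal sketch (added) of span-level scoring with `seqeval`, imported above as `seq_met` but otherwise unused. `seqeval` scores whole entity spans rather than individual tokens and expects one label sequence per sentence; because the split above shuffles individual tokens, this sketch scores the full dataset verse by verse, so its numbers are optimistic and only illustrate the API." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# group gold and predicted labels per verse for seqeval\n", "y_true_seq = [sent2labels(s) for s in sentences]\n", "y_pred_seq = [list(clf.predict(sent2features(s))) for s in sentences]\n", "print(seq_met(y_true_seq, y_pred_seq))" ] },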
\n", " \n", " \n", " y=B-PER\n", " \n", "\n", "\n", "top features\n", " \n", " \n", " \n", " y=I-PER\n", " \n", "\n", "\n", "top features\n", " \n", " \n", " \n", " y=O\n", " \n", "\n", "\n", "top features\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "
\n", " Weight?\n", " Feature
\n", " +0.699\n", " \n", " word_pls_1=messenger\n", "
\n", " +0.696\n", " \n", " word_pls_1[:3]=pol\n", "
\n", " +0.685\n", " \n", " word=disbeliever\n", "
\n", " +0.685\n", " \n", " word.lower()=disbeliever\n", "
\n", " +0.668\n", " \n", " word_pls_1[-3:]=ist\n", "
\n", " +0.668\n", " \n", " word_pls_1=polytheist\n", "
\n", " +0.668\n", " \n", " word_pls_1.lower()=polytheist\n", "
\n", " … 1696 more positive …\n", "
\n", " … 3514 more negative …\n", "
\n", " -0.706\n", " \n", " word[:2]=Me\n", "
\n", " -0.786\n", " \n", " postag_pls_1=CD\n", "
\n", " -0.834\n", " \n", " postag_min_1=NNPS\n", "
\n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "
\n", " Weight?\n", " Feature
\n", " +1.616\n", " \n", " postag=NNPS\n", "
\n", " +0.933\n", " \n", " postag_min_1=VBP\n", "
\n", " +0.886\n", " \n", " word.lower()=messenger\n", "
\n", " +0.883\n", " \n", " word[-3:]=ies\n", "
\n", " +0.826\n", " \n", " word[:3]=Mes\n", "
\n", " +0.779\n", " \n", " word_pls_1.lower()=deviation\n", "
\n", " +0.779\n", " \n", " word_pls_1=deviation\n", "
\n", " +0.779\n", " \n", " word_pls_1[:3]=dev\n", "
\n", " +0.748\n", " \n", " word_min_1[:3]=who\n", "
\n", " … 2503 more positive …\n", "
\n", " … 4391 more negative …\n", "
\n", " -0.774\n", " \n", " word_pls_1=then\n", "
\n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "
\n", " Weight?\n", " Feature
\n", " +0.996\n", " \n", " postag_pls_1=CD\n", "
\n", " … 5125 more positive …\n", "
\n", " … 3050 more negative …\n", "
\n", " -0.881\n", " \n", " word=worthless\n", "
\n", " -0.887\n", " \n", " word_pls_1.lower()=parties\n", "
\n", " -0.887\n", " \n", " word_pls_1=parties\n", "
\n", " -0.891\n", " \n", " word.lower()=messenger\n", "
\n", " -0.938\n", " \n", " word.lower()=companions\n", "
\n", " -0.938\n", " \n", " word=companions\n", "
\n", " -1.032\n", " \n", " word.lower()=disbeliever\n", "
\n", " -1.032\n", " \n", " word=disbeliever\n", "
\n", " -1.066\n", " \n", " postag_pls_1=NNPS\n", "
\n", "\n", " \n", " \n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import eli5\n", "eli5.show_weights(clf, top=10)" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChapterCounts
0174
1212319
237080
\n", "
" ], "text/plain": [ " Chapter Counts\n", "0 1 74\n", "1 2 12319\n", "2 3 7080" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('Chapter').size().reset_index(name='Counts')" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "# outside = df['Chapter']\n", "# inside = df['Verse']\n", "# data1 = df['Token']\n", "# data2 = df['POS']\n", "# hier_index = list(zip(outside,inside))\n", "# hier_index = pd.MultiIndex.from_tuples(hier_index)\n" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "# dff = pd.DataFrame(data1,index=hier_index,columns=['A'])\n", "# dff.head()" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "# dff['A'] = df['Token']" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "19473" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "74 + 12319 +7080" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }