{ "cells": [ { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "#data input output\n", "import pandas as pd\n", "import nltk\n", "from nltk import word_tokenize, pos_tag, pos_tag_sents\n", "\n", "from pandas import ExcelWriter\n", "import numpy as np\n", "from pprint import pprint\n", "from sklearn.svm import LinearSVC, SVC\n", "from sklearn.pipeline import Pipeline\n", "from seqeval.metrics import classification_report as seq_met\n", "from sklearn.metrics import classification_report as skl_met\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction import DictVectorizer\n", "\n", "from sklearn.naive_bayes import MultinomialNB" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('testone.csv')" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "df.to_csv('output.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChapterVerseTokenPOSLabel
011InINO
111theDTO
211nameNNO
311ofINO
411AllahNNPO
\n", "
" ], "text/plain": [ " Chapter Verse Token POS Label\n", "0 1 1 In IN O\n", "1 1 1 the DT O\n", "2 1 1 name NN O\n", "3 1 1 of IN O\n", "4 1 1 Allah NNP O" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "493" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "#drop kolom surah\n", "df.drop('Surah',axis=1,inplace=True) \n", "\n", "#punctuation\n", "df['Text'] = df['Text'].str.replace('[^\\w\\s]','')\n", "\n", "#tokenization\n", "df['Text'] = df.apply(lambda row: nltk.word_tokenize(row['Text']), axis=1)\n", " \n", "#post_tag\n", "a = df['Text']\n", "df['Text'] = list(map(nltk.pos_tag,a))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "493" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "bag1 = df['Text']\n", "bag2 =df['Verse']\n", "bag3 = df['Chapter']\n", "\n", "jadi = []\n", "for i in range(len(bag2)):\n", " for token in bag1[i]:\n", " jadi.append([bag3[i],bag2[i], token[0],token[1]])\n", "\n", "#[[c, i, x[0], x[1]] for c in bag3 for i in bag2 for token in bag1 for x in token]" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "dfs = pd.DataFrame(jadi)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "dfs.to_csv('output.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# START HERE" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "import re\n", "from nltk.util import ngrams\n", "\n", "# s = s.lower()\n", "# s = re.sub(r'[^a-zA-Z0-9\\s]', ' ', s)\n", "# tokens = [token for token in s.split(\" \") if token != \"\"]\n", "# output = list(ngrams(tokens, 2))" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('dataset fix.csv')" ] }, { "cell_type": "code", "execution_count": 190, "metadata": {}, "outputs": [], "source": [ "class SentenceGetter(object):\n", " def __init__(self, data):\n", " self.n_sent= 1\n", " self.data = data\n", " self.empty = False\n", " \n", " agg_func = lambda s: [(s.name, t, p, e) for t, p, e in zip(s['Token'].values.tolist(), \n", " s['POS'].values.tolist(),\n", " s['Label'].values.tolist())]\n", " \n", " self.grouped = self.data.groupby('Verse').apply(agg_func)\n", " \n", " self.sentences = [s for s in self.grouped]\n", " \n", "getter = SentenceGetter(df)\n", "sentences = getter.sentences" ] }, { "cell_type": "code", "execution_count": 191, "metadata": {}, "outputs": [], "source": [ "def word2features(sent, i):\n", " \n", " word = sent[i][1]\n", " postag = sent[i][2]\n", "\n", " features = {\n", " 'bias': 1.0, \n", " 'word.lower()': word.lower(), \n", " 'word': word,\n", " 'word[-3:]': word[-3:],\n", " 'word[-2:]': word[-2:],\n", " 'word[-1:]': word[-1:],\n", " 'word[:3]': word[:3],\n", " 'word[:2]': word[:2],\n", " 'word[:1]': word[:1],\n", " \n", " 'word.isupper()': word.isupper(),\n", " 'word.islower()': word.islower(),\n", " \n", " 'word.istitle()': word.istitle(),\n", " \n", " 'word.isdigit()': word.isdigit(),\n", " 'postag': postag,\n", 
" \n", " 'word_pls_1.lower()': '', \n", " 'word_pls_1': '',\n", " 'word_pls_1[-3:]': '',\n", " 'word_pls_1[-2:]': '',\n", " 'word_pls_1[-1:]': '',\n", " 'word_pls_1[:3]': '',\n", " 'word_pls_1[:2]': '',\n", " 'word_pls_1[:1]': '',\n", " 'word_pls_1.isupper()': False,\n", " 'word_pls_1.islower()': False,\n", " 'word_pls_1.istitle()': False,\n", " 'word_pls_1.isdigit()': False,\n", " \n", " 'postag_pls_1': '',\n", " \n", " 'word_min_1.lower()': '', \n", " 'word_min_1': '',\n", " 'word_min_1[-3:]': '',\n", " 'word_min_1[-2:]': '',\n", " 'word_min_1[-1:]': '',\n", " 'word_min_1[:3]': '',\n", " 'word_min_1[:2]': '',\n", " 'word_min_1[:1]': '',\n", " 'word_min_1.isupper()': False,\n", " 'word_min_1.islower()': False,\n", " 'word_min_1.istitle()': False,\n", " 'word_min_1.isdigit()': False,\n", " 'postag_min_1': '',\n", " \n", " 'BOS': True if i == 0 else False,\n", " 'EOS': True if i == len(sent)-1 else False,\n", " }\n", " \n", " \n", " if i < len(sent)-1:\n", " word_pls_1 = sent[i+1][1]\n", "# word_pls_2 = sent[i+2][1]\n", " postag_pls_1 = sent[i+1][2]\n", " features.update({\n", " 'word_pls_1.lower()': word_pls_1.lower(), \n", " 'word_pls_1': word_pls_1,\n", "# 'word_pls_2.lower()': word_pls_2.lower(), \n", "# 'word_pls_2': word_pls_2,\n", " 'word_pls_1[-3:]': word_pls_1[-3:],\n", " 'word_pls_1[-2:]': word_pls_1[-2:],\n", " 'word_pls_1[-1:]': word_pls_1[-1:],\n", " 'word_pls_1[:3]': word_pls_1[:3],\n", " 'word_pls_1[:2]': word_pls_1[:2],\n", " 'word_pls_1[:1]': word_pls_1[:1],\n", " 'word_pls_1.isupper()': word_pls_1.isupper(),\n", " 'word_pls_1.islower()': word_pls_1.islower(),\n", " 'word_pls_1.istitle()': word_pls_1.istitle(),\n", " 'word_pls_1.isdigit()': word_pls_1.isdigit(),\n", " 'postag_pls_1': postag_pls_1\n", " })\n", " if i > 0:\n", " word_min_1 = sent[i-1][1]\n", " postag_min_1 = sent[i-1][2]\n", " features.update({\n", " 'word_min_1.lower()': word_min_1.lower(), \n", " 'word_min_1': word_min_1,\n", " \n", " 'word_min_1[-3:]': word_min_1[-3:],\n", " 'word_min_1[-2:]': word_min_1[-2:],\n", " 'word_min_1[-1:]': word_min_1[-1:],\n", " \n", " 'word_min_1[:3]': word_min_1[:3],\n", " 'word_min_1[:2]': word_min_1[:2],\n", " 'word_min_1[:1]': word_min_1[:1],\n", " \n", " 'word_min_1.isupper()': word_min_1.isupper(),\n", " 'word_min_1.islower()': word_min_1.islower(), \n", " 'word_min_1.istitle()': word_min_1.istitle(),\n", " 'word_min_1.isdigit()': word_min_1.isdigit(),\n", " 'postag_min_1': postag_min_1\n", " })\n", " \n", " return features\n", "\n", "def sent2features(sent): \n", " return [word2features(sent, i) for i in range(len(sent))]\n", "def sent2labels(sent):\n", " return [label for verse, token, postag, label in sent]\n", "\n", "#def sent2labels(sent):\n", "# return [label for title, token, postag, label, pred in sent]\n", "#def sent2labelspred(sent):\n", "# return [pred for title, token, postag, label, pred in sent]" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [], "source": [ "X = [x for s in sentences for x in sent2features(s)]\n", "y = [x for s in sentences for x in sent2labels(s)]\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)" ] }, { "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [], "source": [ "# nb = MultinomialNB(alpha=0.01)\n", "# nb.partial_fit(X_train, y_train)#, classes)" ] }, { "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(memory=None,\n", " steps=[('vectorizer', DictVectorizer(dtype=, 
{ "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(memory=None,\n", "     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,\n", "        sparse=True)), ('classifier', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", "     intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", "     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", "     verbose=0))])" ] }, "execution_count": 194, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# one-hot encode the feature dicts, then fit a linear SVM (one-vs-rest)\n", "clf = Pipeline([\n", "    ('vectorizer', DictVectorizer(sparse=True)),\n", "    ('classifier', LinearSVC(multi_class='ovr'))])\n", "\n", "clf.fit(X_train, y_train)" ] },
{ "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of Training:\n", "0.987289767621004\n", "Accuracy of Validation:\n", "0.9560975609756097\n", "              precision    recall  f1-score   support\n", "\n", "       B-PER       0.85      0.83      0.84       150\n", "       I-PER       0.64      0.63      0.63       190\n", "\n", "   micro avg       0.73      0.72      0.73       340\n", "   macro avg       0.75      0.73      0.74       340\n", "weighted avg       0.73      0.72      0.73       340\n", "\n" ] } ], "source": [ "# bigram run: features include the previous and next token\n", "print(\"Accuracy of Training:\")\n", "print(clf.score(X_train, y_train))\n", "print(\"Accuracy of Validation:\")\n", "print(clf.score(X_test, y_test))\n", "\n", "# report only the entity classes; overall accuracy is dominated by 'O'\n", "new_classes = ['B-PER', 'I-PER']\n", "\n", "print(skl_met(y_test, clf.predict(X_test), labels=new_classes))" ] },
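{ "cell_type": "markdown", "metadata": {}, "source": [ "A minimal sketch (added) of span-level scoring with `seqeval`, imported above as `seq_met` but otherwise unused. `seqeval` scores whole entity spans rather than individual tokens and expects one label sequence per sentence; because the split above shuffles individual tokens, this sketch scores the full dataset verse by verse, so its numbers are optimistic and only illustrate the API." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# group gold and predicted labels per verse for seqeval\n", "y_true_seq = [sent2labels(s) for s in sentences]\n", "y_pred_seq = [list(clf.predict(sent2features(s))) for s in sentences]\n", "print(seq_met(y_true_seq, y_pred_seq))" ] },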
\n", " \n", " \n", " y=B-PER\n", " \n", "\n", "\n", "top features\n", " \n", " \n", " \n", " y=I-PER\n", " \n", "\n", "\n", "top features\n", " \n", " \n", " \n", " y=O\n", " \n", "\n", "\n", "top features\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "
\n", " Weight?\n", " Feature
\n", " +0.699\n", " \n", " word_pls_1=messenger\n", "
\n", " +0.696\n", " \n", " word_pls_1[:3]=pol\n", "
\n", " +0.685\n", " \n", " word=disbeliever\n", "
\n", " +0.685\n", " \n", " word.lower()=disbeliever\n", "
\n", " +0.668\n", " \n", " word_pls_1[-3:]=ist\n", "
\n", " +0.668\n", " \n", " word_pls_1=polytheist\n", "
\n", " +0.668\n", " \n", " word_pls_1.lower()=polytheist\n", "
\n", " … 1696 more positive …\n", "
\n", " … 3514 more negative …\n", "
\n", " -0.706\n", " \n", " word[:2]=Me\n", "
\n", " -0.786\n", " \n", " postag_pls_1=CD\n", "
\n", " -0.834\n", " \n", " postag_min_1=NNPS\n", "
\n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "
\n", " Weight?\n", " Feature
\n", " +1.616\n", " \n", " postag=NNPS\n", "
\n", " +0.933\n", " \n", " postag_min_1=VBP\n", "
\n", " +0.886\n", " \n", " word.lower()=messenger\n", "
\n", " +0.883\n", " \n", " word[-3:]=ies\n", "
\n", " +0.826\n", " \n", " word[:3]=Mes\n", "
\n", " +0.779\n", " \n", " word_pls_1.lower()=deviation\n", "
\n", " +0.779\n", " \n", " word_pls_1=deviation\n", "
\n", " +0.779\n", " \n", " word_pls_1[:3]=dev\n", "
\n", " +0.748\n", " \n", " word_min_1[:3]=who\n", "
\n", " … 2503 more positive …\n", "
\n", " … 4391 more negative …\n", "
\n", " -0.774\n", " \n", " word_pls_1=then\n", "
\n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "
\n", " Weight?\n", " Feature
\n", " +0.996\n", " \n", " postag_pls_1=CD\n", "
\n", " … 5125 more positive …\n", "
\n", " … 3050 more negative …\n", "
\n", " -0.881\n", " \n", " word=worthless\n", "
\n", " -0.887\n", " \n", " word_pls_1.lower()=parties\n", "
\n", " -0.887\n", " \n", " word_pls_1=parties\n", "
\n", " -0.891\n", " \n", " word.lower()=messenger\n", "
\n", " -0.938\n", " \n", " word.lower()=companions\n", "
\n", " -0.938\n", " \n", " word=companions\n", "
\n", " -1.032\n", " \n", " word.lower()=disbeliever\n", "
\n", " -1.032\n", " \n", " word=disbeliever\n", "
\n", " -1.066\n", " \n", " postag_pls_1=NNPS\n", "
\n", "\n", " \n", " \n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", " \n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import eli5\n", "eli5.show_weights(clf, top=10)" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChapterCounts
0174
1212319
237080
\n", "
" ], "text/plain": [ " Chapter Counts\n", "0 1 74\n", "1 2 12319\n", "2 3 7080" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('Chapter').size().reset_index(name='Counts')" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "# outside = df['Chapter']\n", "# inside = df['Verse']\n", "# data1 = df['Token']\n", "# data2 = df['POS']\n", "# hier_index = list(zip(outside,inside))\n", "# hier_index = pd.MultiIndex.from_tuples(hier_index)\n" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "# dff = pd.DataFrame(data1,index=hier_index,columns=['A'])\n", "# dff.head()" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "# dff['A'] = df['Token']" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "19473" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "74 + 12319 +7080" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }