Skip to content

Commit 7a2a2ea

Browse files
committed
adding simpler pickle test scripts
1 parent 06f6e50 commit 7a2a2ea

7 files changed

Lines changed: 246 additions & 9 deletions

File tree

code/ch08/ch08.ipynb

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
},
3535
{
3636
"cell_type": "code",
37-
"execution_count": 1,
37+
"execution_count": 2,
3838
"metadata": {
3939
"collapsed": false
4040
},
@@ -43,14 +43,16 @@
4343
"name": "stdout",
4444
"output_type": "stream",
4545
"text": [
46+
"The watermark extension is already loaded. To reload it, use:\n",
47+
" %reload_ext watermark\n",
4648
"Sebastian Raschka \n",
47-
"last updated: 2016-06-05 \n",
49+
"last updated: 2016-06-30 \n",
4850
"\n",
4951
"CPython 3.5.1\n",
5052
"IPython 4.2.0\n",
5153
"\n",
5254
"numpy 1.11.0\n",
53-
"pandas 0.18.0\n",
55+
"pandas 0.18.1\n",
5456
"matplotlib 1.5.1\n",
5557
"scikit-learn 0.17.1\n",
5658
"nltk 3.2.1\n"
@@ -354,7 +356,7 @@
354356
},
355357
{
356358
"cell_type": "code",
357-
"execution_count": 15,
359+
"execution_count": 3,
358360
"metadata": {
359361
"collapsed": false
360362
},

code/ch09/ch09.ipynb

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,33 @@
226226
"next(stream_docs(path='./movie_data.csv'))"
227227
]
228228
},
229+
{
230+
"cell_type": "markdown",
231+
"metadata": {},
232+
"source": [
233+
"### Note\n",
234+
"\n",
235+
"The pickling-section may be a bit tricky so that I included simpler test scripts in this directory (pickle-test-scripts/) to check if your environment is set up correctly. Basically, it is just a trimmed-down version of the relevant sections from Ch08, including a very small movie_review_data subset.\n",
236+
"\n",
237+
"Executing\n",
238+
"\n",
239+
" python pickle-dump-test.py\n",
240+
"\n",
241+
"will train a small classification model from the `movie_data_small.csv` and create the 2 pickle files \n",
242+
"\n",
243+
" stopwords.pkl\n",
244+
" classifier.pkl\n",
245+
"\n",
246+
"Next, if you execute\n",
247+
"\n",
248+
" python pickle-load-test.py\n",
249+
"\n",
250+
"You should see the following 2 lines as output:\n",
251+
"\n",
252+
" Prediction: positive\n",
253+
" Probability: 85.71%"
254+
]
255+
},
229256
{
230257
"cell_type": "markdown",
231258
"metadata": {},
@@ -1014,21 +1041,21 @@
10141041
],
10151042
"metadata": {
10161043
"kernelspec": {
1017-
"display_name": "Python 2",
1044+
"display_name": "Python 3",
10181045
"language": "python",
1019-
"name": "python2"
1046+
"name": "python3"
10201047
},
10211048
"language_info": {
10221049
"codemirror_mode": {
10231050
"name": "ipython",
1024-
"version": 2
1051+
"version": 3
10251052
},
10261053
"file_extension": ".py",
10271054
"mimetype": "text/x-python",
10281055
"name": "python",
10291056
"nbconvert_exporter": "python",
1030-
"pygments_lexer": "ipython2",
1031-
"version": "2.7.11"
1057+
"pygments_lexer": "ipython3",
1058+
"version": "3.5.1"
10321059
}
10331060
},
10341061
"nbformat": 4,
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
**Note**
2+
3+
The pickling section may be a bit tricky, so I included simpler test scripts in this directory (pickle-test-scripts/) to check if your environment is set up correctly. Basically, it is just a trimmed-down version of the relevant sections from Ch08, including a very small movie_review_data subset.
4+
5+
Executing
6+
7+
python pickle-dump-test.py
8+
9+
will train a small classification model from the `movie_data_small.csv` and create the 2 pickle files
10+
11+
stopwords.pkl
12+
classifier.pkl
13+
14+
Next, if you execute
15+
16+
python pickle-load-test.py
17+
18+
You should see the following 2 lines as output:
19+
20+
Prediction: positive
21+
Probability: 85.71%

code/ch09/pickle-test-scripts/movie_data_small.csv

Lines changed: 102 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
"""Train a small movie-review sentiment classifier and pickle the artifacts.

Reads ``movie_data_small.csv`` from the current directory, fits a hashing
vectorizer + SGD logistic-regression classifier on it, and writes two
pickle files (``stopwords.pkl``, ``classifier.pkl``) for pickle-load-test.py
to consume.
"""
import pickle
import os
import re

import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


# English stop words dropped during tokenization (also pickled so the
# loading side can rebuild an identical tokenizer).
stop = stopwords.words('english')


def tokenizer(text):
    """Tokenize a review: strip HTML, keep emoticons, lowercase, drop stop words.

    Returns the list of remaining word tokens.
    """
    # Remove HTML tags.
    text = re.sub(r'<[^>]*>', '', text)
    # Capture emoticons such as :), ;-(, =D before punctuation is stripped.
    # Raw strings avoid invalid-escape warnings for \( and \).
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    # Replace non-word characters with spaces, then re-append the emoticons
    # (hyphen "noses" removed) so they survive as tokens.
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


# Stateless hashing vectorizer: no fitted vocabulary, so the loading side
# can recreate it identically instead of pickling it.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# loss='log' gives logistic regression, so predict_proba is available.
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)


df = pd.read_csv('./movie_data_small.csv', encoding='utf-8')

# The small CSV was produced from the full dataset via:
#   df.loc[:100, :].to_csv('./movie_data_small.csv', index=None)

X_train = df['review'].values
y_train = df['sentiment'].values

X_train = vect.transform(X_train)
clf.fit(X_train, y_train)

# Context managers ensure the pickle files are flushed and closed
# even if dumping raises.
with open('stopwords.pkl', 'wb') as f:
    pickle.dump(stop, f, protocol=4)

with open('classifier.pkl', 'wb') as f:
    pickle.dump(clf, f, protocol=4)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
"""Load the pickled classifier, vectorize one example review, and report
its predicted sentiment and the model's confidence.

Expects ``classifier.pkl`` (written by pickle-dump-test.py) in the current
directory; the vectorizer is rebuilt by the ``vectorizer`` module.
"""
import pickle
import re
import os

from vectorizer import vect
import numpy as np

# NOTE: pickle.load executes arbitrary code on malicious input — only load
# files you produced yourself (here: pickle-dump-test.py's output).
# The with-block closes the file handle deterministically.
with open('classifier.pkl', 'rb') as f:
    clf = pickle.load(f)


# Map the classifier's integer classes back to human-readable labels.
label = {0: 'negative', 1: 'positive'}
example = ['I love this movie']

X = vect.transform(example)

# predict_proba returns per-class probabilities; np.max picks the
# probability of the predicted class.
print('Prediction: %s\nProbability: %.2f%%' %
      (label[clf.predict(X)[0]],
       np.max(clf.predict_proba(X)) * 100))
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
"""Rebuild the HashingVectorizer used at training time.

Imported by pickle-load-test.py. Loads the pickled stop-word list, so
``stopwords.pkl`` (written by pickle-dump-test.py) must exist in the
current working directory when this module is imported.
"""
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle


# NOTE: pickle.load executes arbitrary code on malicious input — only load
# files you produced yourself. The with-block closes the handle promptly.
with open('stopwords.pkl', 'rb') as f:
    stop = pickle.load(f)


def tokenizer(text):
    """Tokenize a review: strip HTML, keep emoticons, lowercase, drop stop words.

    Must match the tokenizer used when the classifier was trained, otherwise
    the hashed features will not line up. Returns the list of tokens.
    """
    # Remove HTML tags.
    text = re.sub(r'<[^>]*>', '', text)
    # Capture emoticons such as :), ;-(, =D before punctuation is stripped.
    # Raw strings avoid invalid-escape warnings for \( and \).
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    # Replace non-word characters with spaces, then re-append the emoticons
    # (hyphen "noses" removed) so they survive as tokens.
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


# Stateless (no fitted vocabulary), so recreating it here yields features
# identical to training time — no need to pickle the vectorizer itself.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

0 commit comments

Comments
 (0)