Skip to content

Commit 7a2a2ea

Browse files
committed
adding simpler pickle test scripts
1 parent 06f6e50 commit 7a2a2ea

7 files changed

Lines changed: 246 additions & 9 deletions

File tree

code/ch08/ch08.ipynb

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
},
3535
{
3636
"cell_type": "code",
37-
"execution_count": 1,
37+
"execution_count": 2,
3838
"metadata": {
3939
"collapsed": false
4040
},
@@ -43,14 +43,16 @@
4343
"name": "stdout",
4444
"output_type": "stream",
4545
"text": [
46+
"The watermark extension is already loaded. To reload it, use:\n",
47+
" %reload_ext watermark\n",
4648
"Sebastian Raschka \n",
47-
"last updated: 2016-06-05 \n",
49+
"last updated: 2016-06-30 \n",
4850
"\n",
4951
"CPython 3.5.1\n",
5052
"IPython 4.2.0\n",
5153
"\n",
5254
"numpy 1.11.0\n",
53-
"pandas 0.18.0\n",
55+
"pandas 0.18.1\n",
5456
"matplotlib 1.5.1\n",
5557
"scikit-learn 0.17.1\n",
5658
"nltk 3.2.1\n"
@@ -354,7 +356,7 @@
354356
},
355357
{
356358
"cell_type": "code",
357-
"execution_count": 15,
359+
"execution_count": 3,
358360
"metadata": {
359361
"collapsed": false
360362
},

code/ch09/ch09.ipynb

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,33 @@
226226
"next(stream_docs(path='./movie_data.csv'))"
227227
]
228228
},
229+
{
230+
"cell_type": "markdown",
231+
"metadata": {},
232+
"source": [
233+
"### Note\n",
234+
"\n",
235+
"The pickling-section may be a bit tricky so that I included simpler test scripts in this directory (pickle-test-scripts/) to check if your environment is set up correctly. Basically, it is just a trimmed-down version of the relevant sections from Ch08, including a very small movie_review_data subset.\n",
236+
"\n",
237+
"Executing\n",
238+
"\n",
239+
" python pickle-dump-test.py\n",
240+
"\n",
241+
"will train a small classification model from the `movie_data_small.csv` and create the 2 pickle files \n",
242+
"\n",
243+
" stopwords.pkl\n",
244+
" classifier.pkl\n",
245+
"\n",
246+
"Next, if you execute\n",
247+
"\n",
248+
" python pickle-load-test.py\n",
249+
"\n",
250+
"You should see the following 2 lines as output:\n",
251+
"\n",
252+
" Prediction: positive\n",
253+
" Probability: 85.71%"
254+
]
255+
},
229256
{
230257
"cell_type": "markdown",
231258
"metadata": {},
@@ -1014,21 +1041,21 @@
10141041
],
10151042
"metadata": {
10161043
"kernelspec": {
1017-
"display_name": "Python 2",
1044+
"display_name": "Python 3",
10181045
"language": "python",
1019-
"name": "python2"
1046+
"name": "python3"
10201047
},
10211048
"language_info": {
10221049
"codemirror_mode": {
10231050
"name": "ipython",
1024-
"version": 2
1051+
"version": 3
10251052
},
10261053
"file_extension": ".py",
10271054
"mimetype": "text/x-python",
10281055
"name": "python",
10291056
"nbconvert_exporter": "python",
1030-
"pygments_lexer": "ipython2",
1031-
"version": "2.7.11"
1057+
"pygments_lexer": "ipython3",
1058+
"version": "3.5.1"
10321059
}
10331060
},
10341061
"nbformat": 4,
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
**Note**
2+
3+
The pickling section may be a bit tricky, so I included simpler test scripts in this directory (pickle-test-scripts/) to check if your environment is set up correctly. Basically, it is just a trimmed-down version of the relevant sections from Ch08, including a very small movie_review_data subset.
4+
5+
Executing
6+
7+
python pickle-dump-test.py
8+
9+
will train a small classification model from the `movie_data_small.csv` and create the 2 pickle files
10+
11+
stopwords.pkl
12+
classifier.pkl
13+
14+
Next, if you execute
15+
16+
python pickle-load-test.py
17+
18+
You should see the following 2 lines as output:
19+
20+
Prediction: positive
21+
Probability: 85.71%

code/ch09/pickle-test-scripts/movie_data_small.csv

Lines changed: 102 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
"""Train a small movie-review sentiment classifier and pickle the artifacts.

Reads ``movie_data_small.csv`` from the current directory, fits a hashing
vectorizer + SGD logistic-regression classifier on it, and writes two
pickle files (``stopwords.pkl``, ``classifier.pkl``) for pickle-load-test.py
to consume.
"""
import pickle
import os
import re

import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


# English stop words dropped during tokenization (also pickled so the
# loading side can rebuild an identical tokenizer).
stop = stopwords.words('english')


def tokenizer(text):
    """Tokenize a review: strip HTML, keep emoticons, lowercase, drop stop words.

    Returns the list of remaining word tokens.
    """
    # Remove HTML tags.
    text = re.sub(r'<[^>]*>', '', text)
    # Capture emoticons such as :), ;-(, =D before punctuation is stripped.
    # Raw strings avoid invalid-escape warnings for \( and \).
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    # Replace non-word characters with spaces, then re-append the emoticons
    # (hyphen "noses" removed) so they survive as tokens.
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


# Stateless hashing vectorizer: no fitted vocabulary, so the loading side
# can recreate it identically instead of pickling it.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# loss='log' gives logistic regression, so predict_proba is available.
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)


df = pd.read_csv('./movie_data_small.csv', encoding='utf-8')

# The small CSV was produced from the full dataset via:
#   df.loc[:100, :].to_csv('./movie_data_small.csv', index=None)

X_train = df['review'].values
y_train = df['sentiment'].values

X_train = vect.transform(X_train)
clf.fit(X_train, y_train)

# Context managers ensure the pickle files are flushed and closed
# even if dumping raises.
with open('stopwords.pkl', 'wb') as f:
    pickle.dump(stop, f, protocol=4)

with open('classifier.pkl', 'wb') as f:
    pickle.dump(clf, f, protocol=4)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
"""Load the pickled classifier, vectorize one example review, and report
its predicted sentiment and the model's confidence.

Expects ``classifier.pkl`` (written by pickle-dump-test.py) in the current
directory; the vectorizer is rebuilt by the ``vectorizer`` module.
"""
import pickle
import re
import os

from vectorizer import vect
import numpy as np

# NOTE: pickle.load executes arbitrary code on malicious input — only load
# files you produced yourself (here: pickle-dump-test.py's output).
# The with-block closes the file handle deterministically.
with open('classifier.pkl', 'rb') as f:
    clf = pickle.load(f)


# Map the classifier's integer classes back to human-readable labels.
label = {0: 'negative', 1: 'positive'}
example = ['I love this movie']

X = vect.transform(example)

# predict_proba returns per-class probabilities; np.max picks the
# probability of the predicted class.
print('Prediction: %s\nProbability: %.2f%%' %
      (label[clf.predict(X)[0]],
       np.max(clf.predict_proba(X)) * 100))
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
"""Rebuild the HashingVectorizer used at training time.

Imported by pickle-load-test.py. Loads the pickled stop-word list, so
``stopwords.pkl`` (written by pickle-dump-test.py) must exist in the
current working directory when this module is imported.
"""
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle


# NOTE: pickle.load executes arbitrary code on malicious input — only load
# files you produced yourself. The with-block closes the handle promptly.
with open('stopwords.pkl', 'rb') as f:
    stop = pickle.load(f)


def tokenizer(text):
    """Tokenize a review: strip HTML, keep emoticons, lowercase, drop stop words.

    Must match the tokenizer used when the classifier was trained, otherwise
    the hashed features will not line up. Returns the list of tokens.
    """
    # Remove HTML tags.
    text = re.sub(r'<[^>]*>', '', text)
    # Capture emoticons such as :), ;-(, =D before punctuation is stripped.
    # Raw strings avoid invalid-escape warnings for \( and \).
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    # Replace non-word characters with spaces, then re-append the emoticons
    # (hyphen "noses" removed) so they survive as tokens.
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


# Stateless (no fitted vocabulary), so recreating it here yields features
# identical to training time — no need to pickle the vectorizer itself.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

0 commit comments

Comments
 (0)