chiphuyen
diff --git a/‎assignments/chatbot/README.md‎
Lines changed: 45 additions & 1 deletion b/‎assignments/chatbot/README.md‎
Lines changed: 45 additions & 1 deletion
diff --git a/‎assignments/chatbot/chatbot.py‎
Lines changed: 262 additions & 0 deletions b/‎assignments/chatbot/chatbot.py‎
Lines changed: 262 additions & 0 deletions
diff --git a/‎assignments/chatbot/config.py‎
Lines changed: 60 additions & 0 deletions b/‎assignments/chatbot/config.py‎
Lines changed: 60 additions & 0 deletions
@@ -1 +1,45 @@
-You can see sample conversations in output_convo.txt
+A neural chatbot using sequence to sequence model with
+attentional decoder. 
+
+This is based on Google Translate Tensorflow model 
+https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/
+
+Sequence to sequence model by Cho et al.(2014)
+
+Created by Chip Huyen as the starter code for assignment 3,
+class CS 20SI: "TensorFlow for Deep Learning Research"
+cs20si.stanford.edu
+
+The detailed assignment handout can be found at: (not updated)
+
+See output_convo.txt for sample conversations.
+
+Usage:
+Step 1: create a data folder in your project directory, download
+the Cornell Movie-Dialogs Corpus from 
+https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
+and unzip it inside that data folder.
+
+Step 2: python data.py
+This will do all the pre-processing for the Cornell dataset.
+
+Step 3:
+python chatbot.py --mode [train/chat]
+If the mode is train, then you train the chatbot. By default, the model will
+restore the previously trained weights (if there are any) and continue
+training from there.
+
+If you want to start training from scratch, please delete all the checkpoints
+in the checkpoints folder.
+
+If the mode is chat, you'll go into the interaction mode with the bot.
+
+By default, all the conversations you have with the chatbot will be written
+into the file output_convo.txt in the processed folder. If you run this chatbot,
+I kindly ask you to send me the output_convo.txt so that I can improve
+the chatbot. My email is huyenn@stanford.edu
+
+If you find the tutorial helpful, please head over to Anonymous Chatlog Donation
+to see how you can help us create the first realistic dialogue dataset.
+
+Thank you very much!
@@ -0,0 +1,262 @@
+""" A neural chatbot using sequence to sequence model with
+attentional decoder. 
+
+This is based on Google Translate Tensorflow model 
+https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/
+
+Sequence to sequence model by Cho et al.(2014)
+
+Created by Chip Huyen as the starter code for assignment 3,
+class CS 20SI: "TensorFlow for Deep Learning Research"
+cs20si.stanford.edu
+
+This file contains the code to run the model.
+
+See readme.md for instructions on how to run the starter code.
+"""
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import random
+import sys
+import time
+
+import numpy as np
+import tensorflow as tf
+
+from model import ChatBotModel
+import config
+import data
+
+def _get_random_bucket(train_buckets_scale):
+    """ Get a random bucket from which to choose a training sample """
+    rand = random.random()
+    return min([i for i in xrange(len(train_buckets_scale))
+                if train_buckets_scale[i] > rand])
+
+def _assert_lengths(encoder_size, decoder_size, encoder_inputs, decoder_inputs, decoder_masks):
+    """ Assert that the encoder inputs, decoder inputs, and decoder masks are
+    of the expected lengths """
+    if len(encoder_inputs) != encoder_size:
+        raise ValueError("Encoder length must be equal to the one in bucket,"
+                        " %d != %d." % (len(encoder_inputs), encoder_size))
+    if len(decoder_inputs) != decoder_size:
+        raise ValueError("Decoder length must be equal to the one in bucket,"
+                       " %d != %d." % (len(decoder_inputs), decoder_size))
+    if len(decoder_masks) != decoder_size:
+        raise ValueError("Weights length must be equal to the one in bucket,"
+                       " %d != %d." % (len(decoder_masks), decoder_size))
+
+def run_step(sess, model, encoder_inputs, decoder_inputs, decoder_masks, bucket_id, forward_only):
+    """ Run one step in training.
+    @forward_only: boolean value to decide whether a backward path should be created
+    forward_only is set to True when you just want to evaluate on the test set,
+    or when you want to the bot to be in chat mode. """
+    encoder_size, decoder_size = config.BUCKETS[bucket_id]
+    _assert_lengths(encoder_size, decoder_size, encoder_inputs, decoder_inputs, decoder_masks)
+
+    # input feed: encoder inputs, decoder inputs, target_weights, as provided.
+    input_feed = {}
+    for step in xrange(encoder_size):
+        input_feed[model.encoder_inputs[step].name] = encoder_inputs[step]
+    for step in xrange(decoder_size):
+        input_feed[model.decoder_inputs[step].name] = decoder_inputs[step]
+        input_feed[model.decoder_masks[step].name] = decoder_masks[step]
+
+    last_target = model.decoder_inputs[decoder_size].name
+    input_feed[last_target] = np.zeros([model.batch_size], dtype=np.int32)
+
+    # output feed: depends on whether we do a backward step or not.
+    if not forward_only:
+        output_feed = [model.train_ops[bucket_id],  # update op that does SGD.
+                       model.gradient_norms[bucket_id],  # gradient norm.
+                       model.losses[bucket_id]]  # loss for this batch.
+    else:
+        output_feed = [model.losses[bucket_id]]  # loss for this batch.
+        for step in xrange(decoder_size):  # output logits.
+            output_feed.append(model.outputs[bucket_id][step])
+
+    outputs = sess.run(output_feed, input_feed)
+    if not forward_only:
+        return outputs[1], outputs[2], None  # Gradient norm, loss, no outputs.
+    else:
+        return None, outputs[0], outputs[1:]  # No gradient norm, loss, outputs.
+
+def _get_buckets():
+    """ Load the dataset into buckets based on their lengths.
+    train_buckets_scale is the inverval that'll help us 
+    choose a random bucket later on.
+    """
+    test_buckets = data.load_data('test_ids.enc', 'test_ids.dec')
+    data_buckets = data.load_data('train_ids.enc', 'train_ids.dec')
+    train_bucket_sizes = [len(data_buckets[b]) for b in xrange(len(config.BUCKETS))]
+    print("Number of samples in each bucket:\n", train_bucket_sizes)
+    train_total_size = sum(train_bucket_sizes)
+    # list of increasing numbers from 0 to 1 that we'll use to select a bucket.
+    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
+                           for i in xrange(len(train_bucket_sizes))]
+    print("Bucket scale:\n", train_buckets_scale)
+    return test_buckets, data_buckets, train_buckets_scale
+
+def _get_skip_step(iteration):
+    """ How many steps should the model train before it saves all the weights. """
+    if iteration < 100:
+        return 30
+    return 100
+
+def _check_restore_parameters(sess, saver):
+    """ Restore the previously trained parameters if there are any. """
+    ckpt = tf.train.get_checkpoint_state(os.path.dirname(config.CPT_PATH + '/checkpoint'))
+    if ckpt and ckpt.model_checkpoint_path:
+        print("Loading parameters for the Chatbot")
+        saver.restore(sess, ckpt.model_checkpoint_path)
+    else:
+        print("Initializing fresh parameters for the Chatbot")
+
+def _eval_test_set(sess, model, test_buckets):
+    """ Evaluate on the test set. """
+    for bucket_id in xrange(len(config.BUCKETS)):
+        if len(test_buckets[bucket_id]) == 0:
+            print("  Test: empty bucket %d" % (bucket_id))
+            continue
+        start = time.time()
+        encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(test_buckets[bucket_id], 
+                                                                        bucket_id,
+                                                                        batch_size=config.BATCH_SIZE)
+        _, step_loss, _ = run_step(sess, model, encoder_inputs, decoder_inputs, 
+                                   decoder_masks, bucket_id, True)
+        print('Test bucket {}: loss {}, time {}'.format(bucket_id, step_loss, time.time() - start))
+
+def train():
+    """ Train the bot """
+    test_buckets, data_buckets, train_buckets_scale = _get_buckets()
+    # in train mode, we need to create the backward path, so forwrad_only is False
+    model = ChatBotModel(False, config.BATCH_SIZE)
+    model.build_graph()
+
+    saver = tf.train.Saver()
+    initial_step = 0
+
+    with tf.Session() as sess:
+        print('Running session')
+        sess.run(tf.global_variables_initializer())
+        _check_restore_parameters(sess, saver)
+
+        iteration = model.global_step.eval()
+        total_loss = 0
+        while True:
+            skip_step = _get_skip_step(iteration)
+            bucket_id = _get_random_bucket(train_buckets_scale)
+            encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(data_buckets[bucket_id], 
+                                                                           bucket_id,
+                                                                           batch_size=config.BATCH_SIZE)
+            start = time.time()
+            _, step_loss, _ = run_step(sess, model, encoder_inputs, decoder_inputs, decoder_masks, bucket_id, False)
+            total_loss += step_loss
+            iteration += 1
+
+            if iteration % skip_step == 0:
+                print('Iter {}: loss {}, time {}'.format(iteration, total_loss/skip_step, time.time() - start))
+                start = time.time()
+                total_loss = 0
+                saver.save(sess, os.path.join(config.CPT_PATH, 'chatbot'), global_step=model.global_step)
+                if iteration % (10 * skip_step) == 0:
+                    # Run evals on development set and print their loss
+                    _eval_test_set(sess, model, test_buckets)
+                    start = time.time()
+                sys.stdout.flush()
+
+def _get_user_input():
+    """ Get user's input, which will be transformed into encoder input later """
+    print("> ", end="")
+    sys.stdout.flush()
+    return sys.stdin.readline()
+
+def _find_right_bucket(length):
+    """ Find the proper bucket for an encoder input based on its length """
+    return min([b for b in xrange(len(config.BUCKETS))
+                if config.BUCKETS[b][0] >= length])
+
+def _construct_response(output_logits, inv_dec_vocab):
+    """ Construct a response to the user's encoder input.
+    @output_logits: the outputs from sequence to sequence wrapper.
+    output_logits is decoder_size np array, each of dim 1 x DEC_VOCAB
+    
+    This is a greedy decoder - outputs are just argmaxes of output_logits.
+    """
+    print(output_logits[0])
+    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
+    # If there is an EOS symbol in outputs, cut them at that point.
+    if config.EOS_ID in outputs:
+        outputs = outputs[:outputs.index(config.EOS_ID)]
+    # Print out sentence corresponding to outputs.
+    return " ".join([tf.compat.as_str(inv_dec_vocab[output]) for output in outputs])
+
+def chat():
+    """ in test mode, we don't to create the backward path
+    """
+    _, enc_vocab = data.load_vocab(os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
+    inv_dec_vocab, _ = data.load_vocab(os.path.join(config.PROCESSED_PATH, 'vocab.dec'))
+
+    model = ChatBotModel(True, batch_size=1)
+    model.build_graph()
+
+    saver = tf.train.Saver()
+
+    with tf.Session() as sess:
+        sess.run(tf.global_variables_initializer())
+        _check_restore_parameters(sess, saver)
+        output_file = open(os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE), 'a+')
+        # Decode from standard input.
+        max_length = config.BUCKETS[-1][0]
+        print('Welcome to TensorBro. Say something. Enter to exit. Max length is', max_length)
+        while True:
+            line = _get_user_input()
+            if len(line) > 0 and line[-1] == '\n':
+                line = line[:-1]
+            if line == '':
+                break
+            output_file.write('HUMAN ++++ ' + line + '\n')
+            # Get token-ids for the input sentence.
+            token_ids = data.sentence2id(enc_vocab, str(line))
+            if (len(token_ids) > max_length):
+                print('Max length I can handle is:', max_length)
+                line = _get_user_input()
+                continue
+            # Which bucket does it belong to?
+            bucket_id = _find_right_bucket(len(token_ids))
+            # Get a 1-element batch to feed the sentence to the model.
+            encoder_inputs, decoder_inputs, decoder_masks = data.get_batch([(token_ids, [])], 
+                                                                            bucket_id,
+                                                                            batch_size=1)
+            # Get output logits for the sentence.
+            _, _, output_logits = run_step(sess, model, encoder_inputs, decoder_inputs,
+                                           decoder_masks, bucket_id, True)
+            response = _construct_response(output_logits, inv_dec_vocab)
+            print(response)
+            output_file.write('BOT ++++ ' + response + '\n')
+        output_file.write('=============================================\n')
+        output_file.close()
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mode', choices={'train', 'chat'},
+                        default='train', help="mode. if not specified, it's in the train mode")
+    args = parser.parse_args()
+
+    if not os.path.isdir(config.PROCESSED_PATH):
+        data.prepare_raw_data()
+        data.process_data()
+    print('Data ready!')
+    # create checkpoints folder if there isn't one already
+    data.make_dir(config.CPT_PATH)
+
+    if args.mode == 'train':
+        train()
+    elif args.mode == 'chat':
+        chat()
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,60 @@
+""" A neural chatbot using sequence to sequence model with
+attentional decoder. 
+
+This is based on Google Translate Tensorflow model 
+https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/
+
+Sequence to sequence model by Cho et al.(2014)
+
+Created by Chip Huyen as the starter code for assignment 3,
+class CS 20SI: "TensorFlow for Deep Learning Research"
+cs20si.stanford.edu
+
+This file contains the hyperparameters for the model.
+
+See readme.md for instructions on how to run the starter code.
+"""
+
+# parameters for processing the dataset
+DATA_PATH = 'data/cornell movie-dialogs corpus'  # the unzipped Cornell Movie-Dialogs Corpus inside the data folder
+CONVO_FILE = 'movie_conversations.txt'  # NOTE(review): presumably read from DATA_PATH by data.py - confirm there
+LINE_FILE = 'movie_lines.txt'  # NOTE(review): presumably read from DATA_PATH by data.py - confirm there
+OUTPUT_FILE = 'output_convo.txt'  # chat transcript; chatbot.py appends to it inside PROCESSED_PATH
+PROCESSED_PATH = 'processed'  # holds vocab.enc / vocab.dec; chatbot.py runs the data preparation when this folder is missing
+CPT_PATH = 'checkpoints'  # folder where chatbot.py saves and restores model checkpoints
+
+THRESHOLD = 2  # NOTE(review): looks like a minimum word count for the vocabulary - confirm in data.py
+
+PAD_ID = 0  # NOTE(review): these four ids are presumably reserved vocabulary entries - confirm in data.py
+UNK_ID = 1
+START_ID = 2
+EOS_ID = 3  # chatbot.py cuts a decoded response at the first EOS_ID
+
+TESTSET_SIZE = 25000  # NOTE(review): presumably the number of samples held out for testing - confirm in data.py
+
+# model parameters
+""" Train encoder length distribution:
+[175, 92, 11883, 8387, 10656, 13613, 13480, 12850, 11802, 10165, 
+8973, 7731, 7005, 6073, 5521, 5020, 4530, 4421, 3746, 3474, 3192, 
+2724, 2587, 2413, 2252, 2015, 1816, 1728, 1555, 1392, 1327, 1248, 
+1128, 1084, 1010, 884, 843, 755, 705, 660, 649, 594, 558, 517, 475, 
+426, 444, 388, 349, 337]
+These buckets size seem to work the best
+"""
+# [19530, 17449, 17585, 23444, 22884, 16435, 17085, 18291, 18931]
+# BUCKETS = [(6, 8), (8, 10), (10, 12), (13, 15), (16, 19), (19, 22), (23, 26), (29, 32), (39, 44)]
+
+# [37049, 33519, 30223, 33513, 37371]
+# BUCKETS = [(8, 10), (12, 14), (16, 19), (23, 26), (39, 43)]
+
+BUCKETS = [(8, 10), (12, 14), (16, 19)]  # one (encoder_size, decoder_size) pair per bucket, as unpacked in chatbot.run_step
+
+NUM_LAYERS = 3  # NOTE(review): NUM_LAYERS and HIDDEN_SIZE are presumably consumed by model.py - not visible here
+HIDDEN_SIZE = 256
+BATCH_SIZE = 64  # batch size chatbot.py uses for training and for test-set evaluation
+
+LR = 0.5  # NOTE(review): presumably the learning rate, with its decay factor and gradient-clipping norm below - confirm in model.py
+LR_DECAY_FACTOR = 0.99
+MAX_GRAD_NORM = 5.0
+
+NUM_SAMPLES = 512  # NOTE(review): presumably the sample count for a sampled softmax loss - confirm in model.py