Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding my readme_to_run #50

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added #scene_recognition_rana!
Empty file.
1 change: 1 addition & 0 deletions caffemodel/ReadMe.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add .caffemodel file here
Empty file added caffemodel/ReadMe.txt~
Empty file.
124 changes: 124 additions & 0 deletions prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import argparse
import json
import time
import datetime
import numpy as np
import code
import os
import cPickle as pickle
import math
import scipy.io
import subprocess

from imagernn.solver import Solver
from imagernn.imagernn_utils import decodeGenerator, eval_split

"""
This script is used to predict sentences for arbitrary images
that are located in a folder we call root_folder. It is assumed that
the root_folder contains:
- the raw images
- a file tasks.txt that lists the images you'd like to use
- a file vgg_feats.mat that contains the CNN features.
You'll need to use the Matlab script I provided and point it at the
root folder and its tasks.txt file to save the features.

Then point this script at the folder and at a checkpoint model you'd
like to evaluate.
"""

def main(params):

# load the checkpoint
checkpoint_path = params['checkpoint_path']
print 'loading checkpoint %s' % (checkpoint_path, )
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
checkpoint_params = checkpoint['params']
dataset = checkpoint_params['dataset']
model = checkpoint['model']
misc = {}
misc['wordtoix'] = checkpoint['wordtoix']
ixtoword = checkpoint['ixtoword']

# output blob which we will dump to JSON for visualizing the results
blob = {}
blob['params'] = params
blob['checkpoint_params'] = checkpoint_params
blob['imgblobs'] = []
### Function from here should repeat
dir = "/home/eic/neuraltalk/push_model/video/"
n = 0
# lets run for 120 seconds
while n < 120:
if (os.listdir(dir)):
## bash script to extract feautures from the above found image in dir
subprocess.call("bash /home/eic/neuraltalk/push_model/Intermediate_call.sh", shell=True)
# load the tasks.txt file
root_path = params['root_path']
img_names = open(os.path.join(root_path, 'tasks.txt'), 'r').read().splitlines()

# load the features for all images
features_path = os.path.join(root_path, 'vgg_feats.mat')
features_struct = scipy.io.loadmat(features_path)
features = features_struct['feats'] # this is a 4096 x N numpy array of features
D,N = features.shape

# iterate over all images and predict sentences
BatchGenerator = decodeGenerator(checkpoint_params)
for n in xrange(N):
print 'image %d/%d:' % (n, N)

# encode the image
img = {}
img['feat'] = features[:, n]
img['local_file_path'] =img_names[n]

# perform the work. heavy lifting happens inside
kwparams = { 'beam_size' : params['beam_size'] }
Ys = BatchGenerator.predict([{'image':img}], model, checkpoint_params, **kwparams)

# build up the output
img_blob = {}
img_blob['img_path'] = img['local_file_path']

# encode the top prediction
top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
top_prediction = top_predictions[0] # these are sorted with highest on top
candidate = ' '.join([ixtoword[ix] for ix in top_prediction[1] if ix > 0]) # ix 0 is the END token, skip that
print 'PRED: (%f) %s' % (top_prediction[0], candidate)
img_blob['candidate'] = {'text': candidate, 'logprob': top_prediction[0]}
blob['imgblobs'].append(img_blob)

# dump result struct to file
save_file = os.path.join(root_path, 'result_struct.json')
print 'writing predictions to %s...' % (save_file, )
json.dump(blob, open(save_file, 'w'))

# dump output html
html = ''
for img in blob['imgblobs']:
html += '<img src="%s" height="400"><br>' % (img['img_path'], )
html += '(%f) %s <br><br>' % (img['candidate']['logprob'], img['candidate']['text'])
html_file = os.path.join(root_path, 'result.html')
print 'writing html result file to %s...' % (html_file, )
open(html_file, 'w').write(html)
## open the html file in default browser with text
subprocess.call("xdg-open ~/neuraltalk/push_model/Out/result.html", shell=True)
else:
n += 1
print 'waiting for image'
time.sleep(1)


if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument('checkpoint_path', type=str, help='the input checkpoint')
parser.add_argument('-r', '--root_path', default='example_images', type=str, help='folder with the images, tasks.txt file, and corresponding vgg_feats.mat file')
parser.add_argument('-b', '--beam_size', type=int, default=1, help='beam size in inference. 1 indicates greedy per-word max procedure. Good value is approx 20 or so, and more = better.')

args = parser.parse_args()
params = vars(args) # convert to ordinary dict
print 'parsed parameters:'
print json.dumps(params, indent = 2)
main(params)
2 changes: 2 additions & 0 deletions push_model/Intermediate_call.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Extract VGG-16 CNN features (GPU) for every image listed in ./video/tasks.txt
# and write the resulting vgg_feats.mat into ./Out for prediction.py to read.
python ~/neuraltalk/py_caffe_feat_extract.py --model_path ~/neuraltalk/caffemodel/VGG_ILSVRC_16_layers.caffemodel --model_def_path ~/neuraltalk/python_features/deploy_features.prototxt -i ./video --filter ./video/tasks.txt --WITH_GPU -o ./Out
# Empty the drop folder so the watcher in prediction.py does not reprocess the same frame.
rm -r ./video/*
2 changes: 2 additions & 0 deletions push_model/Intermediate_call.sh~
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# NOTE(review): stale editor backup of Intermediate_call.sh (identical content);
# consider removing it from version control.
python ~/neuraltalk/py_caffe_feat_extract.py --model_path ~/neuraltalk/caffemodel/VGG_ILSVRC_16_layers.caffemodel --model_def_path ~/neuraltalk/python_features/deploy_features.prototxt -i ./video --filter ./video/tasks.txt --WITH_GPU -o ./Out
rm -r ./video/*
Binary file added push_model/Out/out.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions push_model/Out/result.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<img src="out.jpg" height="400"><br>(-12.586372) a man is standing in front of a large screen <br><br><img src="out.jpg" height="400"><br>(-8.179808) a bathroom with a toilet and a sink <br><br><img src="out.jpg" height="400"><br>(-8.179808) a bathroom with a toilet and a sink <br><br><img src="out.jpg" height="400"><br>(-8.179808) a bathroom with a toilet and a sink <br><br><img src="out.jpg" height="400"><br>(-8.179808) a bathroom with a toilet and a sink <br><br>
1 change: 1 addition & 0 deletions push_model/Out/result_struct.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"checkpoint_params": {"grad_clip": 10.0, "regc": 2.66e-07, "init_model_from": "", "dataset": "coco", "image_encoding_size": 600, "drop_prob_decoder": 0.5, "word_encoding_size": 600, "max_epochs": 50, "eval_batch_size": 100, "fappend": "lstm", "generator": "lstm", "min_ppl_or_abort": 50.0, "tanhC_version": 1, "eval_max_images": -1, "decay_rate": 0.999, "rnn_feed_once": 0, "hidden_size": 600, "momentum": 0.0, "worker_status_output_directory": "/scail/u/karpathy/rnn-image-describer/status", "rnn_relu_encoders": 0, "learning_rate": 0.000404, "checkpoint_output_directory": "/scail/u/karpathy/rnn-image-describer/cv", "do_grad_check": 0, "word_count_threshold": 5, "batch_size": 64, "write_checkpoint_ppl_threshold": 15.0, "smooth_eps": 1e-08, "solver": "rmsprop", "eval_period": 0.2, "drop_prob_encoder": 0.5}, "imgblobs": [{"img_path": "out.jpg", "candidate": {"text": "a man is standing in front of a large screen", "logprob": -12.586372029703476}}, {"img_path": "out.jpg", "candidate": {"text": "a bathroom with a toilet and a sink", "logprob": -8.1798075037485667}}, {"img_path": "out.jpg", "candidate": {"text": "a bathroom with a toilet and a sink", "logprob": -8.1798075037485667}}, {"img_path": "out.jpg", "candidate": {"text": "a bathroom with a toilet and a sink", "logprob": -8.1798075037485667}}, {"img_path": "out.jpg", "candidate": {"text": "a bathroom with a toilet and a sink", "logprob": -8.1798075037485667}}], "params": {"beam_size": 1, "checkpoint_path": "/home/eic/neuraltalk/data/m.p", "root_path": "./Out"}}
1 change: 1 addition & 0 deletions push_model/Out/tasks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
out.jpg
Binary file added push_model/Out/vgg_feats.mat
Binary file not shown.
1 change: 1 addition & 0 deletions push_model/get_set.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Start the watcher/captioner with checkpoint m.p, using ./Out as the root folder.
python ~/neuraltalk/prediction.py ~/neuraltalk/data/m.p -r ./Out
1 change: 1 addition & 0 deletions push_model/get_set.sh~
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# NOTE(review): stale editor backup of get_set.sh. The "-n 4" flag is not
# declared in prediction.py's argparse setup (only -r and -b), so this
# invocation would fail; consider removing the file from version control.
python ~/neuraltalk/prediction.py ~/neuraltalk/data/m.p -r ./ -n 4
8 changes: 8 additions & 0 deletions push_model/go.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Clear results of the previous run.
rm -r ./Out/*
# Capture one 640x480 frame from the webcam (1s seek) into the watch folder.
ffmpeg -f video4linux2 -s 640x480 -i /dev/video0 -ss 0:0:1 -frames 1 ~/neuraltalk/push_model/video/out.jpg
# Stage the task list for both the feature extractor (./video) and prediction.py (./Out).
cp tasks.txt ./video/tasks.txt
cp tasks.txt ./Out/tasks.txt
# Keep a copy of the frame next to the results so result.html can display it.
cp ./video/out.jpg ./Out/out.jpg
# Give the watcher time to pick up the frame before clearing the drop folder.
sleep 5
rm -r ./video/*

7 changes: 7 additions & 0 deletions push_model/go.sh~
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# NOTE(review): stale editor backup of go.sh (same content minus the initial
# "rm -r ./Out/*"); consider removing it from version control.
ffmpeg -f video4linux2 -s 640x480 -i /dev/video0 -ss 0:0:1 -frames 1 ~/neuraltalk/push_model/video/out.jpg
cp tasks.txt ./video/tasks.txt
cp tasks.txt ./Out/tasks.txt
cp ./video/out.jpg ./Out/out.jpg
sleep 5
rm -r ./video/*

Empty file added push_model/open_result.sh
Empty file.
5 changes: 5 additions & 0 deletions push_model/push.sh~
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Stale editor backup of the capture script (go.sh is the current version).
# Grab one 640x480 webcam frame (2s seek) and drop it into the watch folder.
ffmpeg -f video4linux2 -s 640x480 -i /dev/video0 -ss 0:0:2 -frames 1 ~/neuraltalk/push_model/video/out.jpg
# Use tasks.txt (plural) so py_caffe_feat_extract.py's --filter ./video/tasks.txt
# finds its file, consistent with the other scripts.
cp tasks.txt ./video/tasks.txt
# "sleep 10[s]" is not a valid time interval; seconds is already the default unit.
sleep 10
# Delete only the folder's contents, not the folder itself -- prediction.py
# polls os.listdir() on this directory and would fail if it disappeared.
rm -r ./video/*

1 change: 1 addition & 0 deletions push_model/tasks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
out.jpg
Empty file added push_model/tasks.txt~
Empty file.
5 changes: 5 additions & 0 deletions readme_to_run.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
You have to download 2 files:
1. In the caffemodel folder, add the VGG_ILSVRC_16_layers.caffemodel file:
https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md
2. In the data folder, add the m.p file (the filename should be changed to this):
https://github.com/BVLC/caffe/wiki/Model-Zoo