blackjack

import gym
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
'''
This code illustrates Monte Carlo policy evaluation: given a fixed policy, we
estimate its state-value function from simulated episodes.

The four figures correspond to Figure 5.2 in the book:

http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf.

The simulator used is the one provided by OpenAI, 

https://github.com/openai/gym/blob/master/gym/envs/toy_text/blackjack.py

The policy evaluated is: the player sticks on 20 or 21 and hits otherwise.
'''


class BlackJackMonteCarloAgent(object):
    def __init__(self):
        # dealer's showing card: ace through 10
        dealer_diff_cards_num = 10
        # player sums 12-21; sums below 12 are ignored since the player can
        # always hit safely
        player_diff_cards_num = 10
        # the player either has a usable ace (counted as 11) or not;
        # note that this status can change during the game
        usable_num = 2
        self.stick = 0
        self.hit = 1
        self.state_value_functions = np.zeros((usable_num, dealer_diff_cards_num, player_diff_cards_num))
        self.state_value_counts = np.zeros((usable_num, dealer_diff_cards_num, player_diff_cards_num))
        # self.avg_state_value_functions = self.state_value_functions/self.state_value_counts,
        # this is what we want to evaluate
        self.avg_state_value_functions = np.zeros((usable_num, dealer_diff_cards_num, player_diff_cards_num))
        # mappings
        self.is_usable_to_index = {True:1, False:0}
        self.cur_num_to_index = {}
        for i in range(12,22):
            self.cur_num_to_index[i] = i - 12

    def take_action(self, ob):
        cur_sum, dealer_num, is_usable = ob
        if cur_sum == 20 or cur_sum == 21:
            action = self.stick
        else:
            action = self.hit
        return action

    def update(self, obs, reward):
        for ob in obs:
            cur_sum, dealer_num, is_usable = ob
            if cur_sum < 12:
                continue
            cur_sum_index = self.cur_num_to_index[cur_sum]
            is_usable_index = self.is_usable_to_index[is_usable]
            dealer_num_index = dealer_num - 1
            self.state_value_counts[is_usable_index][dealer_num_index][cur_sum_index] += 1
            self.state_value_functions[is_usable_index][dealer_num_index][cur_sum_index] += reward

    def summarize(self):
        self.avg_state_value_functions = self.state_value_functions / self.state_value_counts
        return self.avg_state_value_functions

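# Side note: summarize() divides accumulated returns by visit counts at the
# end; a state that was never visited yields 0/0 = NaN and simply shows up as
# a hole in the plot. An equivalent online form updates the mean in place; a
# minimal sketch (names mine):
def incremental_mean(avg, count, ret):
    # running-average update: n <- n + 1; V <- V + (G - V) / n
    count += 1
    avg += (ret - avg) / count
    return avg, count
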

# helper function copied from the OpenAI code, used to compute the observation at the beginning of the game
def sum_hand(hand):  # Return current hand total
    if usable_ace(hand):
        return sum(hand) + 10
    return sum(hand)


# same as above
def usable_ace(hand):  # Does this hand have a usable ace?
    return 1 in hand and sum(hand) + 10 <= 21


# helper function used to plot
def plot(values):
    x = np.arange(10)
    y = np.arange(10)
    x, y = np.meshgrid(x, y)
    fig = plt.figure(figsize=(8, 8))
    n = 1
    episode_num_titles = {0:"10000 episodes", 1:"50000 episodes"}
    is_usable_titles = {0:'Do Not Have Usable ACE' , 1:'Do Have Usable ACE'}
    for i in range(len(values)):
        for k in range(2):
            ax = fig.add_subplot(2, 2, n, projection='3d')
            ax.set_title(is_usable_titles[k] + ', ' + episode_num_titles[i])
            ax.plot_surface(x, y, values[i][k])
            n += 1
    plt.show()

if __name__ == "__main__":
    env = gym.make('Blackjack-v0')
    agent = BlackJackMonteCarloAgent()
    episode_nums = [10000, 50000]
    avgs_state_value_functions = []
    for episode_num in episode_nums:
        for i in range(episode_num):
            reward_sum = 0.0
            env.reset()
            # compute the value of the two cards the player holds at the beginning
            cur_sum = sum_hand(env.player)
            # construct the first observation
            ob = cur_sum, env.dealer[0], usable_ace(env.player)
            done = False
            this_episode_obs = []
            while not done:
                action = agent.take_action(ob)
                old_ob = ob
                this_episode_obs.append(old_ob)
                ob, reward, done, _ = env.step(action)
                reward_sum += reward
            # update the state-value statistics for every state visited in
            # this episode, using the episode's total return
            agent.update(this_episode_obs, reward_sum)

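        # note: the agent's statistics accumulate across runs, so the second
        # summary below reflects all 10000 + 50000 = 60000 episodes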
        avgs_state_value_functions.append(agent.summarize())
    plot(avgs_state_value_functions)





octave code

%%%%%%% CS 229 Machine Learning %%%%%%%%%%%
%%%%%%% Programming Assignment 4 %%%%%%%%%%
%%%
%%% Parts of the code (cart and pole dynamics, and the state
%%% discretization) are adapted from code available at the RL repository
%%% http://www-anw.cs.umass.edu/rlr/domains.html
%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% This file controls the pole-balancing simulation. You need to write
% code in places marked "CODE HERE" only.

% Briefly, the main simulation loop in this file calls cart_pole.m for
% simulating the pole dynamics, get_state.m for discretizing the
% otherwise continuous state space in discrete states, and show_cart.m
% for display.

% Some useful parameters are listed below.

% NUM_STATES: Number of states in the discretized state space
% You must assume that states are numbered 1 through NUM_STATES. The
% state numbered NUM_STATES (the last one) is a special state that marks
% the state when the pole has been judged to have fallen (or when the
% cart is out of bounds). However, you should NOT treat this state any
% differently in your code. Any distinctions you need to make between
% states should come automatically from your learning algorithm.

% After each simulation cycle, you are supposed to update the transition
% counts and rewards observed. However, you should not change either
% your value function or the transition probability matrix at each
% cycle.

% Whenever the pole falls, a section of your code below will be
% executed. At this point, you must use the transition counts and reward
% observations that you have gathered to generate a new model for the MDP
% (i.e., transition probabilities and state rewards). After that, you
% must use value iteration to get the optimal value function for this MDP
% model.
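
% In symbols (notation added here, not part of the assignment), the model
% re-estimate and the value-iteration update being described are:
%
%   P(s'|s,a) ~= count(s,a,s') / sum_t count(s,a,t)
%   V(s) <- R(s) + GAMMA * max_a sum_{s'} P(s'|s,a) * V(s')
%
% and the greedy action used later is argmax_a sum_{s'} P(s'|s,a) * V(s').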

% TOLERANCE: Controls the convergence criteria for each value iteration
% run
% In the value iteration, you can assume convergence when the maximum
% absolute change in the value function at any state in an iteration
% becomes lower than TOLERANCE.

% You need to write code that chooses the best action according
% to your current value function, and the current model of the MDP. The
% action must be either 1 or 2 (corresponding to possible directions of
% pushing the cart).

% Finally, we assume that the simulation has converged when
% 'NO_LEARNING_THRESHOLD' consecutive value function computations all
% converged within one value function iteration. Intuitively, it seems
% like there will be little learning after this, so we end the simulation
% here, and say the overall algorithm has converged.

% Learning curves can be generated by calling plot_learning_curve.m (it
% assumes that the learning was just executed, and the array
% time_steps_to_failure that records the time for which the pole was
% balanced before each failure are in memory). num_failures is a variable
% that stores the number of failures (pole drops / cart out of bounds)
% till now.

% Other parameters in the code are described below:

% GAMMA: Discount factor to be used

% The following parameters control the simulation display; you don't
% really need to know about them:

% pause_time: Controls the pause between successive frames of the
% display. Higher values make your simulation slower.
% min_trial_length_to_start_display: Allows you to start the display only
% after the pole has been successfully balanced for at least this many
% trials. Setting this to zero starts the display immediately. Choosing a
% reasonably high value (around 100) can allow you to rush through the
% initial learning quickly, and start the display only after the
% performance is reasonable.

%%%%%%%%%% Simulation parameters %%%%%%%%%%

pause_time = 0.001;
min_trial_length_to_start_display = 0;
display_started=0;

NUM_STATES = 163;

GAMMA=0.995;

TOLERANCE=0.01;

NO_LEARNING_THRESHOLD = 20;

%%%%%%%%%% End parameter list %%%%%%%%%%

% Time cycle of the simulation
time=0;

% These variables perform bookkeeping (how many cycles was the pole
% balanced for before it fell). Useful for plotting learning curves.
time_steps_to_failure=[];
num_failures=0;
time_at_start_of_current_trial=0;

max_failures=500; % You should reach convergence well before this.

% Starting state is (0 0 0 0)
% x, x_dot, theta, theta_dot represents the actual continuous state vector
x = 0.0; x_dot = 0.0; theta = 0.0; theta_dot = 0.0;

% state is the number given to this state - you only need to consider
% this representation of the state
state = get_state(x, x_dot, theta, theta_dot);

if (min_trial_length_to_start_display == 0 || display_started == 1)
  show_cart(x, x_dot, theta, theta_dot, pause_time);
end

%%% CODE HERE: Perform all your initializations here %%%

% Assume no transitions or rewards have been observed
% Initialize the value function array to small random values (0 to 0.10,
% say)
% Initialize the transition probabilities uniformly (ie, probability of
% transitioning for state x to state y using action a is exactly
% 1/NUM_STATES). Initialize all state rewards to zero.
value_function = rand(1,NUM_STATES) / 10;
% mdp = ones(NUM_STATES, NUM_STATES, 2)/ NUM_STATES;
mdp = zeros(NUM_STATES, NUM_STATES, 2);
rewards = zeros(1, NUM_STATES);
total_count = zeros(NUM_STATES, NUM_STATES, 2);

% (experiment: alternative reward initialization)
% for s = 1:NUM_STATES
%   if (s != NUM_STATES)
%     rewards(s) = 0;
%   else
%     rewards(s) = -1;
%   endif
% endfor
%%%% END YOUR CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%% CODE HERE (while loop condition) %%%
% This is the criterion to end the simulation
% You should change it to terminate when the previous
% 'NO_LEARNING_THRESHOLD' consecutive value function computations all
% converged within one value function iteration. Intuitively, it seems
% like there will be little learning after this, so end the simulation
% here, and say the overall algorithm has converged.

% (a simpler criterion, 'while num_failures < max_failures', also works)
no_learning_count = 0;
while (no_learning_count < NO_LEARNING_THRESHOLD)

  %%% CODE HERE: Write code to choose action (1 or 2) %%%

  % This action choice algorithm is just for illustration. It may
  % convince you that reinforcement learning is nice for control
  % problems! Replace it with your code to choose an action that is
  % optimal according to the current value function, and the current MDP
  % model.
  % if (theta < 0)
  %   action = 1;
  % else
  %   action = 2;
  % end

  cur_state = get_state(x, x_dot, theta, theta_dot);
  % greedy one-step lookahead: expected value of each action under the
  % current model
  value_action_1 = dot(mdp(cur_state,:,1), value_function);
  value_action_2 = dot(mdp(cur_state,:,2), value_function);
  if (value_action_1 >= value_action_2)
    action = 1;
  else
    action = 2;
  end
  % (optional) random exploration during the first few trials:
  % if (num_failures < 20)
  %   if (rand(1) < 0.5), action = 1; else, action = 2; end
  % end

  %%% END YOUR CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

  % Get the next state by simulating the dynamics
  [x, x_dot, theta, theta_dot] = cart_pole(action, x, x_dot, theta, theta_dot);

  % Increment simulation time
  time = time + 1;

  % Get the state number corresponding to new state vector
  new_state = get_state(x, x_dot, theta, theta_dot);

  if (display_started == 1)
    show_cart(x, x_dot, theta, theta_dot, pause_time);
  end

  % Reward function to use - do not change this!
  if (new_state == NUM_STATES)
    R = -1;
  else
    % R = -abs(theta)/2.0;
    R = 0;
  end

  %%% CODE HERE: Perform updates %%%%%%%%%

  % A transition from 'state' to 'new_state' has just been made using
  % 'action'. The reward observed in 'new_state' (note) is 'R'.
  % Write code to update your statistics about the MDP - i.e., the
  % information you are storing on the transitions and on the rewards
  % observed. Do not change the actual MDP parameters, except when the
  % pole falls (the next if block)!

  total_count(state, new_state, action) = total_count(state, new_state, action) + 1;
  rewards(new_state) = R;
  % Recompute MDP model whenever pole falls
  % Compute the value function V for the new model
  if (new_state == NUM_STATES)

    % Update MDP model using the current accumulated statistics about the
    % MDP - transitions and rewards.
    % Make sure you account for the case when total_count is 0, i.e., a
    % state-action pair has never been tried before, or the state has
    % never been visited before. In that case, you must not change that
    % component (and thus keep it at the initialized uniform distribution).

    sum_action = squeeze(sum(total_count, 2));  % (NUM_STATES x 2) visit counts
    % unvisited_mass(s0,a): total probability mass (1/NUM_STATES per state)
    % reserved for successor states never observed from (s0,a)
    unvisited_mass = zeros(NUM_STATES, 2);
    for s0 = 1:NUM_STATES
      for s = 1:NUM_STATES
        for a = 1:2
          if (total_count(s0,s,a) == 0)
            unvisited_mass(s0,a) += 1/NUM_STATES;
          endif
        endfor
      endfor
    endfor
    for s0 = 1:NUM_STATES
      for s = 1:NUM_STATES
        for a = 1:2
          if (total_count(s0,s,a) > 0)
            mdp(s0,s,a) = (1 - unvisited_mass(s0,a)) * total_count(s0,s,a) / sum_action(s0,a);
          endif
        endfor
      endfor
    endfor

    % Perform value iteration using the new estimated model for the MDP.
    % The convergence criterion is based on TOLERANCE as described at the
    % top of the file. If it converges within one iteration, update the
    % variable that checks when the whole simulation must end.
    max_diff_abs = 100;
    iter = 0;
    while (max_diff_abs > TOLERANCE)
      value_function_old = value_function;
      for s = 1:NUM_STATES
        value_action_1 = dot(mdp(s,:,1), value_function_old);
        value_action_2 = dot(mdp(s,:,2), value_function_old);
        max_value_action = max(value_action_1, value_action_2);
        value_function(s) = rewards(s) + GAMMA * max_value_action;
      endfor
      % convergence test: maximum absolute change at any state
      max_diff_abs = max(abs(value_function - value_function_old));
      iter = iter + 1;
    endwhile
    if (iter == 1)
      no_learning_count = no_learning_count + 1;
    else
      % learning is still happening; the stopping rule requires
      % NO_LEARNING_THRESHOLD *consecutive* one-sweep convergences
      no_learning_count = 0;
    endif

    pause(0.2); % You can use this to stop for a while!

  end

  %%% END YOUR CODE %%%%%%%%%%%%%%%%%%%

  % Don't change this code: Controls the simulation, and handles the case
  % when the pole fell and the state must be reinitialized
  if (new_state == NUM_STATES)
    num_failures = num_failures+1
    time_steps_to_failure(num_failures) = time - time_at_start_of_current_trial;
    time_at_start_of_current_trial = time;

    time_steps_to_failure(num_failures)

    if (time_steps_to_failure(num_failures) > ...
        min_trial_length_to_start_display)
      display_started = 1;
    end

    % Reinitialize state
    x = -1.1 + rand(1)*2.2
    % x = 0.0;
    x_dot = 0.0; theta = 0.0; theta_dot = 0.0;
    state = get_state(x, x_dot, theta, theta_dot);
  else
    state = new_state;
  end
end

time_steps_to_failure
% Plot the learning curve (time balanced vs trial)
plot_learning_curve



assignment3

rnn_layers.py

import numpy as np


"""
This file defines layer types that are commonly used for recurrent neural
networks.
"""


def rnn_step_forward(x, prev_h, Wx, Wh, b):
  """
  Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
  activation function.

  The input data has dimension D, the hidden state has dimension H, and we use
  a minibatch size of N.

  Inputs:
  - x: Input data for this timestep, of shape (N, D).
  - prev_h: Hidden state from previous timestep, of shape (N, H)
  - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
  - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
  - b: Biases of shape (H,)

  Returns a tuple of:
  - next_h: Next hidden state, of shape (N, H)
  - cache: Tuple of values needed for the backward pass.
  """
  next_h, cache = None, None
  ##############################################################################
  # TODO: Implement a single forward step for the vanilla RNN. Store the next  #
  # hidden state and any values you need for the backward pass in the next_h   #
  # and cache variables respectively.                                          #
  ##############################################################################
  xWx_plus_hWh = x.dot(Wx) + prev_h.dot(Wh) + b
  next_h = np.tanh(xWx_plus_hWh)
  cache = xWx_plus_hWh, Wx, x, prev_h, Wh
  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################
  return next_h, cache


def rnn_step_backward(dnext_h, cache):
  """
  Backward pass for a single timestep of a vanilla RNN.
  
  Inputs:
  - dnext_h: Gradient of loss with respect to next hidden state
  - cache: Cache object from the forward pass
  
  Returns a tuple of:
  - dx: Gradients of input data, of shape (N, D)
  - dprev_h: Gradients of previous hidden state, of shape (N, H)
  - dWx: Gradients of input-to-hidden weights, of shape (D, H)
  - dWh: Gradients of hidden-to-hidden weights, of shape (H, H)
  - db: Gradients of bias vector, of shape (H,)
  """
  dx, dprev_h, dWx, dWh, db = None, None, None, None, None
  ##############################################################################
  # TODO: Implement the backward pass for a single step of a vanilla RNN.      #
  #                                                                            #
  # HINT: For the tanh function, you can compute the local derivative in terms #
  # of the output value from tanh.                                             #
  ##############################################################################
  xWx_plus_hWh, Wx, x, prev_h, Wh = cache
  dtanx = dnext_h * (1/(np.cosh(xWx_plus_hWh)) ** 2)
  db = np.sum(dtanx, axis=0)
  dx = dtanx.dot(Wx.T)
  dWx = x.T.dot(dtanx)
  dWh = prev_h.T.dot(dtanx)
  dprev_h = dtanx.dot(Wh.T)

  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################
  return dx, dprev_h, dWx, dWh, db
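

# As the hint above notes, the tanh local gradient can also be written in
# terms of the forward output, avoiding the cosh call; a minimal sketch
# (assumes the cache stores next_h = tanh(a) instead of the pre-activation):
def tanh_backward_from_output(dnext_h, next_h):
  # d/da tanh(a) = 1 - tanh(a)**2 = 1 - next_h**2
  return dnext_h * (1 - next_h ** 2)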


def rnn_forward(x, h0, Wx, Wh, b):
  """
  Run a vanilla RNN forward on an entire sequence of data. We assume an input
  sequence composed of T vectors, each of dimension D. The RNN uses a hidden
  size of H, and we work over a minibatch containing N sequences. After running
  the RNN forward, we return the hidden states for all timesteps.
  
  Inputs:
  - x: Input data for the entire timeseries, of shape (N, T, D).
  - h0: Initial hidden state, of shape (N, H)
  - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
  - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
  - b: Biases of shape (H,)
  
  Returns a tuple of:
  - h: Hidden states for the entire timeseries, of shape (N, T, H).
  - cache: Values needed in the backward pass
  """
  h, cache = None, None
  ##############################################################################
  # TODO: Implement forward pass for a vanilla RNN running on a sequence of    #
  # input data. You should use the rnn_step_forward function that you defined  #
  # above.                                                                     #
  ##############################################################################
  N, T, D = x.shape
  H = h0.shape[1]
  cache = []
  prev_h = h0
  h = np.zeros((N, T, H))
  for t in range(0, T):
    xt = x[:,t,:]
    ht, cachet = rnn_step_forward(xt, prev_h, Wx, Wh, b)
    h[:,t,:] = ht
    prev_h = ht
    cache.append(cachet)
  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################
  return h, cache


def rnn_backward(dh, cache):
  """
  Compute the backward pass for a vanilla RNN over an entire sequence of data.
  
  Inputs:
  - dh: Upstream gradients of all hidden states, of shape (N, T, H)
  
  Returns a tuple of:
  - dx: Gradient of inputs, of shape (N, T, D)
  - dh0: Gradient of initial hidden state, of shape (N, H)
  - dWx: Gradient of input-to-hidden weights, of shape (D, H)
  - dWh: Gradient of hidden-to-hidden weights, of shape (H, H)
  - db: Gradient of biases, of shape (H,)
  """
  dx, dh0, dWx, dWh, db = None, None, None, None, None
  ##############################################################################
  # TODO: Implement the backward pass for a vanilla RNN running an entire      #
  # sequence of data. You should use the rnn_step_backward function that you   #
  # defined above.                                                             #
  ##############################################################################
  N, T, H = dh.shape
  xWx_plus_hWh, Wx, x, prev_h, Wh = cache[-1]
  D, H = Wx.shape
  db = np.zeros(H)
  dWx = np.zeros(Wx.shape)
  dWh = np.zeros(Wh.shape)
  dx = np.zeros((N, T, D))
  dprev_ht = np.zeros(prev_h.shape)
  for t in range(T-1, -1, -1):
    cachet = cache[t]
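    # h_t receives gradient from two sources: its own upstream dh[:, t, :]
    # and the gradient flowing back from timestep t+1 (dprev_ht)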
    dht = dh[:,t,:] + dprev_ht
    dxt, dprev_ht, dWxt, dWht, dbt = rnn_step_backward(dht, cachet)
    dx[:,t,:]=dxt
    dWx += dWxt
    dWh += dWht
    db += dbt
  # dh0 receives gradient only through timestep 0; there is no direct upstream dh term for it
  dh0 = dprev_ht
  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################
  return dx, dh0, dWx, dWh, db


def word_embedding_forward(x, W):
  """
  Forward pass for word embeddings. We operate on minibatches of size N where
  each sequence has length T. We assume a vocabulary of V words, assigning each
  to a vector of dimension D.
  
  Inputs:
  - x: Integer array of shape (N, T) giving indices of words. Each element idx
    of x must be in the range 0 <= idx < V.
  - W: Weight matrix of shape (V, D) giving word vectors for all words.
  
  Returns a tuple of:
  - out: Array of shape (N, T, D) giving word vectors for all input words.
  - cache: Values needed for the backward pass
  """
  out, cache = None, None
  ##############################################################################
  # TODO: Implement the forward pass for word embeddings.                      #
  #                                                                            #
  # HINT: This should be very simple.                                          #
  ##############################################################################
  out = W[x]
  V, D = W.shape
  cache = x, W, V, D
  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################
  return out, cache


def word_embedding_backward(dout, cache):
  """
  Backward pass for word embeddings. We cannot back-propagate into the words
  since they are integer indices (discrete values have no meaningful
  gradient), so we only return the gradient for the word embedding matrix.
  
  HINT: Look up the function np.add.at
  
  Inputs:
  - dout: Upstream gradients of shape (N, T, D)
  - cache: Values from the forward pass
  
  Returns:
  - dW: Gradient of word embedding matrix, of shape (V, D).
  """
  dW = None
  ##############################################################################
  # TODO: Implement the backward pass for word embeddings.                     #
  #                                                                            #
  # HINT: Look up the function np.add.at                                       #
  ##############################################################################
  x, W, V, D = cache
  N, T = x.shape
  dW = np.zeros((V, D))
  # straightforward double loop; a vectorized np.add.at version is sketched below
  for i in range(0, N):
    for j in range(0, T):
      v = x[i,j]
      d_dout = dout[i,j]
      dW[v] += d_dout
  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################
  return dW
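

# As noted in the loop above, np.add.at does the same scatter-add in one
# vectorized call; it accumulates correctly even when a word index appears
# several times in x (plain dW[x] += dout would not). A minimal sketch:
def word_embedding_backward_vectorized(dout, cache):
  x, W, V, D = cache
  dW = np.zeros((V, D))
  np.add.at(dW, x, dout)  # dW[x[i, j]] += dout[i, j] for every (i, j)
  return dW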


def sigmoid(x):
  """
  A numerically stable version of the logistic sigmoid function.
  """
  pos_mask = (x >= 0)
  neg_mask = (x < 0)
  z = np.zeros_like(x)
  z[pos_mask] = np.exp(-x[pos_mask])
  z[neg_mask] = np.exp(x[neg_mask])
  top = np.ones_like(x)
  top[neg_mask] = z[neg_mask]
  return top / (1 + z)
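

# Illustrative note (not part of the assignment): the naive form
# 1 / (1 + np.exp(-x)) overflows for large negative x (np.exp(1000) -> inf),
# while the masked version above only ever exponentiates non-positive values:
#   sigmoid(np.array([-1000.0, 0.0, 1000.0]))  ->  [0., 0.5, 1.]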


def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
  """
  Forward pass for a single timestep of an LSTM.
  
  The input data has dimension D, the hidden state has dimension H, and we use
  a minibatch size of N.
  
  Inputs:
  - x: Input data, of shape (N, D)
  - prev_h: Previous hidden state, of shape (N, H)
  - prev_c: previous cell state, of shape (N, H)
  - Wx: Input-to-hidden weights, of shape (D, 4H)
  - Wh: Hidden-to-hidden weights, of shape (H, 4H)
  - b: Biases, of shape (4H,)
  
  Returns a tuple of:
  - next_h: Next hidden state, of shape (N, H)
  - next_c: Next cell state, of shape (N, H)
  - cache: Tuple of values needed for backward pass.
  """
  next_h, next_c, cache = None, None, None
  #############################################################################
  # TODO: Implement the forward pass for a single timestep of an LSTM.        #
  # You may want to use the numerically stable sigmoid implementation above.  #
  #############################################################################
  N, H = prev_h.shape
  a = x.dot(Wx) + prev_h.dot(Wh) + b

  ai = a[:,0:H]
  af = a[:,H:2*H]
  ao = a[:,2*H:3*H]
  ag = a[:,3*H:4*H]

  i = sigmoid(ai)
  f = sigmoid(af)
  o = sigmoid(ao)
  g = np.tanh(ag)

  f_mul_prev_c = f * prev_c
  i_mul_g = i * g
  next_c = f_mul_prev_c + i_mul_g

  tanh_next_c = np.tanh(next_c)
  next_h = o * (tanh_next_c)

  cache = next_h, next_c, tanh_next_c, next_c, i_mul_g, f_mul_prev_c, i, f , o, g, prev_c, prev_h, ai, af, ao, ag, x, Wx, Wh
  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################
  
  return next_h, next_c, cache
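

# For reference, the equations implemented above (standard LSTM step):
#   a = x.Wx + prev_h.Wh + b, split into four H-wide chunks a_i, a_f, a_o, a_g
#   i = sigmoid(a_i),  f = sigmoid(a_f),  o = sigmoid(a_o),  g = tanh(a_g)
#   next_c = f * prev_c + i * g
#   next_h = o * tanh(next_c)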


def lstm_step_backward(dnext_h, dnext_c, cache):
  """
  Backward pass for a single timestep of an LSTM.
  
  Inputs:
  - dnext_h: Gradients of next hidden state, of shape (N, H)
  - dnext_c: Gradients of next cell state, of shape (N, H)
  - cache: Values from the forward pass
  
  Returns a tuple of:
  - dx: Gradient of input data, of shape (N, D)
  - dprev_h: Gradient of previous hidden state, of shape (N, H)
  - dprev_c: Gradient of previous cell state, of shape (N, H)
  - dWx: Gradient of input-to-hidden weights, of shape (D, 4H)
  - dWh: Gradient of hidden-to-hidden weights, of shape (H, 4H)
  - db: Gradient of biases, of shape (4H,)
  """
  dx, dprev_h, dprev_c, dWx, dWh, db = None, None, None, None, None, None
  #############################################################################
  # TODO: Implement the backward pass for a single timestep of an LSTM.       #
  #                                                                           #
  # HINT: For sigmoid and tanh you can compute local derivatives in terms of  #
  # the output value from the nonlinearity.                                   #
  #############################################################################
  N,H = dnext_h.shape
  next_h, next_c, tanh_next_c, next_c, i_mul_g, f_mul_prev_c, i, f , o, g, prev_c, prev_h, ai, af, ao, ag, x, Wx, Wh = cache

  # do not forget the part from dnext_h
  dnext_c_all = dnext_c + dnext_h * o * (1/(np.cosh(next_c))**2)

  dai = dnext_c_all * g * (i * (1-i))
  daf = dnext_c_all * prev_c * (f * (1-f))
  dao = dnext_h * tanh_next_c * (o * (1-o))
  dag = dnext_c_all * i * (1/(np.cosh(ag))**2)

  da = np.zeros((N,4*H))
  da[:, 0:H] = dai
  da[:, H:2 * H] = daf
  da[:, 2 * H:3 * H] = dao
  da[:, 3 * H:4 * H] = dag

  dWx = x.T.dot(da)
  dWh = prev_h.T.dot(da)

  db = np.sum(da, axis=0)
  dprev_c = dnext_c_all * f
  dprev_h = da.dot(Wh.T)
  dx = da.dot(Wx.T)

  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################

  return dx, dprev_h, dprev_c, dWx, dWh, db


def lstm_forward(x, h0, Wx, Wh, b):
  """
  Forward pass for an LSTM over an entire sequence of data. We assume an input
  sequence composed of T vectors, each of dimension D. The LSTM uses a hidden
  size of H, and we work over a minibatch containing N sequences. After running
  the LSTM forward, we return the hidden states for all timesteps.
  
  Note that the initial hidden state is passed as input, while the initial
  cell state is set to zero. Also note that the cell state is not returned;
  it is an internal variable to the LSTM and is not accessed from outside.
  
  Inputs:
  - x: Input data of shape (N, T, D)
  - h0: Initial hidden state of shape (N, H)
  - Wx: Weights for input-to-hidden connections, of shape (D, 4H)
  - Wh: Weights for hidden-to-hidden connections, of shape (H, 4H)
  - b: Biases of shape (4H,)
  
  Returns a tuple of:
  - h: Hidden states for all timesteps of all sequences, of shape (N, T, H)
  - cache: Values needed for the backward pass.
  """
  h, cache = None, None
  #############################################################################
  # TODO: Implement the forward pass for an LSTM over an entire timeseries.   #
  # You should use the lstm_step_forward function that you just defined.      #
  #############################################################################
  N, T, D = x.shape
  H = h0.shape[1]
  cache = []
  prev_h = h0
  h = np.zeros((N, T, H))
  # inital cell is zeros
  prev_c = np.zeros((N,H))
  for t in range(0, T):
    xt = x[:, t, :]
    next_h, next_c, cachet = lstm_step_forward(xt, prev_h, prev_c, Wx, Wh, b)
    h[:, t, :] = next_h
    prev_h = next_h
    prev_c = next_c
    cache.append(cachet)
  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################

  return h, cache


def lstm_backward(dh, cache):
  """
  Backward pass for an LSTM over an entire sequence of data.
  
  Inputs:
  - dh: Upstream gradients of hidden states, of shape (N, T, H)
  - cache: Values from the forward pass
  
  Returns a tuple of:
  - dx: Gradient of input data of shape (N, T, D)
  - dh0: Gradient of initial hidden state of shape (N, H)
  - dWx: Gradient of input-to-hidden weight matrix of shape (D, 4H)
  - dWh: Gradient of hidden-to-hidden weight matrix of shape (H, 4H)
  - db: Gradient of biases, of shape (4H,)
  """
  dx, dh0, dWx, dWh, db = None, None, None, None, None
  #############################################################################
  # TODO: Implement the backward pass for an LSTM over an entire timeseries.  #
  # You should use the lstm_step_backward function that you just defined.     #
  #############################################################################
  N, T, H = dh.shape
  next_h, next_c, tanh_next_c, next_c, i_mul_g, f_mul_prev_c, i, f, o, g, prev_c, prev_h, ai, af, ao, ag, x, Wx, Wh = cache[-1]
  D, H = Wx.shape
  db = np.zeros(H)
  dWx = np.zeros(Wx.shape)
  dWh = np.zeros(Wh.shape)
  dx = np.zeros((N, T, D))
  dprev_ht = np.zeros(prev_h.shape)
  dprev_ct = np.zeros(prev_c.shape)
  for t in range(T - 1, -1, -1):
    cachet = cache[t]
    dht = dh[:, t, :] + dprev_ht
    dxt, dprev_ht, dprev_ct,dWxt, dWht, dbt = lstm_step_backward(dht,dprev_ct,cachet)
    dx[:, t, :] = dxt
    dWx += dWxt
    dWh += dWht
    db += dbt
  # dh0 receives gradient only through timestep 0; there is no direct upstream dh term for it
  dh0 = dprev_ht
  ##############################################################################
  #                               END OF YOUR CODE                             #
  ##############################################################################
  
  return dx, dh0, dWx, dWh, db


def temporal_affine_forward(x, w, b):
  """
  Forward pass for a temporal affine layer. The input is a set of D-dimensional
  vectors arranged into a minibatch of N timeseries, each of length T. We use
  an affine function to transform each of those vectors into a new vector of
  dimension M.

  Inputs:
  - x: Input data of shape (N, T, D)
  - w: Weights of shape (D, M)
  - b: Biases of shape (M,)
  
  Returns a tuple of:
  - out: Output data of shape (N, T, M)
  - cache: Values needed for the backward pass
  """
  N, T, D = x.shape
  M = b.shape[0]
  out = x.reshape(N * T, D).dot(w).reshape(N, T, M) + b
  cache = x, w, b, out
  return out, cache


def temporal_affine_backward(dout, cache):
  """
  Backward pass for temporal affine layer.

  Input:
  - dout: Upstream gradients of shape (N, T, M)
  - cache: Values from forward pass

  Returns a tuple of:
  - dx: Gradient of input, of shape (N, T, D)
  - dw: Gradient of weights, of shape (D, M)
  - db: Gradient of biases, of shape (M,)
  """
  x, w, b, out = cache
  N, T, D = x.shape
  M = b.shape[0]

  dx = dout.reshape(N * T, M).dot(w.T).reshape(N, T, D)
  dw = dout.reshape(N * T, M).T.dot(x.reshape(N * T, D)).T
  db = dout.sum(axis=(0, 1))

  return dx, dw, db


def temporal_softmax_loss(x, y, mask, verbose=False):
  """
  A temporal version of softmax loss for use in RNNs. We assume that we are
  making predictions over a vocabulary of size V for each timestep of a
  timeseries of length T, over a minibatch of size N. The input x gives scores
  for all vocabulary elements at all timesteps, and y gives the indices of the
  ground-truth element at each timestep. We use a cross-entropy loss at each
  timestep, summing the loss over all timesteps and averaging across the
  minibatch.

  As an additional complication, we may want to ignore the model output at some
  timesteps, since sequences of different length may have been combined into a
  minibatch and padded with NULL tokens. The optional mask argument tells us
  which elements should contribute to the loss.

  Inputs:
  - x: Input scores, of shape (N, T, V)
  - y: Ground-truth indices, of shape (N, T) where each element is in the range
       0 <= y[i, t] < V
  - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not
    the scores at x[i, t] should contribute to the loss.

  Returns a tuple of:
  - loss: Scalar giving loss
  - dx: Gradient of loss with respect to scores x.
  """

  N, T, V = x.shape
  
  x_flat = x.reshape(N * T, V)
  y_flat = y.reshape(N * T)
  mask_flat = mask.reshape(N * T)
  
  probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))
  probs /= np.sum(probs, axis=1, keepdims=True)
  # y_flat signifies index in V;
  # the padding NULL is masked out, not contributing to the loss
  loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
  dx_flat = probs.copy()
  dx_flat[np.arange(N * T), y_flat] -= 1
  dx_flat /= N
  dx_flat *= mask_flat[:, None]
  
  if verbose: print 'dx_flat: ', dx_flat.shape
  
  dx = dx_flat.reshape(N, T, V)
  
  return loss, dx
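

# Sanity check (illustrative; the numbers are made up): with near-uniform
# scores and nothing masked, the loss should be close to T * log(V) per
# sequence, the cross-entropy of a uniform prediction.
#   N, T, V = 2, 3, 5
#   x = 0.01 * np.random.randn(N, T, V)
#   y = np.random.randint(V, size=(N, T))
#   mask = np.ones((N, T), dtype=bool)
#   loss, dx = temporal_softmax_loss(x, y, mask)  # loss ~= 3 * ln(5) ~= 4.83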


rnn.py

import numpy as np

from cs231n.layers import *
from cs231n.rnn_layers import *
from cs231n.layer_utils import *

class CaptioningRNN(object):
  """
  A CaptioningRNN produces captions from image features using a recurrent
  neural network.

  The RNN receives input vectors of size D, has a vocab size of V, works on
  sequences of length T, has an RNN hidden dimension of H, uses word vectors
  of dimension W, and operates on minibatches of size N.

  Note that we don't use any regularization for the CaptioningRNN.
  """
  
  def __init__(self, word_to_idx, input_dim=512, wordvec_dim=128,
               hidden_dim=128, cell_type='rnn', dtype=np.float32):
    """
    Construct a new CaptioningRNN instance.

    Inputs:
    - word_to_idx: A dictionary giving the vocabulary. It contains V entries,
      and maps each string to a unique integer in the range [0, V).
    - input_dim: Dimension D of input image feature vectors.
    - wordvec_dim: Dimension W of word vectors.
    - hidden_dim: Dimension H for the hidden state of the RNN.
    - cell_type: What type of RNN to use; either 'rnn' or 'lstm'.
    - dtype: numpy datatype to use; use float32 for training and float64 for
      numeric gradient checking.
    """
    if cell_type not in {'rnn', 'lstm'}:
      raise ValueError('Invalid cell_type "%s"' % cell_type)
    
    self.cell_type = cell_type
    self.dtype = dtype
    self.word_to_idx = word_to_idx
    self.idx_to_word = {i: w for w, i in word_to_idx.iteritems()}
    self.params = {}
    
    vocab_size = len(word_to_idx)

    self._null = word_to_idx['<NULL>']
    self._start = word_to_idx.get('<START>', None)
    self._end = word_to_idx.get('<END>', None)
    
    # Initialize word vectors
    self.params['W_embed'] = np.random.randn(vocab_size, wordvec_dim)
    self.params['W_embed'] /= 100
    
    # Initialize CNN -> hidden state projection parameters
    self.params['W_proj'] = np.random.randn(input_dim, hidden_dim)
    self.params['W_proj'] /= np.sqrt(input_dim)
    self.params['b_proj'] = np.zeros(hidden_dim)

    # Initialize parameters for the RNN
    dim_mul = {'lstm': 4, 'rnn': 1}[cell_type]
    self.params['Wx'] = np.random.randn(wordvec_dim, dim_mul * hidden_dim)
    self.params['Wx'] /= np.sqrt(wordvec_dim)
    self.params['Wh'] = np.random.randn(hidden_dim, dim_mul * hidden_dim)
    self.params['Wh'] /= np.sqrt(hidden_dim)
    self.params['b'] = np.zeros(dim_mul * hidden_dim)
    
    # Initialize output to vocab weights
    self.params['W_vocab'] = np.random.randn(hidden_dim, vocab_size)
    self.params['W_vocab'] /= np.sqrt(hidden_dim)
    self.params['b_vocab'] = np.zeros(vocab_size)
      
    # Cast parameters to correct dtype
    for k, v in self.params.iteritems():
      self.params[k] = v.astype(self.dtype)


  def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN (or LSTM) to compute
    loss and gradients on all parameters.
    
    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T) where
      each element is in the range 0 <= y[i, t] < V
      
    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Cut captions into two pieces: captions_in has everything but the last word
    # and will be input to the RNN; captions_out has everything but the first
    # word and this is what we will expect the RNN to generate. These are offset
    # by one relative to each other because the RNN should produce word (t+1)
    # after receiving word t. The first element of captions_in will be the START
    # token, and the first element of captions_out will be the first word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]
    #print "caption", printCaption(captions,self.idx_to_word)
    #print "caption_in", printCaption(captions_in,self.idx_to_word)
    #print "caption_out", printCaption(captions_out,self.idx_to_word)

    # You'll need this 
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to initial
    # hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    
    # Word embedding matrix
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    
    loss, grads = 0.0, {}

    ############################################################################
    # TODO: Implement the forward and backward passes for the CaptioningRNN.   #
    # In the forward pass you will need to do the following:                   #
    # (1) Use an affine transformation to compute the initial hidden state     #
    #     from the image features. This should produce an array of shape (N, H)#
    # (2) Use a word embedding layer to transform the words in captions_in     #
    #     from indices to vectors, giving an array of shape (N, T, W).         #
    # (3) Use either a vanilla RNN or LSTM (depending on self.cell_type) to    #
    #     process the sequence of input word vectors and produce hidden state  #
    #     vectors for all timesteps, producing an array of shape (N, T, H).    #
    # (4) Use a (temporal) affine transformation to compute scores over the    #
    #     vocabulary at every timestep using the hidden states, giving an      #
    #     array of shape (N, T, V).                                            #
    # (5) Use (temporal) softmax to compute loss using captions_out, ignoring  #
    #     the points where the output word is <NULL>, using the mask above.   #
    #                                                                          #
    # In the backward pass you will need to compute the gradient of the loss   #
    # with respect to all model parameters. Use the loss and grads variables   #
    # defined above to store loss and gradients; grads[k] should give the      #
    # gradients for self.params[k].                                            #
    ############################################################################

    # START LOSS COMPUTATION
    # features (N, D)
    #print "features",features.shape

    #print "W_proj",W_proj.shape,"b_proj",b_proj.shape
    # (1)
    # Inputs
    # features: (N, D)
    # W_proj: (D, H)
    # b_proj: (H,)

    # Output
    # init_h: (N, H)
    # cache_inith: features, W_proj, b_proj
    init_h,cache_inith = affine_forward(features, W_proj, b_proj)

    #print "captions_in", captions_in.shape
    #print "W_embed", W_embed.shape
    # (2)
    # Inputs
    # captions_in: (N, T-1), last being cut
    # W_embed: (V, W)

    # Output
    # captions_in_vec: (N, T, W)
    # cache_captions_in_vec: x, W_embed, V, W
    captions_in_vec, cache_captions_in_vec = word_embedding_forward(captions_in, W_embed)

    #print "Wx", Wx.shape
    #print "Wh",Wh.shape
    #print "b",b.shape
    # (3)
    # Inputs
    # x: (N, T, W)
    # h0: init_h (N, H)
    # Wx: (W, H)
    # Wh: (H, H)
    # b: (H,)

    # Outputs
    # hs: (N, T, H)
    # cache_hs: list of dx, dprev_h, dWx, dWh, db
    if self.cell_type == "rnn":
      hs, cache_hs = rnn_forward(captions_in_vec, init_h, Wx, Wh, b)
    if self.cell_type == "lstm":
      hs, cache_hs = lstm_forward(captions_in_vec, init_h, Wx, Wh, b)

    #print "hs",hs.shape
    #print "W_vocab",W_vocab.shape
    #print "b_vocab",b_vocab.shape
    # (4)
    # Inputs
    # hs: (N, T, H)
    # W_vocab: (H, V)
    # b_vocab: (V)

    # Output
    # vocab_scores: (N ,T ,V)
    # vocab_scores_cache: hs, W_vocab, b_vocab, vocab_scores
    vocab_scores, vocab_scores_cache = temporal_affine_forward(hs, W_vocab, b_vocab)

    # (5)
    # Inputs
    # vocab_scores: (N ,T ,V)
    # captions_out: (N ,T ,V)

    # Output
    # loss: float
    # dout: (N ,T ,V)
    loss, dout = temporal_softmax_loss(vocab_scores, captions_out, mask)

    # START GRADIENT COMPUTATION
    # (4)
    dhs, dW_vocab, db_vocab = temporal_affine_backward(dout, vocab_scores_cache)

    # (3)
    if self.cell_type == "rnn":
      dcaptions_in_vec, dinit_h, dWx, dWh, db = rnn_backward(dhs, cache_hs)
    if self.cell_type == "lstm":
      dcaptions_in_vec, dinit_h, dWx, dWh, db = lstm_backward(dhs, cache_hs)

    # (2)
    dW_embed = word_embedding_backward(dcaptions_in_vec, cache_captions_in_vec)

    # (1)
    dfeatures, dW_proj, db_proj = affine_backward(dinit_h, cache_inith)

    grads['W_vocab'] = dW_vocab
    grads['b_vocab'] = db_vocab

    grads['Wx'] = dWx
    grads['Wh'] = dWh
    grads['b'] = db

    grads['W_embed'] = dW_embed
    grads['W_proj'] = dW_proj
    grads['b_proj'] = db_proj

    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    
    return loss, grads


  def sample(self, features, max_length=30):
    """
    Run a test-time forward pass for the model, sampling captions for input
    feature vectors.

    At each timestep, we embed the current word, pass it and the previous hidden
    state to the RNN to get the next hidden state, use the hidden state to get
    scores for all vocab words, and choose the word with the highest score as
    the next word. The initial hidden state is computed by applying an affine
    transform to the input image features, and the initial word is the <START>
    token.

    For LSTMs you will also have to keep track of the cell state; in that case
    the initial cell state should be zero.

    Inputs:
    - features: Array of input image features of shape (N, D).
    - max_length: Maximum length T of generated captions.

    Returns:
    - captions: Array of shape (N, max_length) giving sampled captions,
      where each element is an integer in the range [0, V). The first element
      of captions should be the first sampled word, not the <START> token.
    """
    N = features.shape[0]
    captions = self._null * np.ones((N, max_length), dtype=np.int32)
    # set the first word of every caption to the <START> token
    captions[:, 0] = self._start

    # Unpack parameters
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    W_embed = self.params['W_embed']
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    
    ###########################################################################
    # TODO: Implement test-time sampling for the model. You will need to      #
    # initialize the hidden state of the RNN by applying the learned affine   #
    # transform to the input image features. The first word that you feed to  #
    # the RNN should be the <START> token; its value is stored in the        #
    # variable self._start. At each timestep you will need to do to:          #
    # (1) Embed the previous word using the learned word embeddings           #
    # (2) Make an RNN step using the previous hidden state and the embedded   #
    #     current word to get the next hidden state.                          #
    # (3) Apply the learned affine transformation to the next hidden state to #
    #     get scores for all words in the vocabulary                          #
    # (4) Select the word with the highest score as the next word, writing it #
    #     to the appropriate slot in the captions variable                    #
    #                                                                         #
    # For simplicity, you do not need to stop generating after an <END> token#
    # is sampled, but you can if you want to.                                 #
    #                                                                         #
    # HINT: You will not be able to use the rnn_forward or lstm_forward       #
    # functions; you'll need to call rnn_step_forward or lstm_step_forward in #
    # a loop.                                                                 #
    ###########################################################################
    # image features to initial hidden state
    init_h, cache_inith = affine_forward(features, W_proj, b_proj)
    prev_h = init_h
    prev_c = np.zeros(prev_h.shape)
    for t in range(0, max_length - 1):
      # (1) embed the word most recently written into captions; this must be
      #     redone inside the loop, since captions is updated as we sample
      #     (embedding all captions once up front would feed stale <NULL>s)
      word_vec, _ = word_embedding_forward(captions[:, t], W_embed)
      # (2) one recurrent step
      if self.cell_type == "rnn":
        h, _ = rnn_step_forward(word_vec, prev_h, Wx, Wh, b)
      if self.cell_type == "lstm":
        h, next_c, _ = lstm_step_forward(word_vec, prev_h, prev_c, Wx, Wh, b)
      # (3) scores over the vocabulary for this timestep
      N, D = h.shape
      vocab_scores, _ = temporal_affine_forward(h.reshape((N, 1, D)), W_vocab, b_vocab)
      # (4) greedy decoding: take the highest-scoring word as the next word
      captions[:, t + 1] = np.argmax(vocab_scores, axis=2).reshape(N)
      prev_h = h
      if self.cell_type == "lstm":
        prev_c = next_c
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    return captions

Saliency map

def compute_saliency_maps(X, y, model):
  """
  Compute a class saliency map using the model for images X and labels y.
  
  Input:
  - X: Input images, of shape (N, 3, H, W)
  - y: Labels for X, of shape (N,)
  - model: A PretrainedCNN that will be used to compute the saliency map.
  
  Returns:
  - saliency: An array of shape (N, H, W) giving the saliency maps for the input
    images.
  """
  saliency = None
  ##############################################################################
  # TODO: Implement this function. You should use the forward and backward     #
  # methods of the PretrainedCNN class, and compute gradients with respect to  #
  # the unnormalized class score of the ground-truth classes in y.             #
  ##############################################################################
  pretrainedCNN = model
  unnormalized_class_score, cache = pretrainedCNN.forward(X)
  dout = np.zeros(unnormalized_class_score.shape)
  dout[np.arange(dout.shape[0]),y] = 1
  dX, grads = pretrainedCNN.backward(dout, cache)
  # saliency: channel-wise max of the absolute image gradient
  saliency = np.max(np.abs(dX), axis=1)
  ##############################################################################
  #                             END OF YOUR CODE                               #
  ##############################################################################
  return saliency
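
# Usage sketch (illustrative; X, y, model, plt as defined in the notebook):
#   saliency = compute_saliency_maps(X, y, model)
#   plt.imshow(saliency[0], cmap=plt.cm.gray)
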
def make_fooling_image(X, target_y, model):
  """
  Generate a fooling image that is close to X, but that the model classifies
  as target_y.
  
  Inputs:
  - X: Input image, of shape (1, 3, 64, 64)
  - target_y: An integer in the range [0, 100)
  - model: A PretrainedCNN
  
  Returns:
  - X_fooling: An image that is close to X, but that is classifed as target_y
    by the model.
  """
  X_fooling = X.copy()
  ##############################################################################
  # TODO: Generate a fooling image X_fooling that the model will classify as   #
  # the class target_y. Use gradient ascent on the target class score, using   #
  # the model.forward method to compute scores and the model.backward method   #
  # to compute image gradients.                                                #
  #                                                                            #
  # HINT: For most examples, you should be able to generate a fooling image    #
  # in fewer than 100 iterations of gradient ascent.                           #
  ##############################################################################
  print 'target_y', target_y
  X1 = X.copy()
  for i in range(0,100):
    print "i =",i,
    unnormalized_class_score, cache = model.forward(X1)
    print "cur score =",unnormalized_class_score[0][target_y]," max =",np.max(unnormalized_class_score)
    # increasing the target_score while decreasing the rest 
    dout = np.full(unnormalized_class_score.shape,-1)
    dout[np.arange(dout.shape[0]),target_y] = 1
    
    dX, grads = model.backward(dout, cache)
    # do not forget the learning rate!
    X1 += 1000 * dX
    if np.argmax(unnormalized_class_score,axis=1) == target_y:
        print "fooling success at i =",i
        X_fooling = X1
        break
  ##############################################################################
  #                             END OF YOUR CODE                               #
  ##############################################################################
  return X_fooling
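
# Illustrative usage (X, target_y, model as in the notebook):
#   X_fooling = make_fooling_image(X[0:1], target_y, model)
#   scores, _ = model.forward(X_fooling)
#   assert np.argmax(scores, axis=1)[0] == target_y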

class visualization

def create_class_visualization(target_y, model, **kwargs):
  """
  Perform optimization over the image to generate class visualizations.
  
  Inputs:
  - target_y: Integer in the range [0, 100) giving the target class
  - model: A PretrainedCNN that will be used for generation
  
  Keyword arguments:
  - learning_rate: Floating point number giving the learning rate
  - blur_every: An integer; how often to blur the image as a regularizer
  - l2_reg: Floating point number giving L2 regularization strength on the image;
    this is lambda in the equation above.
  - max_jitter: How much random jitter to add to the image as regularization
  - num_iterations: How many iterations to run for
  - show_every: How often to show the image
  """
  
  learning_rate = kwargs.pop('learning_rate', 10000)
  blur_every = kwargs.pop('blur_every', 1)
  l2_reg = kwargs.pop('l2_reg', 1e-6)
  max_jitter = kwargs.pop('max_jitter', 4)
  num_iterations = kwargs.pop('num_iterations', 100)
  show_every = kwargs.pop('show_every', 25)
  
  X = np.random.randn(1, 3, 64, 64)
  for t in xrange(num_iterations):
    # As a regularizer, add random jitter to the image
    ox, oy = np.random.randint(-max_jitter, max_jitter+1, 2)
    X = np.roll(np.roll(X, ox, -1), oy, -2)

    dX = None
    ############################################################################
    # TODO: Compute the image gradient dX of the image with respect to the     #
    # target_y class score. This should be similar to the fooling images. Also #
    # add L2 regularization to dX and update the image X using the image       #
    # gradient and the learning rate.                                          #
    ############################################################################
    unnormalized_class_score, cache = model.forward(X)
    # gradient of the target class score only; every other class gets zero
    dout = np.zeros(unnormalized_class_score.shape)
    dout[np.arange(dout.shape[0]), target_y] = 1
    dX, grads = model.backward(dout, cache)
    # subtract the L2 penalty gradient: we ascend the class score while
    # descending the regularizer l2_reg * ||X||^2
    dX -= 2 * l2_reg * X
    X += learning_rate * dX
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    
    # Undo the jitter
    X = np.roll(np.roll(X, -ox, -1), -oy, -2)
    
    # As a regularizer, clip the image
    X = np.clip(X, -data['mean_image'], 255.0 - data['mean_image'])
    
    # As a regularizer, periodically blur the image
    if t % blur_every == 0:
      X = blur_image(X)
    
    # Periodically show the image
    if t % show_every == 0:
      plt.imshow(deprocess_image(X, data['mean_image']))
      plt.gcf().set_size_inches(3, 3)
      plt.axis('off')
      plt.show()
  return X

Feature Inversion
shallow

def invert_features(target_feats, layer, model, **kwargs):
  """
  Perform feature inversion in the style of Mahendran and Vedaldi 2015, using
  L2 regularization and periodic blurring.
  
  Inputs:
  - target_feats: Image features of the target image, of shape (1, C, H, W);
    we will try to generate an image that matches these features
  - layer: The index of the layer from which the features were extracted
  - model: A PretrainedCNN that was used to extract features
  
  Keyword arguments:
  - learning_rate: The learning rate to use for gradient descent
  - num_iterations: The number of iterations to use for gradient descent
  - l2_reg: The strength of L2 regularization to use; this is lambda in the
    equation above.
  - blur_every: How often to blur the image as implicit regularization; set
    to 0 to disable blurring.
  - show_every: How often to show the generated image; set to 0 to disable
    showing intermediate results.
    
  Returns:
  - X: Generated image of shape (1, 3, 64, 64) that matches the target features.
  """
  learning_rate = kwargs.pop('learning_rate', 10000)
  num_iterations = kwargs.pop('num_iterations', 500)
  l2_reg = kwargs.pop('l2_reg', 1e-7)
  blur_every = kwargs.pop('blur_every', 1)
  show_every = kwargs.pop('show_every', 50)
  
  X = np.random.randn(1, 3, 64, 64)
  for t in xrange(num_iterations):
    ############################################################################
    # TODO: Compute the image gradient dX of the reconstruction loss with      #
    # respect to the image. You should include L2 regularization penalizing    #
    # large pixel values in the generated image using the l2_reg parameter;    #
    # then update the generated image using the learning_rate from above.      #
    ############################################################################
    out, cache = model.forward(X, end=layer)
    # dout is the negative gradient of ||out - target_feats||^2 w.r.t. out,
    # so X += learning_rate * dX walks downhill on the reconstruction loss
    dout = 2 * (target_feats - out)
    dX, _ = model.backward(dout, cache)
    # also descend the L2 penalty on pixel values
    dX -= 2 * l2_reg * X
    X += learning_rate * dX
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    
    # As a regularizer, clip the image
    X = np.clip(X, -data['mean_image'], 255.0 - data['mean_image'])
    
    # As a regularizer, periodically blur the image
    if (blur_every > 0) and t % blur_every == 0:
      X = blur_image(X)

    if (show_every > 0) and (t % show_every == 0 or t + 1 == num_iterations):
      plt.imshow(deprocess_image(X, data['mean_image']))
      plt.gcf().set_size_inches(3, 3)
      plt.axis('off')
      plt.title('t = %d' % t)
      plt.show()
  return X
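
A minimal usage sketch, assuming `data` and `model` are loaded as in the notebook (the image and layer index are illustrative): extract features from a real image at a shallow layer, then reconstruct an image from them.

layer = 3  # illustrative shallow layer index
X_img = data['X_train'][0:1]  # one preprocessed image, shape (1, 3, 64, 64)
feats, _ = model.forward(X_img, end=layer)
invert_features(feats, layer, model, learning_rate=10000, num_iterations=500)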

deep

The implementation is identical to the shallow case above; only the layer index passed to invert_features changes, and reconstructing from a deeper layer gives a less faithful, more abstract image.

deep dream

def deepdream(X, layer, model, **kwargs):
  """
  Generate a DeepDream image.
  
  Inputs:
  - X: Starting image, of shape (1, 3, H, W)
  - layer: Index of layer at which to dream
  - model: A PretrainedCNN object
  
  Keyword arguments:
  - learning_rate: How much to update the image at each iteration
  - max_jitter: Maximum number of pixels for jitter regularization
  - num_iterations: How many iterations to run for
  - show_every: How often to show the generated image
  """
  
  X = X.copy()
  
  learning_rate = kwargs.pop('learning_rate', 5.0)
  max_jitter = kwargs.pop('max_jitter', 16)
  num_iterations = kwargs.pop('num_iterations', 100)
  show_every = kwargs.pop('show_every', 25)
  
  for t in xrange(num_iterations):
    # As a regularizer, add random jitter to the image
    ox, oy = np.random.randint(-max_jitter, max_jitter+1, 2)
    X = np.roll(np.roll(X, ox, -1), oy, -2)

    dX = None
    ############################################################################
    # TODO: Compute the image gradient dX using the DeepDream method. You'll   #
    # need to use the forward and backward methods of the model object to      #
    # extract activations and set gradients for the chosen layer. After        #
    # computing the image gradient dX, you should use the learning rate to     #
    # update the image X.                                                      #
    ############################################################################
    out, cache = model.forward(X, end=layer)
    # DeepDream trick: set the gradient at the chosen layer equal to its own
    # activations, which amplifies whatever patterns the layer already detects
    dout = out
    dX, _ = model.backward(dout, cache)
    X += learning_rate * dX
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    
    # Undo the jitter
    X = np.roll(np.roll(X, -ox, -1), -oy, -2)
    
    # As a regularizer, clip the image
    mean_pixel = data['mean_image'].mean(axis=(1, 2), keepdims=True)
    X = np.clip(X, -mean_pixel, 255.0 - mean_pixel)
    
    # Periodically show the image
    if t == 0 or (t + 1) % show_every == 0:
      img = deprocess_image(X, data['mean_image'], mean='pixel')
      plt.imshow(img)
      plt.title('t = %d' % (t + 1))
      plt.gcf().set_size_inches(8, 8)
      plt.axis('off')
      plt.show()
  return X
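
A hypothetical usage sketch (the starting image and layer index are illustrative; `model` and `data` are assumed loaded): any preprocessed (1, 3, H, W) image can be fed in.

X0 = data['X_train'][0:1].astype(np.float64)
X_dream = deepdream(X0, layer=7, model=model, num_iterations=100)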





assignment2

Q1
layers.py

import numpy as np


def affine_forward(x, w, b):
    """
  Computes the forward pass for an affine (fully-connected) layer.

  The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
  examples, where each example x[i] has shape (d_1, ..., d_k). We will
  reshape each input into a vector of dimension D = d_1 * ... * d_k, and
  then transform it to an output vector of dimension M.

  Inputs:
  - x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
  - w: A numpy array of weights, of shape (D, M)
  - b: A numpy array of biases, of shape (M,)

  Returns a tuple of:
  - out: output, of shape (N, M)
  - cache: (x, w, b)
  """
    out = None
    #############################################################################
    # TODO: Implement the affine forward pass. Store the result in out. You     #
    # will need to reshape the input into rows.                                 #
    #############################################################################
    D = np.product(x.shape[1:])
    N = x.shape[0]
    x1 = x.reshape((N, D))

    out = np.dot(x1, w) + b
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    cache = (x, w, b)
    return out, cache
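
A quick shape check (not part of the assignment): a batch of two 2x3 inputs is flattened to D = 6 and mapped to M = 4 outputs.

x = np.random.randn(2, 2, 3)  # N = 2, d_1 = 2, d_2 = 3, so D = 6
w = np.random.randn(6, 4)
b = np.zeros(4)
out, _ = affine_forward(x, w, b)
assert out.shape == (2, 4)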


def affine_backward(dout, cache):
    """
  Computes the backward pass for an affine layer.

  Inputs:
  - dout: Upstream derivative, of shape (N, M)
  - cache: Tuple of:
    - x: Input data, of shape (N, d_1, ... d_k)
    - w: Weights, of shape (D, M)

  Returns a tuple of:
  - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
  - dw: Gradient with respect to w, of shape (D, M)
  - db: Gradient with respect to b, of shape (M,)
  """
    x, w, b = cache
    dx, dw, db = None, None, None
    #############################################################################
    # TODO: Implement the affine backward pass.                                 #
    #############################################################################
    N = x.shape[0]
    db = np.sum(dout, axis=0)

    D = np.product(x.shape[1:])

    x1 = x.reshape((N, D))
    dw = np.dot(x1.T, dout)

    # dx: push dout back through w, then restore the original input shape
    dx = np.dot(dout, w.T)
    dx = dx.reshape(x.shape)
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    return dx, dw, db
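
A gradient-check sketch, assuming the assignment's cs231n.gradient_check.eval_numerical_gradient_array helper is available:

from cs231n.gradient_check import eval_numerical_gradient_array

x = np.random.randn(4, 3, 2)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(4, 5)
dx_num = eval_numerical_gradient_array(lambda v: affine_forward(v, w, b)[0], x, dout)
out, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)
# dx should match dx_num to roughly 1e-9 relative error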


def relu_forward(x):
    """
  Computes the forward pass for a layer of rectified linear units (ReLUs).

  Input:
  - x: Inputs, of any shape

  Returns a tuple of:
  - out: Output, of the same shape as x
  - cache: x
  """
    out = None
    #############################################################################
    # TODO: Implement the ReLU forward pass.                                    #
    #############################################################################
    out = np.copy(x)
    out[out <= 0] = 0
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    cache = x
    return out, cache


def relu_backward(dout, cache):
    """
  Computes the backward pass for a layer of rectified linear units (ReLUs).

  Input:
  - dout: Upstream derivatives, of any shape
  - cache: Input x, of same shape as dout

  Returns:
  - dx: Gradient with respect to x
  """
    dx, x = None, cache
    #############################################################################
    # TODO: Implement the ReLU backward pass.                                   #
    #############################################################################
    # use a mask instead of mutating the cached x; gradients pass only
    # where the input was positive
    dx = dout * (x > 0)
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    return dx


def batchnorm_forward(x, gamma, beta, bn_param):
    """
  Forward pass for batch normalization.

  During training the sample mean and (uncorrected) sample variance are
  computed from minibatch statistics and used to normalize the incoming data.
  During training we also keep an exponentially decaying running mean of the mean
  and variance of each feature, and these averages are used to normalize data
  at test-time.

  At each timestep we update the running averages for mean and variance using
  an exponential decay based on the momentum parameter:

  running_mean = momentum * running_mean + (1 - momentum) * sample_mean
  running_var = momentum * running_var + (1 - momentum) * sample_var

  Note that the batch normalization paper suggests a different test-time
  behavior: they compute sample mean and variance for each feature using a
  large number of training images rather than using a running average. For
  this implementation we have chosen to use running averages instead since
  they do not require an additional estimation step; the torch7 implementation
  of batch normalization also uses running averages.

  Input:
  - x: Data of shape (N, D)
  - gamma: Scale parameter of shape (D,)
  - beta: Shift parameter of shape (D,)
  - bn_param: Dictionary with the following keys:
    - mode: 'train' or 'test'; required
    - eps: Constant for numeric stability
    - momentum: Constant for running mean / variance.
    - running_mean: Array of shape (D,) giving running mean of features
  - running_var: Array of shape (D,) giving running variance of features

  Returns a tuple of:
  - out: of shape (N, D)
  - cache: A tuple of values needed in the backward pass
  """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    out, cache = None, None
    if mode == 'train':
        #############################################################################
        # TODO: Implement the training-time forward pass for batch normalization.   #
        # Use minibatch statistics to compute the mean and variance, use these      #
        # statistics to normalize the incoming data, and scale and shift the        #
        # normalized data using gamma and beta.                                     #
        #                                                                           #
        # You should store the output in the variable out. Any intermediates that   #
        # you need for the backward pass should be stored in the cache variable.    #
        #                                                                           #
        # You should also use your computed sample mean and variance together with  #
        # the momentum variable to update the running mean and running variance,    #
        # storing your result in the running_mean and running_var variables.        #
        #############################################################################
        u = np.mean(x, axis=0)
        numerator = x - u
        v = np.var(x, axis=0)
        v_plus_eps = v + eps
        denominator = np.sqrt(v_plus_eps)
        inv_denominator = 1. / denominator
        xhat = numerator * inv_denominator
        gamma_mul_xhat = gamma * xhat
        y = gamma_mul_xhat + beta
        out = y
        cache = x, u, numerator, v, v_plus_eps, denominator, inv_denominator, xhat, gamma_mul_xhat, y, gamma, beta

        running_mean = momentum * running_mean + (1 - momentum) * u
        running_var = momentum * running_var + (1 - momentum) * v
        bn_param['running_mean'] = running_mean
        bn_param['running_var'] = running_var
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
    elif mode == 'test':
        #############################################################################
        # TODO: Implement the test-time forward pass for batch normalization. Use   #
        # the running mean and variance to normalize the incoming data, then scale  #
        # and shift the normalized data using gamma and beta. Store the result in   #
        # the out variable.                                                         #
        #############################################################################
        u = bn_param['running_mean']
        numerator = x - u
        v = bn_param['running_var']
        v_plus_eps = v + eps
        denominator = np.sqrt(v_plus_eps)
        inv_denominator = 1. / denominator
        xhat = numerator * inv_denominator
        gamma_mul_xhat = gamma * xhat
        y = gamma_mul_xhat + beta
        out = y
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache
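
A train-mode sanity sketch: with gamma = 1 and beta = 0, each feature column of the output should have roughly zero mean and unit variance.

np.random.seed(0)
x = 50 + 10 * np.random.randn(200, 3)
out, _ = batchnorm_forward(x, np.ones(3), np.zeros(3), {'mode': 'train'})
print(out.mean(axis=0))  # ~ [0, 0, 0]
print(out.std(axis=0))   # ~ [1, 1, 1]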


def batchnorm_backward(dout, cache):
    """
  Backward pass for batch normalization.

  For this implementation, you should write out a computation graph for
  batch normalization on paper and propagate gradients backward through
  intermediate nodes.

  Inputs:
  - dout: Upstream derivatives, of shape (N, D)
  - cache: Variable of intermediates from batchnorm_forward.

  Returns a tuple of:
  - dx: Gradient with respect to inputs x, of shape (N, D)
  - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
  - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
  """
    dx, dgamma, dbeta = None, None, None
    x, u, numerator, v, v_plus_eps, denominator, inv_denominator, xhat, gamma_mul_xhat, y, gamma, beta = cache

    #############################################################################
    # TODO: Implement the backward pass for batch normalization. Store the      #
    # results in the dx, dgamma, and dbeta variables.                           #
    #############################################################################
    dx = np.zeros(dout.shape)
    N = dout.shape[0]
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(dout * xhat, axis=0)

    dxhat = dout * gamma

    dinv_den = np.sum(dxhat * numerator, axis=0)
    dsqrt_v_plus_eps = dinv_den * (-1) * (denominator ** (-2))
    dv_plus_eps = dsqrt_v_plus_eps * 0.5 * (v_plus_eps ** (-0.5))
    dv = 2. / N * (x - u) * dv_plus_eps
    dx += dv
    du1 = -np.sum(dv, axis=0)
    dux = np.full(dout.shape, 1. / N)
    du1_x = du1 * dux
    dx += du1_x

    dnumerator = dxhat * inv_denominator
    dx += dnumerator
    du2 = -np.sum(dnumerator, axis=0)
    du2_x = du2 * dux
    dx += du2_x

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return dx, dgamma, dbeta
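
For comparison, once the chain rule is simplified on paper the whole gradient collapses into a few lines; a sketch of that alternative form, matching this cache layout:

def batchnorm_backward_alt(dout, cache):
    x, u, numerator, v, v_plus_eps, denominator, inv_denominator, xhat, gamma_mul_xhat, y, gamma, beta = cache
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(dout * xhat, axis=0)
    dxhat = dout * gamma
    # all three terms share the same 1/sqrt(var + eps) factor
    dx = (dxhat - np.mean(dxhat, axis=0)
          - xhat * np.mean(dxhat * xhat, axis=0)) * inv_denominator
    return dx, dgamma, dbeta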


def dropout_forward(x, dropout_param):
    """
  Performs the forward pass for (inverted) dropout.

  Inputs:
  - x: Input data, of any shape
  - dropout_param: A dictionary with the following keys:
    - p: Dropout parameter. We drop each neuron output with probability p.
    - mode: 'test' or 'train'. If the mode is train, then perform dropout;
      if the mode is test, then just return the input.
    - seed: Seed for the random number generator. Passing seed makes this
      function deterministic, which is needed for gradient checking but not in
      real networks.

  Outputs:
  - out: Array of the same shape as x.
  - cache: A tuple (dropout_param, mask). In training mode, mask is the dropout
    mask that was used to multiply the input; in test mode, mask is None.
  """
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    mask = None
    out = None

    if mode == 'train':
        ###########################################################################
        # TODO: Implement the training phase forward pass for inverted dropout.   #
        # Store the dropout mask in the mask variable.                            #
        ###########################################################################
        # inverted dropout: zero each activation with probability p, then
        # scale the survivors by 1/(1-p) so the expected value is unchanged
        mask = (np.random.random(x.shape) >= p) / (1 - p)
        out = x * mask
        ###########################################################################
        #                            END OF YOUR CODE                             #
        ###########################################################################
    elif mode == 'test':
        ###########################################################################
        # TODO: Implement the test phase forward pass for inverted dropout.       #
        ###########################################################################
        # with inverted dropout, test time is simply the identity
        out = x
        ###########################################################################
        #                            END OF YOUR CODE                             #
        ###########################################################################

    cache = (dropout_param, mask)
    out = out.astype(x.dtype, copy=False)

    return out, cache
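
A quick check of the inverted scaling: train-mode activations keep roughly the same mean as the input, which is why test time can be the identity.

x = 10 + np.random.randn(500, 500)
out, _ = dropout_forward(x, {'mode': 'train', 'p': 0.3})
print(x.mean())    # ~ 10
print(out.mean())  # ~ 10 as well, thanks to the 1/(1-p) scaling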


def dropout_backward(dout, cache):
    """
  Perform the backward pass for (inverted) dropout.

  Inputs:
  - dout: Upstream derivatives, of any shape
  - cache: (dropout_param, mask) from dropout_forward.
  """
    dropout_param, mask = cache
    mode = dropout_param['mode']
    dx = None
    if mode == 'train':
        ###########################################################################
        # TODO: Implement the training phase backward pass for inverted dropout.  #
        ###########################################################################
        # gradients flow only through the kept (and scaled) units
        dx = dout * mask
        ###########################################################################
        #                            END OF YOUR CODE                             #
        ###########################################################################
    elif mode == 'test':
        dx = dout
    return dx


def conv_forward_naive(x, w, b, conv_param):
    """
  A naive implementation of the forward pass for a convolutional layer.

  The input consists of N data points, each with C channels, height H and width
  W. We convolve each input with F different filters, where each filter spans
  all C channels and has height HH and width WW.

  Input:
  - x: Input data of shape (N, C, H, W)
  - w: Filter weights of shape (F, C, HH, WW)
  - b: Biases, of shape (F,)
  - conv_param: A dictionary with the following keys:
    - 'stride': The number of pixels between adjacent receptive fields in the
      horizontal and vertical directions.
    - 'pad': The number of pixels that will be used to zero-pad the input.

  Returns a tuple of:
  - out: Output data, of shape (N, F, H', W') where H' and W' are given by
    H' = 1 + (H + 2 * pad - HH) / stride
    W' = 1 + (W + 2 * pad - WW) / stride
  - cache: (x, w, b, conv_param)
  """
    out = None
    #############################################################################
    # TODO: Implement the convolutional forward pass.                           #
    # Hint: you can use the function np.pad for padding.                        #
    #############################################################################
    pad = conv_param['pad']
    stride = conv_param['stride']
    N, C, H, W = x.shape
    F, C, HH, WW = w.shape
    x = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant', constant_values=0)
    WOUT = (W - WW + 2 * pad) / stride + 1
    HOUT = (H - HH + 2 * pad) / stride + 1
    t = np.zeros((N, WW * HH * C, WOUT * HOUT))
    for i in range(0, N):
        x_i = x[i, :, :, :]
        # for each image
        l = 0
        for j in range(0, HOUT):
            for k in range(0, WOUT):
                # stretch the receptive field into a column vector
                slide_filter = x_i[:, j * stride:j * stride + HH, k * stride:k * stride + WW]
                slide_filter_col = slide_filter.reshape((WW * HH * C))
                # there are WOUT * HOUT such column vectors
                t[i, :, l] = slide_filter_col
                l += 1
    w_reshape = w.reshape((F, C * HH * WW))
    out = np.zeros((N, F, WOUT * HOUT))
    for i in range(0, N):
        out[i] = w_reshape.dot(t[i]) + b.reshape((F, 1))
    out = out.reshape((N, F, HOUT, WOUT))
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    cache = (x, w, b, conv_param)
    return out, cache
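
A shape sketch using the formula above: a 4x4 input with a 3x3 filter, pad = 1 and stride = 2 gives H' = W' = 1 + (4 + 2 - 3)/2 = 2.

x = np.random.randn(1, 3, 4, 4)
w = np.random.randn(2, 3, 3, 3)
b = np.zeros(2)
out, _ = conv_forward_naive(x, w, b, {'stride': 2, 'pad': 1})
assert out.shape == (1, 2, 2, 2)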


def conv_backward_naive(dout, cache):
    """
  A naive implementation of the backward pass for a convolutional layer.

  Inputs:
  - dout: Upstream derivatives.
  - cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive

  Returns a tuple of:
  - dx: Gradient with respect to x
  - dw: Gradient with respect to w
  - db: Gradient with respect to b
  """
    dx, dw, db = None, None, None
    #############################################################################
    # TODO: Implement the convolutional backward pass.                          #
    #############################################################################
    # db: sum the upstream gradient over everything except the filter axis
    db = np.sum(dout, axis=(0, 2, 3))

    # dx
    # x is after padding
    x, w, b, conv_param = cache
    N = x.shape[0]
    C = w.shape[1]
    F = dout.shape[1]
    HH = w.shape[2]
    WW = w.shape[3]
    HOUT = dout.shape[2]
    WOUT = dout.shape[3]
    stride = conv_param['stride']
    # get t from x
    t = np.zeros((N, WW * HH * C, WOUT * HOUT))
    for i in range(0, N):
        x_i = x[i, :, :, :]
        # for each image
        l = 0
        for j in range(0, HOUT):
            for k in range(0, WOUT):
                # stretch the receptive field into a column vector
                slide_filter = x_i[:, j * stride:j * stride + HH, k * stride:k * stride + WW]
                slide_filter_col = slide_filter.reshape((WW * HH * C))
                # there are WOUT * HOUT such column vectors
                t[i, :, l] = slide_filter_col
                l += 1
    # dout,(N=4,F=2,HOUT=5,WOUT=5)
    # dw,(F=2, C=3, HH=3, WW=3)
    # t,(N=4,(WW * HH * C),(HOUT * WOUT))
    dout = dout.reshape(N, F, (HOUT * WOUT))
    t = np.transpose(t, axes=(0, 2, 1))
    dw = np.zeros(w.shape)
    for i in range(0, N):
        t_i = t[i, :, :]
        dout_i = dout[i, :, :]
        # dout_i:(F,(HOUT * WOUT)) after reshaping, t_i:((HOUT * WOUT),(WW * HH * C)) after transposing
        dw_i = dout_i.dot(t_i)
        dw_i = dw_i.reshape((F, C, HH, WW))
        dw += dw_i

    # dx
    # dx:(N,C,HOUT,WOUT),dout:(N,F,(HOUT * WOUT)),w:(F=2,(C*HH*WW))
    w = w.reshape((F, (C * HH * WW)))
    # dout:(N,(HOUT * WOUT),F)
    dout = np.transpose(dout, axes=(0, 2, 1))
    # dt:(N, (HOUT * WOUT), (C*HH*WW))
    dt = dout.dot(w)
    # dt:(N, (C*HH*WW),(HOUT * WOUT))
    dt = np.transpose(dt, axes=(0, 2, 1))
    dx = np.zeros(x.shape)
    # recover dx from dt
    for i in range(0, N):
        x_i = dx[i, :, :, :]
        l = 0
        for j in range(0, HOUT):
            for k in range(0, WOUT):
                slide_filter_col = dt[i, :, l]
                slide_filter = slide_filter_col.reshape((C, HH, WW))
                # += not =, overlapping is needed
                x_i[:, j * stride:j * stride + HH, k * stride:k * stride + WW] += slide_filter
                l += 1
    # delete padding (pad rows/columns on each side, not a hard-coded 1)
    pad = conv_param['pad']
    if pad > 0:
        dx = dx[:, :, pad:-pad, pad:-pad]

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    return dx, dw, db


def max_pool_forward_naive(x, pool_param):
    """
  A naive implementation of the forward pass for a max pooling layer.

  Inputs:
  - x: Input data, of shape (N, C, H, W)
  - pool_param: dictionary with the following keys:
    - 'pool_height': The height of each pooling region
    - 'pool_width': The width of each pooling region
    - 'stride': The distance between adjacent pooling regions

  Returns a tuple of:
  - out: Output data
  - cache: (x, pool_param)
  """
    out = None
    #############################################################################
    # TODO: Implement the max pooling forward pass                    #
    #############################################################################
    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    stride = pool_param['stride']
    N, C, H, W = x.shape

    HOUT = (H - pool_height) / stride + 1
    WOUT = (W - pool_width) / stride + 1
    out = np.zeros((N, C, HOUT, WOUT))
    for i in range(0, N):
        x_i = x[i, :, :, :]
        for j in range(0, HOUT):
            for k in range(0, WOUT):
                slide_filter = x_i[:, j * stride:j * stride + pool_height, k * stride:k * stride + pool_width]
                max_entry = np.max(slide_filter, axis=(1, 2))
                out[i, :, j, k] = max_entry
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    cache = (x, pool_param)
    return out, cache
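
A small worked example: 2x2 pooling with stride 2 halves each spatial dimension and keeps each window's maximum.

x = np.arange(16, dtype=np.float64).reshape(1, 1, 4, 4)
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
out, _ = max_pool_forward_naive(x, pool_param)
print(out[0, 0])  # [[ 5. 7.] [ 13. 15.]]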


def max_pool_backward_naive(dout, cache):
    """
  A naive implementation of the backward pass for a max pooling layer.

  Inputs:
  - dout: Upstream derivatives
  - cache: A tuple of (x, pool_param) as in the forward pass.

  Returns:
  - dx: Gradient with respect to x
  """
    dx = None
    #############################################################################
    # TODO: Implement the max pooling backward pass                             #
    #############################################################################
    x, pool_param = cache
    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    stride = pool_param['stride']
    N, C, H, W = x.shape
    HOUT = (H - pool_height) / stride + 1
    WOUT = (W - pool_width) / stride + 1
    dx = np.zeros(x.shape)
    for i in range(0, N):
        x_i = x[i, :, :, :]
        for j in range(0, HOUT):
            for k in range(0, WOUT):
                slide_filter = x_i[:, j * stride:j * stride + pool_height, k * stride:k * stride + pool_width]
                # now 2 dimensional
                slide_filter_streched = slide_filter.reshape((C, pool_height * pool_width))
                indexes = np.argmax(slide_filter_streched, axis=1)
                # compute max entry's relative h,w in a filter
                for t in range(0, len(indexes)):
                    index = indexes[t]
                    h = index / pool_width
                    w = index % pool_width
                    # recover the absolute position in dx
                    # the gradient is 1, so the final value is just upstream derivates by chain rule
                    dx[i][t][j * stride + h][k * stride + w] = dout[i][t][j][k]
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    return dx


def spatial_batchnorm_forward(x, gamma, beta, bn_param):
    """
  Computes the forward pass for spatial batch normalization.

  Inputs:
  - x: Input data of shape (N, C, H, W)
  - gamma: Scale parameter, of shape (C,)
  - beta: Shift parameter, of shape (C,)
  - bn_param: Dictionary with the following keys:
    - mode: 'train' or 'test'; required
    - eps: Constant for numeric stability
    - momentum: Constant for running mean / variance. momentum=0 means that
      old information is discarded completely at every time step, while
      momentum=1 means that new information is never incorporated. The
      default of momentum=0.9 should work well in most situations.
    - running_mean: Array of shape (C,) giving running mean of features
    - running_var: Array of shape (C,) giving running variance of features

  Returns a tuple of:
  - out: Output data, of shape (N, C, H, W)
  - cache: Values needed for the backward pass
  """
    out, cache = None, None

    #############################################################################
    # TODO: Implement the forward pass for spatial batch normalization.         #
    #                                                                           #
    # HINT: You can implement spatial batch normalization using the vanilla     #
    # version of batch normalization defined above. Your implementation should  #
    # be very short; ours is less than five lines.                              #
    #############################################################################
    N, C, H, W = x.shape
    # move channels to the last axis and flatten, so that each channel becomes
    # one feature of the vanilla batchnorm; statistics are then taken over all
    # N*H*W values of a channel and a single running mean/var of shape (C,)
    # is kept in bn_param
    x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
    out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
    # restore the (N, C, H, W) layout
    out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return out, cache
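
With this reshape-based version, each channel is normalized over all N*H*W of its values; a quick train-mode check:

np.random.seed(0)
x = 4 * np.random.randn(10, 3, 5, 5) + 10
out, _ = spatial_batchnorm_forward(x, np.ones(3), np.zeros(3), {'mode': 'train'})
print(out.mean(axis=(0, 2, 3)))  # ~ [0, 0, 0]
print(out.std(axis=(0, 2, 3)))   # ~ [1, 1, 1]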


def spatial_batchnorm_backward(dout, cache):
    """
  Computes the backward pass for spatial batch normalization.

  Inputs:
  - dout: Upstream derivatives, of shape (N, C, H, W)
  - cache: Values from the forward pass

  Returns a tuple of:
  - dx: Gradient with respect to inputs, of shape (N, C, H, W)
  - dgamma: Gradient with respect to scale parameter, of shape (C,)
  - dbeta: Gradient with respect to shift parameter, of shape (C,)
  """
    dx, dgamma, dbeta = None, None, None

    #############################################################################
    # TODO: Implement the backward pass for spatial batch normalization.        #
    #                                                                           #
    # HINT: You can implement spatial batch normalization using the vanilla     #
    # version of batch normalization defined above. Your implementation should  #
    # be very short; ours is less than five lines.                              #
    #############################################################################
    N, C, H, W = dout.shape
    # reuse the vanilla batchnorm backward pass on the flattened layout
    dout_flat = dout.transpose(0, 2, 3, 1).reshape(-1, C)
    dx_flat, dgamma, dbeta = batchnorm_backward(dout_flat, cache)
    dx = dx_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################

    return dx, dgamma, dbeta


def svm_loss(x, y):
    """
  Computes the loss and gradient for multiclass SVM classification.

  Inputs:
  - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
    for the ith input.
  - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
    0 <= y[i] < C

  Returns a tuple of:
  - loss: Scalar giving the loss
  - dx: Gradient of the loss with respect to x
  """
    N = x.shape[0]
    correct_class_scores = x[np.arange(N), y]
    margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
    margins[np.arange(N), y] = 0
    loss = np.sum(margins) / N
    num_pos = np.sum(margins > 0, axis=1)
    dx = np.zeros_like(x)
    dx[margins > 0] = 1
    dx[np.arange(N), y] -= num_pos
    dx /= N
    return loss, dx
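
Sanity sketch: for near-zero random scores every wrong class contributes a margin of about 1, so the loss should come out near C - 1 (about 9 for C = 10).

x = 0.001 * np.random.randn(100, 10)
y = np.random.randint(10, size=100)
loss, dx = svm_loss(x, y)
print(loss)  # ~ 9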


def softmax_loss(x, y):
    """
  Computes the loss and gradient for softmax classification.

  Inputs:
  - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
    for the ith input.
  - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
    0 <= y[i] < C

  Returns a tuple of:
  - loss: Scalar giving the loss
  - dx: Gradient of the loss with respect to x
  """
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx
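
Similarly for softmax: near-uniform scores give a loss close to log(C), about 2.3 for C = 10.

x = 0.001 * np.random.randn(100, 10)
y = np.random.randint(10, size=100)
loss, dx = softmax_loss(x, y)
print(loss)  # ~ 2.3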

fc_net.py

import numpy as np

from cs231n.layers import *
from cs231n.layer_utils import *


class TwoLayerNet(object):
  """
  A two-layer fully-connected neural network with ReLU nonlinearity and
  softmax loss that uses a modular layer design. We assume an input dimension
  of D, a hidden dimension of H, and perform classification over C classes.
  
  The architecture should be affine - relu - affine - softmax.

  Note that this class does not implement gradient descent; instead, it
  will interact with a separate Solver object that is responsible for running
  optimization.

  The learnable parameters of the model are stored in the dictionary
  self.params that maps parameter names to numpy arrays.
  """
  
  def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
               weight_scale=1e-3, reg=0.0):
    """
    Initialize a new network.

    Inputs:
    - input_dim: An integer giving the size of the input
    - hidden_dim: An integer giving the size of the hidden layer
    - num_classes: An integer giving the number of classes to classify
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - reg: Scalar giving L2 regularization strength.
    """
    self.params = {}
    self.reg = reg
    
    ############################################################################
    # TODO: Initialize the weights and biases of the two-layer net. Weights    #
    # should be initialized from a Gaussian with standard deviation equal to   #
    # weight_scale, and biases should be initialized to zero. All weights and  #
    # biases should be stored in the dictionary self.params, with first layer  #
    # weights and biases using the keys 'W1' and 'b1' and second layer weights #
    # and biases using the keys 'W2' and 'b2'.                                 #
    ############################################################################
    W1 = np.random.randn(input_dim,hidden_dim) * weight_scale
    b1 = np.zeros(hidden_dim)
    W2 = np.random.randn(hidden_dim, num_classes) * weight_scale
    b2 = np.zeros(num_classes)
    self.params['W1'] = W1 
    self.params['b1'] =  b1
    self.params['W2'] = W2
    self.params['b2'] = b2
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################


  def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    
    W1 = self.params['W1'] 
    b1 = self.params['b1']
    W2 = self.params['W2']
    b2 =self.params['b2']
    
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the    #
    # class scores for X and storing them in the scores variable.              #
    ############################################################################
    dout1, cache_1 = affine_relu_forward(X, W1, b1)
    dout2, cache_2 = affine_forward(dout1, W2, b2)
    scores = dout2
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
      return scores
    
    loss, grads = 0, {}
    ############################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss  #
    # in the loss variable and gradients in the grads dictionary. Compute data #
    # loss using softmax, and make sure that grads[k] holds the gradients for  #
    # self.params[k]. Don't forget to add L2 regularization!                   #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################

    loss, dx = softmax_loss(dout2, y)
    loss += 0.5 * self.reg * (np.sum(W1*W1) + np.sum(W2*W2))
    
    dx2, dw2, db2 = affine_backward(dx, cache_2)
    dx1, dw1, db1 = affine_relu_backward(dx2, cache_1)
    grads['W1'] = dw1 + self.reg * W1
    grads['b1'] = db1
    grads['W2'] = dw2 + self.reg * W2
    grads['b2'] = db2
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads


class FullyConnectedNet(object):
  """
  A fully-connected neural network with an arbitrary number of hidden layers,
  ReLU nonlinearities, and a softmax loss function. This will also implement
  dropout and batch normalization as options. For a network with L layers,
  the architecture will be
  
  {affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax
  
  where batch normalization and dropout are optional, and the {...} block is
  repeated L - 1 times.
  
  Similar to the TwoLayerNet above, learnable parameters are stored in the
  self.params dictionary and will be learned using the Solver class.
  """

  def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
               dropout=0, use_batchnorm=False, reg=0.0,
               weight_scale=1e-2, dtype=np.float32, seed=None):
    """
    Initialize a new FullyConnectedNet.
    
    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
      the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - dtype: A numpy datatype object; all computations will be performed using
      this datatype. float32 is faster but less accurate, so you should use
      float64 for numeric gradient checking.
    - seed: If not None, then pass this random seed to the dropout layers. This
      will make the dropout layers deterministic so we can gradient check the
      model.
    """
    self.use_batchnorm = use_batchnorm
    self.use_dropout = dropout > 0
    self.reg = reg
    self.num_layers = 1 + len(hidden_dims)
    self.dtype = dtype
    self.params = {}

    ############################################################################
    # TODO: Initialize the parameters of the network, storing all values in    #
    # the self.params dictionary. Store weights and biases for the first layer #
    # in W1 and b1; for the second layer use W2 and b2, etc. Weights should be #
    # initialized from a normal distribution with standard deviation equal to  #
    # weight_scale and biases should be initialized to zero.                   #
    #                                                                          #
    # When using batch normalization, store scale and shift parameters for the #
    # first layer in gamma1 and beta1; for the second layer use gamma2 and     #
    # beta2, etc. Scale parameters should be initialized to one and shift      #
    # parameters should be initialized to zero.                                #
    ############################################################################
    fan_in, fan_out = input_dim, hidden_dims[0]
    for i in range(0, len(hidden_dims)):
        fan_out = hidden_dims[i]
        self.params['W'+str(i+1)] = np.random.randn(fan_in, fan_out) * weight_scale
        self.params['b'+str(i+1)] = np.zeros(fan_out)
        fan_in = fan_out
        if self.use_batchnorm:
            self.params["gamma"+str(i+1)] = np.ones(fan_out)
            self.params["beta"+str(i+1)] = np.zeros(fan_out)
    self.params['W'+ str(self.num_layers)] = np.random.randn(hidden_dims[-1], num_classes) * weight_scale
    self.params['b'+ str(self.num_layers)] = np.zeros(num_classes)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # When using dropout we need to pass a dropout_param dictionary to each
    # dropout layer so that the layer knows the dropout probability and the mode
    # (train / test). You can pass the same dropout_param to each dropout layer.
    self.dropout_param = {}
    if self.use_dropout:
      self.dropout_param = {'mode': 'train', 'p': dropout}
      if seed is not None:
        self.dropout_param['seed'] = seed
    
    # With batch normalization we need to keep track of running means and
    # variances, so we need to pass a special bn_param object to each batch
    # normalization layer. You should pass self.bn_params[0] to the forward pass
    # of the first batch normalization layer, self.bn_params[1] to the forward
    # pass of the second batch normalization layer, etc.
    self.bn_params = []
    if self.use_batchnorm:
      self.bn_params = [{'mode': 'train'} for i in xrange(self.num_layers - 1)]
    
    # Cast all parameters to the correct datatype
    for k, v in self.params.iteritems():
      self.params[k] = v.astype(dtype)

  def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.dropout_param is not None:
      self.dropout_param['mode'] = mode   
    if self.use_batchnorm:
      for bn_param in self.bn_params:
        bn_param['mode'] = mode

    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    #                                                                          #
    # When using batch normalization, you'll need to pass self.bn_params[0] to #
    # the forward pass for the first batch normalization layer, pass           #
    # self.bn_params[1] to the forward pass for the second batch normalization #
    # layer, etc.                                                              #
    ############################################################################
    if self.use_batchnorm == True:
        len_hidden_dims = self.num_layers - 1
        caches = []
        out = X
        for i in range(0, len_hidden_dims):
            w = self.params["W"+str(i+1)]
            b = self.params["b"+str(i+1)]
            # affine
            out, cache1 = affine_forward(out, w, b)
            # batch normalization, need not worry mode as it is in bn_param[i]
            gamma = self.params["gamma"+str(i+1)]
            beta = self.params["beta"+str(i+1)]
            out, cache2 = batchnorm_forward(out, gamma, beta, self.bn_params[i])
            # relu
            out, cache3 = relu_forward(out)
            if self.use_dropout == True:
                # dropout
                out, cache4 = dropout_forward(out, self.dropout_param)
            # merge cache:
            # cache[0] affine_forward
            # cache[1] batch normalization
            # cache[2] relu
            cache = []
            cache.append(cache1)
            cache.append(cache2)
            cache.append(cache3)
            if self.use_dropout == True:
                # dropout
                cache.append(cache4)
            caches.append(cache)

        w_last_layer = self.params['W'+ str(self.num_layers)]
        b_last_layer = self.params['b'+ str(self.num_layers)]
        out,cache = affine_forward(out, w_last_layer, b_last_layer)
        scores = out
        caches.append(cache)
    if self.use_batchnorm == False:
        len_hidden_dims = self.num_layers - 1
        caches = []
        out = X
        for i in range(0, len_hidden_dims):
            w = self.params["W"+str(i+1)]
            b = self.params["b"+str(i+1)]
            cache = []
            # affine_relu
            out, cache1 = affine_relu_forward(out, w, b)
            if self.use_dropout == True:
                # dropout
                out, cache2 = dropout_forward(out, self.dropout_param)
            # merge cache
            cache.append(cache1)
            if self.use_dropout == True:
                cache.append(cache2)
            caches.append(cache)

        w_last_layer = self.params['W'+ str(self.num_layers)]
        b_last_layer = self.params['b'+ str(self.num_layers)]
        out,cache = affine_forward(out, w_last_layer, b_last_layer)
        scores = out
        caches.append(cache)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
      return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    #                                                                          #
    # When using batch normalization, you don't need to regularize the scale   #
    # and shift parameters.                                                    #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    #print "out.shape",out.shape,"y.shape",y.shape
    if self.use_batchnorm == True:
        loss, grad = softmax_loss(out, y)
        # loss regularization
        for i in range(1, self.num_layers+1):
            loss += 0.5 * self.reg * np.sum(self.params["W"+str(i)]*self.params["W"+str(i)]) 
        dout = grad
        caches.reverse()
        dout, dw, db = affine_backward(dout, caches[0])
        # weight regularization
        grads['W'+ str(self.num_layers)] = dw + self.reg * self.params['W'+ str(self.num_layers)]
        grads['b'+ str(self.num_layers)] = db
        for i in range(1, len(caches)):
            if self.use_dropout == True:
                dout = dropout_backward(dout, caches[i][3])
            dout = relu_backward(dout, caches[i][2])
            dout, dgamma, dbeta = batchnorm_backward(dout, caches[i][1])
            dout, dw, db = affine_backward(dout, caches[i][0])
            grads['W'+ str(self.num_layers-i)] = dw + self.reg * self.params['W'+ str(self.num_layers-i)]
            grads['b'+ str(self.num_layers-i)] = db
            grads['gamma'+ str(self.num_layers-i)] = dgamma
            grads['beta'+ str(self.num_layers-i)] = dbeta
    if self.use_batchnorm == False:  
        loss, grad = softmax_loss(out, y)
        # loss regularization
        for i in range(1, self.num_layers+1):
            loss += 0.5 * self.reg * np.sum(self.params["W"+str(i)]*self.params["W"+str(i)]) 
        dx = grad
        caches.reverse()
        dx, dw, db = affine_backward(dx, caches[0])
        # weight regularization
        grads['W'+ str(self.num_layers)] = dw + self.reg * self.params['W'+ str(self.num_layers)]
        grads['b'+ str(self.num_layers)] = db
        for i in range(1, len(caches)):
            if self.use_dropout == True:
                dx = dropout_backward(dx, caches[i][1])
            dx, dw, db = affine_relu_backward(dx, caches[i][0])
            grads['W'+ str(self.num_layers-i)] = dw + self.reg * self.params['W'+ str(self.num_layers-i)]
            grads['b'+ str(self.num_layers-i)] = db
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
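
A minimal usage sketch (illustrative sizes; float64 makes numeric gradient checking easier):

np.random.seed(0)
model = FullyConnectedNet([100, 50], input_dim=20, num_classes=7,
                          use_batchnorm=True, reg=0.1, dtype=np.float64)
X = np.random.randn(5, 20)
y = np.random.randint(7, size=5)
loss, grads = model.loss(X, y)
print(loss)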

solver.py

import numpy as np

from cs231n import optim


class Solver(object):
  """
  A Solver encapsulates all the logic necessary for training classification
  models. The Solver performs stochastic gradient descent using different
  update rules defined in optim.py.

  The solver accepts both training and validation data and labels so it can
  periodically check classification accuracy on both training and validation
  data to watch out for overfitting.

  To train a model, you will first construct a Solver instance, passing the
  model, dataset, and various options (learning rate, batch size, etc) to the
  constructor. You will then call the train() method to run the optimization
  procedure and train the model.
  
  After the train() method returns, model.params will contain the parameters
  that performed best on the validation set over the course of training.
  In addition, the instance variable solver.loss_history will contain a list
  of all losses encountered during training and the instance variables
  solver.train_acc_history and solver.val_acc_history will be lists containing
  the accuracies of the model on the training and validation set at each epoch.
  
  Example usage might look something like this:
  
  data = {
    'X_train': # training data
    'y_train': # training labels
    'X_val': # validation data
    'y_val': # validation labels
  }
  model = MyAwesomeModel(hidden_size=100, reg=10)
  solver = Solver(model, data,
                  update_rule='sgd',
                  optim_config={
                    'learning_rate': 1e-3,
                  },
                  lr_decay=0.95,
                  num_epochs=10, batch_size=100,
                  print_every=100)
  solver.train()


  A Solver works on a model object that must conform to the following API:

  - model.params must be a dictionary mapping string parameter names to numpy
    arrays containing parameter values.

  - model.loss(X, y) must be a function that computes training-time loss and
    gradients, and test-time classification scores, with the following inputs
    and outputs:

    Inputs:
    - X: Array giving a minibatch of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,) giving labels for X where y[i] is the
      label for X[i].

    Returns:
    If y is None, run a test-time forward pass and return:
    - scores: Array of shape (N, C) giving classification scores for X where
      scores[i, c] gives the score of class c for X[i].

    If y is not None, run a training time forward and backward pass and return
    a tuple of:
    - loss: Scalar giving the loss
    - grads: Dictionary with the same keys as self.params mapping parameter
      names to gradients of the loss with respect to those parameters.
  """

  def __init__(self, model, data, **kwargs):
    """
    Construct a new Solver instance.
    
    Required arguments:
    - model: A model object conforming to the API described above
    - data: A dictionary of training and validation data with the following:
      'X_train': Array of shape (N_train, d_1, ..., d_k) giving training images
      'X_val': Array of shape (N_val, d_1, ..., d_k) giving validation images
      'y_train': Array of shape (N_train,) giving labels for training images
      'y_val': Array of shape (N_val,) giving labels for validation images
      
    Optional arguments:
    - update_rule: A string giving the name of an update rule in optim.py.
      Default is 'sgd'.
    - optim_config: A dictionary containing hyperparameters that will be
      passed to the chosen update rule. Each update rule requires different
      hyperparameters (see optim.py) but all update rules require a
      'learning_rate' parameter so that should always be present.
    - lr_decay: A scalar for learning rate decay; after each epoch the learning
      rate is multiplied by this value.
    - batch_size: Size of minibatches used to compute loss and gradient during
      training.
    - num_epochs: The number of epochs to run for during training.
    - print_every: Integer; training losses will be printed every print_every
      iterations.
    - verbose: Boolean; if set to false then no output will be printed during
      training.
    """
    self.model = model
    self.X_train = data['X_train']
    self.y_train = data['y_train']
    self.X_val = data['X_val']
    self.y_val = data['y_val']
    
    # Unpack keyword arguments
    self.update_rule = kwargs.pop('update_rule', 'sgd')
    self.optim_config = kwargs.pop('optim_config', {})
    self.lr_decay = kwargs.pop('lr_decay', 1.0)
    self.batch_size = kwargs.pop('batch_size', 100)
    self.num_epochs = kwargs.pop('num_epochs', 10)

    self.print_every = kwargs.pop('print_every', 10)
    self.verbose = kwargs.pop('verbose', True)

    # Throw an error if there are extra keyword arguments
    if len(kwargs) > 0:
      extra = ', '.join('"%s"' % k for k in kwargs.keys())
      raise ValueError('Unrecognized arguments %s' % extra)

    # Make sure the update rule exists, then replace the string
    # name with the actual function
    if not hasattr(optim, self.update_rule):
      raise ValueError('Invalid update_rule "%s"' % self.update_rule)
    self.update_rule = getattr(optim, self.update_rule)

    self._reset()


  def _reset(self):
    """
    Set up some book-keeping variables for optimization. Don't call this
    manually.
    """
    # Set up some variables for book-keeping
    self.epoch = 0
    self.best_val_acc = 0
    self.best_params = {}
    self.loss_history = []
    self.train_acc_history = []
    self.val_acc_history = []

    # Make a deep copy of the optim_config for each parameter
    self.optim_configs = {}
    for p in self.model.params:
      d = {k: v for k, v in self.optim_config.iteritems()}
      self.optim_configs[p] = d


  def _step(self):
    """
    Make a single gradient update. This is called by train() and should not
    be called manually.
    """
    # Make a minibatch of training data
    num_train = self.X_train.shape[0]
    batch_mask = np.random.choice(num_train, self.batch_size)
    X_batch = self.X_train[batch_mask]
    y_batch = self.y_train[batch_mask]

    # Compute loss and gradient
    loss, grads = self.model.loss(X_batch, y_batch)
    self.loss_history.append(loss)

    # Perform a parameter update
    for p, w in self.model.params.iteritems():
      dw = grads[p]
      config = self.optim_configs[p]
      next_w, next_config = self.update_rule(w, dw, config)
      self.model.params[p] = next_w
      self.optim_configs[p] = next_config


  def check_accuracy(self, X, y, num_samples=None, batch_size=100):
    """
    Check accuracy of the model on the provided data.
    
    Inputs:
    - X: Array of data, of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,)
    - num_samples: If not None, subsample the data and only test the model
      on num_samples datapoints.
    - batch_size: Split X and y into batches of this size to avoid using too
      much memory.
      
    Returns:
    - acc: Scalar giving the fraction of instances that were correctly
      classified by the model.
    """
    
    # Maybe subsample the data
    N = X.shape[0]
    if num_samples is not None and N > num_samples:
      mask = np.random.choice(N, num_samples)
      N = num_samples
      X = X[mask]
      y = y[mask]

    # Compute predictions in batches
    num_batches = N / batch_size
    if N % batch_size != 0:
      num_batches += 1
    y_pred = []
    for i in xrange(num_batches):
      start = i * batch_size
      end = (i + 1) * batch_size
      scores = self.model.loss(X[start:end])
      y_pred.append(np.argmax(scores, axis=1))
    y_pred = np.hstack(y_pred)
    acc = np.mean(y_pred == y)

    return acc


  def train(self):
    """
    Run optimization to train the model.
    """
    num_train = self.X_train.shape[0]
    iterations_per_epoch = max(num_train / self.batch_size, 1)
    num_iterations = self.num_epochs * iterations_per_epoch

    for t in xrange(num_iterations):
      self._step()

      # Maybe print training loss
      if self.verbose and t % self.print_every == 0:
        print '(Iteration %d / %d) loss: %f' % (
               t + 1, num_iterations, self.loss_history[-1])

      # At the end of every epoch, increment the epoch counter and decay the
      # learning rate.
      epoch_end = (t + 1) % iterations_per_epoch == 0
      if epoch_end:
        self.epoch += 1
        for k in self.optim_configs:
          self.optim_configs[k]['learning_rate'] *= self.lr_decay

      # Check train and val accuracy on the first iteration, the last
      # iteration, and at the end of each epoch.
      first_it = (t == 0)
      last_it = (t == num_iterations - 1)
      if first_it or last_it or epoch_end:
        train_acc = self.check_accuracy(self.X_train, self.y_train,
                                        num_samples=1000)
        val_acc = self.check_accuracy(self.X_val, self.y_val)
        self.train_acc_history.append(train_acc)
        self.val_acc_history.append(val_acc)

        if self.verbose:
          print '(Epoch %d / %d) train acc: %f; val_acc: %f' % (
                 self.epoch, self.num_epochs, train_acc, val_acc)

        # Keep track of the best model
        if val_acc > self.best_val_acc:
          self.best_val_acc = val_acc
          self.best_params = {}
          for k, v in self.model.params.iteritems():
            self.best_params[k] = v.copy()

    # At the end of training swap the best params into the model
    self.model.params = self.best_params

optim.py

import numpy as np

"""
This file implements various first-order update rules that are commonly used for
training neural networks. Each update rule accepts current weights and the
gradient of the loss with respect to those weights and produces the next set of
weights. Each update rule has the same interface:

def update(w, dw, config=None):

Inputs:
  - w: A numpy array giving the current weights.
  - dw: A numpy array of the same shape as w giving the gradient of the
    loss with respect to w.
  - config: A dictionary containing hyperparameter values such as learning rate,
    momentum, etc. If the update rule requires caching values over many
    iterations, then config will also hold these cached values.

Returns:
  - next_w: The next point after the update.
  - config: The config dictionary to be passed to the next iteration of the
    update rule.

NOTE: For most update rules, the default learning rate will probably not perform
well; however the default values of the other hyperparameters should work well
for a variety of different problems.

For efficiency, update rules may perform in-place updates, mutating w and
setting next_w equal to w.
"""


def sgd(w, dw, config=None):
  """
  Performs vanilla stochastic gradient descent.

  config format:
  - learning_rate: Scalar learning rate.
  """
  if config is None: config = {}
  config.setdefault('learning_rate', 1e-2)

  w -= config['learning_rate'] * dw
  return w, config


def sgd_momentum(w, dw, config=None):
  """
  Performs stochastic gradient descent with momentum.

  config format:
  - learning_rate: Scalar learning rate.
  - momentum: Scalar between 0 and 1 giving the momentum value.
    Setting momentum = 0 reduces to sgd.
  - velocity: A numpy array of the same shape as w and dw used to store a moving
    average of the gradients.
  """
  if config is None: config = {}
  config.setdefault('learning_rate', 1e-2)
  config.setdefault('momentum', 0.9)
  v = config.get('velocity', np.zeros_like(w))
  
  learning_rate = config['learning_rate']
  m = config['momentum']
    
  next_w = w
  #############################################################################
  # TODO: Implement the momentum update formula. Store the updated value in   #
  # the next_w variable. You should also use and update the velocity v.       #
  #############################################################################
  v  = m * v - learning_rate * dw
  next_w += v   
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  config['velocity'] = v

  return next_w, config
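
A quick numeric sketch of what the velocity buys: with a constant gradient, the per-step velocity converges to -learning_rate / (1 - momentum), i.e. up to 10x the plain sgd step at momentum = 0.9 (uses the sgd_momentum defined above):

import numpy as np

w = np.zeros(1)
dw = np.ones(1)                          # pretend the gradient is constant
config = {'learning_rate': 0.1, 'momentum': 0.9}
for t in range(50):
    w, config = sgd_momentum(w, dw, config)
print('velocity after 50 steps: %.3f' % config['velocity'][0])
# roughly -1.000 = -learning_rate / (1 - momentum)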



def rmsprop(x, dx, config=None):
  """
  Uses the RMSProp update rule, which uses a moving average of squared gradient
  values to set adaptive per-parameter learning rates.

  config format:
  - learning_rate: Scalar learning rate.
  - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
    gradient cache.
  - epsilon: Small scalar used for smoothing to avoid dividing by zero.
  - cache: Moving average of second moments of gradients.
  """
  if config is None: config = {}
  config.setdefault('learning_rate', 1e-2)
  config.setdefault('decay_rate', 0.99)
  config.setdefault('epsilon', 1e-8)
  config.setdefault('cache', np.zeros_like(x))

  learning_rate = config['learning_rate']
  decay_rate = config['decay_rate']
  epsilon = config['epsilon']
  cache = config['cache']
  next_x = None
  #############################################################################
  # TODO: Implement the RMSprop update formula, storing the next value of x   #
  # in the next_x variable. Don't forget to update cache value stored in      #  
  # config['cache'].                                                          #
  #############################################################################
  cache = decay_rate * cache + (1 - decay_rate) * dx**2
  next_x = x - learning_rate * dx / (np.sqrt(cache) + epsilon)
  config['cache'] = cache   
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################

  return next_x, config
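
The point of the cache is scale invariance: because the step is learning_rate * dx / sqrt(cache) and cache tracks dx**2, a huge constant gradient and a tiny one produce (almost) the same trajectory. A small sketch using the rmsprop above:

import numpy as np

for g in (100.0, 0.01):
    x = np.zeros(1)
    config = None
    for t in range(500):
        x, config = rmsprop(x, np.array([g]), config)
    print('constant grad %g -> x after 500 steps: %.3f' % (g, x[0]))
# both runs print (almost) the same value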


def adam(x, dx, config=None):
  """
  Uses the Adam update rule, which incorporates moving averages of both the
  gradient and its square and a bias correction term.

  config format:
  - learning_rate: Scalar learning rate.
  - beta1: Decay rate for moving average of first moment of gradient.
  - beta2: Decay rate for moving average of second moment of gradient.
  - epsilon: Small scalar used for smoothing to avoid dividing by zero.
  - m: Moving average of gradient.
  - v: Moving average of squared gradient.
  - t: Iteration number.
  """
  if config is None: config = {}
  config.setdefault('learning_rate', 1e-3)
  config.setdefault('beta1', 0.9)
  config.setdefault('beta2', 0.999)
  config.setdefault('epsilon', 1e-8)
  config.setdefault('m', np.zeros_like(x))
  config.setdefault('v', np.zeros_like(x))
  config.setdefault('t', 0)
  
  learning_rate = config['learning_rate']
  beta1 = config['beta1']
  beta2 = config['beta2']
  epsilon = config['epsilon']
  m = config['m']
  v = config['v']
  t = config['t']   
    
  next_x = None
  #############################################################################
  # TODO: Implement the Adam update formula, storing the next value of x in   #
  # the next_x variable. Don't forget to update the m, v, and t variables     #
  # stored in config.                                                         #
  #############################################################################
  t = t + 1 
  m = beta1*m + (1-beta1)*dx
  v = beta2*v + (1-beta2)*(dx**2)
  m1 = m / (1 - beta1**t)
  v1 = v / (1 - beta2**t)
  next_x = x - learning_rate * m1 / (np.sqrt(v1) + epsilon)   
  config['m'] = m
  config['v'] = v
  config['t'] = t                                 
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  
  return next_x, config
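
The bias correction matters most at the very first steps, while m and v are still near their zero initialization. After correction, m1 ~= dx and v1 ~= dx**2 at t = 1, so the first step has magnitude ~learning_rate regardless of the gradient's scale. A small sketch using the adam above:

import numpy as np

for g in (3.0, 0.003):
    x = np.zeros(1)
    x, config = adam(x, np.array([g]), None)
    print('grad %g -> first adam step: %.6f' % (g, x[0]))
# both print roughly -0.001 = -learning_rate * sign(dx)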

Multilayer network

3-layer net: weight_scale = 1e-2, learning_rate = 1e-2
5-layer net: weight_scale = 5e-2, learning_rate = 1e-2

Train a good model: adam only, learning_rate = 6e-4
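
A minimal sketch of the kind of sweep that produced these settings, assuming the fully-connected net defined earlier is exposed as FullyConnectedNet (hypothetical name) and small_data is a small subsample of the data dict:

for ws in (1e-3, 1e-2, 5e-2):
    for lr in (1e-3, 1e-2):
        model = FullyConnectedNet([100, 100, 100, 100], weight_scale=ws)
        solver = Solver(model, small_data,
                        update_rule='sgd',
                        optim_config={'learning_rate': lr},
                        num_epochs=10, batch_size=25,
                        verbose=False)
        solver.train()
        print('ws %g lr %g -> final train acc %.2f' % (
            ws, lr, solver.train_acc_history[-1]))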

Conv Net

import numpy as np

from cs231n.layers import *
from cs231n.fast_layers import *
from cs231n.layer_utils import *

# - [conv-relu-pool]XN - [affine]XM - [softmax or SVM]
class ThreeLayerConvNetWithConvReluPoolX2Naive(object):
    """
    A convolutional network with the following architecture:
  
    [conv - relu - 2x2 max pool]*2 - affine - relu - affine - softmax
  
    The network operates on minibatches of data that have shape (N, C, H, W)
    consisting of N images, each with height H and width W and with C input
    channels.
    """

    def __init__(self, input_dim=(3, 32, 32), num_filters=32, filter_size=7,
                 hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,
                 dtype=np.float32):
        """
        Initialize a new network.
    
        Inputs:
        - input_dim: Tuple (C, H, W) giving size of input data
        - num_filters: Number of filters to use in the convolutional layer
        - filter_size: Size of filters to use in the convolutional layer
        - hidden_dim: Number of units to use in the fully-connected hidden layer
        - num_classes: Number of scores to produce from the final affine layer.
        - weight_scale: Scalar giving standard deviation for random initialization
          of weights.
        - reg: Scalar giving L2 regularization strength
        - dtype: numpy datatype to use for computation.
        """
        self.params = {}
        self.reg = reg
        self.dtype = dtype
        ############################################################################
        # TODO: Initialize weights and biases for the three-layer convolutional    #
        # network. Weights should be initialized from a Gaussian with standard     #
        # deviation equal to weight_scale; biases should be initialized to zero.   #
        # All weights and biases should be stored in the dictionary self.params.   #
        # Store weights and biases for the convolutional layer using the keys 'W1' #
        # and 'b1'; use keys 'W2' and 'b2' for the weights and biases of the       #
        # hidden affine layer, and keys 'W3' and 'b3' for the weights and biases   #
        # of the output affine layer.                                              #
        ############################################################################
        C, H, W = input_dim

        # first conv layer init
        F_CONV1  = num_filters
        HH_CONV1 = filter_size
        WW_CONV1 = filter_size
        W_CONV1 = np.random.randn(F_CONV1, C, HH_CONV1, WW_CONV1) * weight_scale
        b_CONV1 = np.zeros(F_CONV1)

        # second conv layer init
        F_CONV2  = num_filters
        HH_CONV2 = filter_size
        WW_CONV2 = filter_size
        W_CONV2 = np.random.randn(F_CONV2, F_CONV1, HH_CONV2, WW_CONV2) * weight_scale
        b_CONV2 = np.zeros(F_CONV2)

        # first affine layer
        H_AFF1 = hidden_dim
        W_AFF1 = np.random.randn(F_CONV2 * (H / 4) * (W / 4), H_AFF1) * weight_scale
        b_AFF1 = np.zeros(H_AFF1)
        # last affine layer
        H_LAST_AFF = num_classes
        W_LAST_AFF = np.random.randn(H_AFF1, H_LAST_AFF) * weight_scale
        b_LAST_AFF= np.zeros(H_LAST_AFF)

        # store them in self.params for later use
        self.params['W_CONV1'] = W_CONV1
        self.params['b_CONV1'] = b_CONV1
        self.params['W_CONV2'] = W_CONV2
        self.params['b_CONV2'] = b_CONV2
        self.params['W_AFF1'] = W_AFF1
        self.params['b_AFF1'] = b_AFF1
        self.params['W_LAST_AFF'] = W_LAST_AFF
        self.params['b_LAST_AFF'] = b_LAST_AFF
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        for k, v in self.params.iteritems():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        """
        Evaluate loss and gradient for the three-layer convolutional network.
    
        Input / output: Same API as TwoLayerNet in fc_net.py.
        """
        W_CONV1, b_CONV1 = self.params['W_CONV1'], self.params['b_CONV1']
        W_CONV2, b_CONV2 = self.params['W_CONV2'], self.params['b_CONV2']
        W_AFF1, b_AFF1 = self.params['W_AFF1'], self.params['b_AFF1']
        W_LAST_AFF, b_LAST_AFF = self.params['W_LAST_AFF'], self.params['b_LAST_AFF']

        # pass conv_param to the forward pass for the convolutional layer
        # 2 conv layer share same conv_params and pool_params
        # may vary them later
        filter_size = W_CONV1.shape[2]
        conv_param = {'stride': 1, 'pad': (filter_size - 1) / 2}

        # pass pool_param to the forward pass for the max-pooling layer
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the three-layer convolutional net,  #
        # computing the class scores for X and storing them in the scores          #
        # variable.                                                                #
        ############################################################################
        out_conv_relu_pool_1, cache_conv_relu_pool_1 = conv_relu_pool_forward(X, W_CONV1, b_CONV1, conv_param, pool_param)
        out_conv_relu_pool_2, cache_conv_relu_pool_2 = conv_relu_pool_forward(out_conv_relu_pool_1, W_CONV2, b_CONV2, conv_param, pool_param)

        out_affine_relu_1, cache_affine_relu_1 = affine_relu_forward(out_conv_relu_pool_2, W_AFF1, b_AFF1)
        out, cache_affine_last = affine_forward(out_affine_relu_1, W_LAST_AFF, b_LAST_AFF)
        scores = out
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        if y is None:
            return scores

        loss, grads = 0, {}
        ############################################################################
        # TODO: Implement the backward pass for the three-layer convolutional net, #
        # storing the loss and gradients in the loss and grads variables. Compute  #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        ############################################################################
        loss, dx = softmax_loss(scores, y)
        loss += 0.5 * self.reg * (np.sum(W_CONV1 * W_CONV1) + np.sum(W_CONV2 * W_CONV2) + np.sum(W_AFF1 * W_AFF1) + np.sum(W_LAST_AFF * W_LAST_AFF))

        # gradients, cool
        dout_last_aff, dw_last_aff, db_last_aff = affine_backward(dx, cache_affine_last)
        grads['W_LAST_AFF'] = dw_last_aff + self.reg * W_LAST_AFF
        grads['b_LAST_AFF'] = db_last_aff

        dout_first_affine_relu_aff1, dw_aff1, db_aff1 = affine_relu_backward(dout_last_aff, cache_affine_relu_1)
        grads['W_AFF1'] = dw_aff1 + self.reg * W_AFF1
        grads['b_AFF1'] = db_aff1

        dout_conv2, dw_conv2, db_conv2 = conv_relu_pool_backward(dout_first_affine_relu_aff1, cache_conv_relu_pool_2)
        grads['W_CONV2'] = dw_conv2 + self.reg * W_CONV2
        grads['b_CONV2'] = db_conv2

        dout_conv1, dw_conv1, db_conv1 = conv_relu_pool_backward(dout_conv2, cache_conv_relu_pool_1)
        grads['W_CONV1'] = dw_conv1 + self.reg * W_CONV1
        grads['b_CONV1'] = db_conv1
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
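
As a shape sanity check: the conv layers pad to preserve H and W, and each 2x2/stride-2 pool halves them, so two conv-relu-pool blocks map (3, 32, 32) to (num_filters, 8, 8); that is exactly the F_CONV2 * (H/4) * (W/4) fan-in used for W_AFF1 above. A quick sketch (assuming the class is importable from cs231n.classifiers.convnet, as the training snippet in the last part does):

import numpy as np
from cs231n.classifiers.convnet import ThreeLayerConvNetWithConvReluPoolX2Naive

model = ThreeLayerConvNetWithConvReluPoolX2Naive()   # default (3, 32, 32) input
X = np.random.randn(2, 3, 32, 32)
scores = model.loss(X)   # y=None, so this is a test-time forward pass
print(scores.shape)      # (2, 10)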


# - [conv-relu-pool]XN - [affine]XM - [softmax or SVM]
class ThreeLayerConvNetWithConvReluPoolX2NaiveSpatialBN(object):
    """
    A convolutional network with the following architecture:

    [conv - relu - 2x2 max pool]*2 - affine - relu - affine - softmax

    The network operates on minibatches of data that have shape (N, C, H, W)
    consisting of N images, each with height H and width W and with C input
    channels.
    """

    def __init__(self, input_dim=(3, 32, 32), num_filters=32, filter_size=7,
                 hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,
                 dtype=np.float32):
        """
        Initialize a new network.

        Inputs:
        - input_dim: Tuple (C, H, W) giving size of input data
        - num_filters: Number of filters to use in the convolutional layer
        - filter_size: Size of filters to use in the convolutional layer
        - hidden_dim: Number of units to use in the fully-connected hidden layer
        - num_classes: Number of scores to produce from the final affine layer.
        - weight_scale: Scalar giving standard deviation for random initialization
          of weights.
        - reg: Scalar giving L2 regularization strength
        - dtype: numpy datatype to use for computation.
        """
        self.params = {}
        self.reg = reg
        self.dtype = dtype
        ############################################################################
        # TODO: Initialize weights and biases for the three-layer convolutional    #
        # network. Weights should be initialized from a Gaussian with standard     #
        # deviation equal to weight_scale; biases should be initialized to zero.   #
        # All weights and biases should be stored in the dictionary self.params.   #
        # Store weights and biases for the convolutional layer using the keys 'W1' #
        # and 'b1'; use keys 'W2' and 'b2' for the weights and biases of the       #
        # hidden affine layer, and keys 'W3' and 'b3' for the weights and biases   #
        # of the output affine layer.                                              #
        ############################################################################
        # this variant is meant to use spatial batchnorm, so set it up
        # unconditionally: one bn_param dict per conv layer
        self.use_batchnorm = True
        self.bn_params = [{'mode': 'train'} for i in range(2)]



        C, H, W = input_dim

        # first conv layer init
        F_CONV1 = num_filters
        HH_CONV1 = filter_size
        WW_CONV1 = filter_size
        W_CONV1 = np.random.randn(F_CONV1, C, HH_CONV1, WW_CONV1) * weight_scale
        b_CONV1 = np.zeros(F_CONV1)

        # second conv layer init
        F_CONV2 = num_filters
        HH_CONV2 = filter_size
        WW_CONV2 = filter_size
        W_CONV2 = np.random.randn(F_CONV2, F_CONV1, HH_CONV2, WW_CONV2) * weight_scale
        b_CONV2 = np.zeros(F_CONV2)

        # first affine layer
        H_AFF1 = hidden_dim
        W_AFF1 = np.random.randn(F_CONV2 * (H / 4) * (W / 4), H_AFF1) * weight_scale
        b_AFF1 = np.zeros(H_AFF1)
        # last affine layer
        H_LAST_AFF = num_classes
        W_LAST_AFF = np.random.randn(H_AFF1, H_LAST_AFF) * weight_scale
        b_LAST_AFF = np.zeros(H_LAST_AFF)

        # store them in self.params for later use
        self.params['W_CONV1'] = W_CONV1
        self.params['b_CONV1'] = b_CONV1
        self.params['W_CONV2'] = W_CONV2
        self.params['b_CONV2'] = b_CONV2
        self.params['W_AFF1'] = W_AFF1
        self.params['b_AFF1'] = b_AFF1
        self.params['W_LAST_AFF'] = W_LAST_AFF
        self.params['b_LAST_AFF'] = b_LAST_AFF
        # per-channel scale/shift for the spatial batchnorm layers.
        # NOTE: the forward/backward pass below never uses them yet
        # (conv_relu_pool_forward has no batchnorm step), so this class
        # currently behaves like the plain variant above.
        self.params["Gamma1"] = np.ones((1, F_CONV1))
        self.params["Beta1"] = np.zeros((1, F_CONV1))
        self.params["Gamma2"] = np.ones((1, F_CONV2))
        self.params["Beta2"] = np.zeros((1, F_CONV2))
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        for k, v in self.params.iteritems():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        """
        Evaluate loss and gradient for the three-layer convolutional network.

        Input / output: Same API as TwoLayerNet in fc_net.py.
        """
        W_CONV1, b_CONV1 = self.params['W_CONV1'], self.params['b_CONV1']
        W_CONV2, b_CONV2 = self.params['W_CONV2'], self.params['b_CONV2']
        W_AFF1, b_AFF1 = self.params['W_AFF1'], self.params['b_AFF1']
        W_LAST_AFF, b_LAST_AFF = self.params['W_LAST_AFF'], self.params['b_LAST_AFF']
        Gamma1 = self.params["Gamma1"]
        Beta1 = self.params["Beta1"]
        Gamma2 = self.params["Gamma2"]
        Beta2 = self.params["Beta2"]
        # pass conv_param to the forward pass for the convolutional layer
        # 2 conv layer share same conv_params and pool_params
        # may vary them later
        filter_size = W_CONV1.shape[2]
        conv_param = {'stride': 1, 'pad': (filter_size - 1) / 2}

        # pass pool_param to the forward pass for the max-pooling layer
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the three-layer convolutional net,  #
        # computing the class scores for X and storing them in the scores          #
        # variable.                                                                #
        ############################################################################
        out_conv_relu_pool_1, cache_conv_relu_pool_1 = conv_relu_pool_forward(X, W_CONV1, b_CONV1, conv_param,
                                                                              pool_param)
        out_conv_relu_pool_2, cache_conv_relu_pool_2 = conv_relu_pool_forward(out_conv_relu_pool_1, W_CONV2, b_CONV2,
                                                                              conv_param, pool_param)

        out_affine_relu_1, cache_affine_relu_1 = affine_relu_forward(out_conv_relu_pool_2, W_AFF1, b_AFF1)
        out, cache_affine_last = affine_forward(out_affine_relu_1, W_LAST_AFF, b_LAST_AFF)
        scores = out
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        if y is None:
            return scores

        loss, grads = 0, {}
        ############################################################################
        # TODO: Implement the backward pass for the three-layer convolutional net, #
        # storing the loss and gradients in the loss and grads variables. Compute  #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        ############################################################################
        loss, dx = softmax_loss(scores, y)
        loss += 0.5 * self.reg * (
            np.sum(W_CONV1 * W_CONV1) + np.sum(W_CONV2 * W_CONV2) +
            np.sum(W_AFF1 * W_AFF1) + np.sum(W_LAST_AFF * W_LAST_AFF))

        # gradients, cool
        dout_last_aff, dw_last_aff, db_last_aff = affine_backward(dx, cache_affine_last)
        grads['W_LAST_AFF'] = dw_last_aff + self.reg * W_LAST_AFF
        grads['b_LAST_AFF'] = db_last_aff

        dout_first_affine_relu_aff1, dw_aff1, db_aff1 = affine_relu_backward(dout_last_aff, cache_affine_relu_1)
        grads['W_AFF1'] = dw_aff1 + self.reg * W_AFF1
        grads['b_AFF1'] = db_aff1

        dout_conv2, dw_conv2, db_conv2 = conv_relu_pool_backward(dout_first_affine_relu_aff1, cache_conv_relu_pool_2)
        grads['W_CONV2'] = dw_conv2 + self.reg * W_CONV2
        grads['b_CONV2'] = db_conv2

        dout_conv1, dw_conv1, db_conv1 = conv_relu_pool_backward(dout_conv2, cache_conv_relu_pool_1)
        grads['W_CONV1'] = dw_conv1 + self.reg * W_CONV1
        grads['b_CONV1'] = db_conv1
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
import numpy as np

from cs231n.layers import *
from cs231n.fast_layers import *
from cs231n.layer_utils import *

# - [conv-relu-pool]XN - [affine]XM - [softmax or SVM]
class ThreeLayerConvNetWithConvReluPoolXNNaive(object):
    """
    A convolutional network with the following architecture:

    [conv - relu - 2x2 max pool]*N - affine - relu - affine - softmax
  
    The network operates on minibatches of data that have shape (N, C, H, W)
    consisting of N images, each with height H and width W and with C input
    channels.
    """

    def __init__(self, input_dim=(3, 32, 32), num_filters_list=[32], filter_sizes=[7],
                 hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,
                 dtype=np.float32):
        """
        Initialize a new network.
    
        Inputs:
        - input_dim: Tuple (C, H, W) giving size of input data
        - num_filters_list: List giving the number of filters for each
          conv-relu-pool block; its length determines the number of blocks
        - filter_sizes: List giving the filter size of each conv-relu-pool block
        - hidden_dim: Number of units to use in the fully-connected hidden layer
        - num_classes: Number of scores to produce from the final affine layer.
        - weight_scale: Scalar giving standard deviation for random initialization
          of weights.
        - reg: Scalar giving L2 regularization strength
        - dtype: numpy datatype to use for computation.
        """
        self.params = {}
        self.reg = reg
        self.dtype = dtype
        ############################################################################
        # TODO: Initialize weights and biases for the three-layer convolutional    #
        # network. Weights should be initialized from a Gaussian with standard     #
        # deviation equal to weight_scale; biases should be initialized to zero.   #
        # All weights and biases should be stored in the dictionary self.params.   #
        # Store weights and biases for the convolutional layer using the keys 'W1' #
        # and 'b1'; use keys 'W2' and 'b2' for the weights and biases of the       #
        # hidden affine layer, and keys 'W3' and 'b3' for the weights and biases   #
        # of the output affine layer.                                              #
        ############################################################################
        C, H, W = input_dim
        self.conv_relu_pool_num = len(num_filters_list)
        self.filter_sizes = filter_sizes
        # conv layers: each block's input channel count equals the previous
        # block's filter count
        num_filters = None
        in_channels = C
        for i in range(0, self.conv_relu_pool_num):
            num_filters = num_filters_list[i]
            self.params["W_CONV" + str(i+1)] = np.random.randn(num_filters, in_channels, filter_sizes[i], filter_sizes[i]) * weight_scale
            self.params["b_CONV" + str(i+1)] = np.zeros(num_filters)
            in_channels = num_filters

        # first affine layer; every 2x2 pool halves H and W
        H_AFF1 = hidden_dim
        W_AFF1 = np.random.randn(num_filters * (H / (2 ** self.conv_relu_pool_num)) * (W / (2 ** self.conv_relu_pool_num)), H_AFF1) * weight_scale
        b_AFF1 = np.zeros(H_AFF1)
        # last affine layer
        H_LAST_AFF = num_classes
        W_LAST_AFF = np.random.randn(H_AFF1, H_LAST_AFF) * weight_scale
        b_LAST_AFF= np.zeros(H_LAST_AFF)

        # store non-conv params in self.params for later use
        self.params['W_AFF1'] = W_AFF1
        self.params['b_AFF1'] = b_AFF1
        self.params['W_LAST_AFF'] = W_LAST_AFF
        self.params['b_LAST_AFF'] = b_LAST_AFF
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        for k, v in self.params.iteritems():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        """
        Evaluate loss and gradient for the three-layer convolutional network.
    
        Input / output: Same API as TwoLayerNet in fc_net.py.
        """
        W_AFF1, b_AFF1 = self.params['W_AFF1'], self.params['b_AFF1']
        W_LAST_AFF, b_LAST_AFF = self.params['W_LAST_AFF'], self.params['b_LAST_AFF']

        # pass conv_param to the forward pass for the convolutional layer
        # 2 conv layer share same conv_params and pool_params
        # may vary them later
        conv_params = []
        for i in range(0, self.conv_relu_pool_num):
            conv_params.append({'stride': 1, 'pad': (self.filter_sizes[i] - 1) / 2})

        # pass pool_param to the forward pass for the max-pooling layer
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the three-layer convolutional net,  #
        # computing the class scores for X and storing them in the scores          #
        # variable.                                                                #
        ############################################################################
        caches_conv_relu_pool = []
        input = X
        for i in range(0, self.conv_relu_pool_num):
            out_conv_relu_pool, cache_conv_relu_pool = conv_relu_pool_forward(input, self.params["W_CONV" + str(i+1)], self.params["b_CONV" + str(i+1)], conv_params[i], pool_param)
            input = out_conv_relu_pool
            caches_conv_relu_pool.append(cache_conv_relu_pool)
        out_affine_relu_1, cache_affine_relu_1 = affine_relu_forward(out_conv_relu_pool, W_AFF1, b_AFF1)
        out, cache_affine_last = affine_forward(out_affine_relu_1, W_LAST_AFF, b_LAST_AFF)
        scores = out
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        if y is None:
            return scores

        loss, grads = 0, {}
        ############################################################################
        # TODO: Implement the backward pass for the three-layer convolutional net, #
        # storing the loss and gradients in the loss and grads variables. Compute  #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        ############################################################################
        loss, dx = softmax_loss(scores, y)
        regs_conv_relu_pool = 0.0
        for i in range(0, self.conv_relu_pool_num):
            regs_conv_relu_pool += 0.5 * self.reg * np.sum(self.params["W_CONV" + str(i+1)] * self.params["W_CONV" + str(i+1)])
        loss += regs_conv_relu_pool + 0.5 * self.reg * (np.sum(W_AFF1 * W_AFF1) + np.sum(W_LAST_AFF * W_LAST_AFF))

        # gradients, cool
        dout_last_aff, dw_last_aff, db_last_aff = affine_backward(dx, cache_affine_last)
        grads['W_LAST_AFF'] = dw_last_aff + self.reg * W_LAST_AFF
        grads['b_LAST_AFF'] = db_last_aff

        dout_first_affine_relu_aff1, dw_aff1, db_aff1 = affine_relu_backward(dout_last_aff, cache_affine_relu_1)
        grads['W_AFF1'] = dw_aff1 + self.reg * W_AFF1
        grads['b_AFF1'] = db_aff1

        dout = dout_first_affine_relu_aff1
        for i in range(self.conv_relu_pool_num-1, -1, -1):
            dout, dw, db = conv_relu_pool_backward(dout, caches_conv_relu_pool[i])
            grads['W_CONV' + str(i+1)] = dw + self.reg * self.params["W_CONV" + str(i+1)]
            grads['b_CONV' + str(i+1)] = db
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads


# - [conv-relu-pool]XN - [affine]XM - [softmax or SVM]
class ThreeLayerConvNetWithConvReluPoolX2Naive2(object):
    """
    A convolutional network with the following architecture:

    [conv - relu - 2x2 max pool]*2 - affine - relu - affine - softmax

    The network operates on minibatches of data that have shape (N, C, H, W)
    consisting of N images, each with height H and width W and with C input
    channels.
    """

    def __init__(self, input_dim=(3, 32, 32), num_filters=32, filter_size=7,
                 hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,
                 dtype=np.float32):
        """
        Initialize a new network.

        Inputs:
        - input_dim: Tuple (C, H, W) giving size of input data
        - num_filters: Number of filters to use in the convolutional layer
        - filter_size: Size of filters to use in the convolutional layer
        - hidden_dim: Number of units to use in the fully-connected hidden layer
        - num_classes: Number of scores to produce from the final affine layer.
        - weight_scale: Scalar giving standard deviation for random initialization
          of weights.
        - reg: Scalar giving L2 regularization strength
        - dtype: numpy datatype to use for computation.
        """
        self.params = {}
        self.reg = reg
        self.dtype = dtype
        ############################################################################
        # TODO: Initialize weights and biases for the three-layer convolutional    #
        # network. Weights should be initialized from a Gaussian with standard     #
        # deviation equal to weight_scale; biases should be initialized to zero.   #
        # All weights and biases should be stored in the dictionary self.params.   #
        # Store weights and biases for the convolutional layer using the keys 'W1' #
        # and 'b1'; use keys 'W2' and 'b2' for the weights and biases of the       #
        # hidden affine layer, and keys 'W3' and 'b3' for the weights and biases   #
        # of the output affine layer.                                              #
        ############################################################################
        # same unfinished batchnorm setup as the SpatialBN variant above
        self.use_batchnorm = True
        self.bn_params = [{'mode': 'train'} for i in range(2)]



        C, H, W = input_dim

        # first conv layer init
        F_CONV1 = num_filters
        HH_CONV1 = filter_size
        WW_CONV1 = filter_size
        W_CONV1 = np.random.randn(F_CONV1, C, HH_CONV1, WW_CONV1) * weight_scale
        b_CONV1 = np.zeros(F_CONV1)

        # second conv layer init
        F_CONV2 = num_filters * 2
        HH_CONV2 = filter_size
        WW_CONV2 = filter_size
        W_CONV2 = np.random.randn(F_CONV2, F_CONV1, HH_CONV2, WW_CONV2) * weight_scale
        b_CONV2 = np.zeros(F_CONV2)

        # first affine layer
        H_AFF1 = hidden_dim
        W_AFF1 = np.random.randn(F_CONV2 * (H / 4) * (W / 4), H_AFF1) * weight_scale
        b_AFF1 = np.zeros(H_AFF1)
        # last affine layer
        H_LAST_AFF = num_classes
        W_LAST_AFF = np.random.randn(H_AFF1, H_LAST_AFF) * weight_scale
        b_LAST_AFF = np.zeros(H_LAST_AFF)

        # store them in self.params for later use
        self.params['W_CONV1'] = W_CONV1
        self.params['b_CONV1'] = b_CONV1
        self.params['W_CONV2'] = W_CONV2
        self.params['b_CONV2'] = b_CONV2
        self.params['W_AFF1'] = W_AFF1
        self.params['b_AFF1'] = b_AFF1
        self.params['W_LAST_AFF'] = W_LAST_AFF
        self.params['b_LAST_AFF'] = b_LAST_AFF
        # as in the SpatialBN variant above, these batchnorm params are
        # declared but not yet wired into the forward/backward pass
        self.params["Gamma1"] = np.ones((1, F_CONV1))
        self.params["Beta1"] = np.zeros((1, F_CONV1))
        self.params["Gamma2"] = np.ones((1, F_CONV2))
        self.params["Beta2"] = np.zeros((1, F_CONV2))
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        for k, v in self.params.iteritems():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        """
        Evaluate loss and gradient for the three-layer convolutional network.

        Input / output: Same API as TwoLayerNet in fc_net.py.
        """
        W_CONV1, b_CONV1 = self.params['W_CONV1'], self.params['b_CONV1']
        W_CONV2, b_CONV2 = self.params['W_CONV2'], self.params['b_CONV2']
        W_AFF1, b_AFF1 = self.params['W_AFF1'], self.params['b_AFF1']
        W_LAST_AFF, b_LAST_AFF = self.params['W_LAST_AFF'], self.params['b_LAST_AFF']
        Gamma1 = self.params["Gamma1"]
        Beta1 = self.params["Beta1"]
        Gamma2 = self.params["Gamma2"]
        Beta2 = self.params["Beta2"]
        # pass conv_param to the forward pass for the convolutional layer
        # 2 conv layer share same conv_params and pool_params
        # may vary them later
        filter_size = W_CONV1.shape[2]
        conv_param = {'stride': 1, 'pad': (filter_size - 1) / 2}

        # pass pool_param to the forward pass for the max-pooling layer
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the three-layer convolutional net,  #
        # computing the class scores for X and storing them in the scores          #
        # variable.                                                                #
        ############################################################################
        out_conv_relu_pool_1, cache_conv_relu_pool_1 = conv_relu_pool_forward(X, W_CONV1, b_CONV1, conv_param,
                                                                              pool_param)
        out_conv_relu_pool_2, cache_conv_relu_pool_2 = conv_relu_pool_forward(out_conv_relu_pool_1, W_CONV2, b_CONV2,
                                                                              conv_param, pool_param)

        out_affine_relu_1, cache_affine_relu_1 = affine_relu_forward(out_conv_relu_pool_2, W_AFF1, b_AFF1)
        out, cache_affine_last = affine_forward(out_affine_relu_1, W_LAST_AFF, b_LAST_AFF)
        scores = out
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        if y is None:
            return scores

        loss, grads = 0, {}
        ############################################################################
        # TODO: Implement the backward pass for the three-layer convolutional net, #
        # storing the loss and gradients in the loss and grads variables. Compute  #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        ############################################################################
        loss, dx = softmax_loss(scores, y)
        loss += 0.5 * self.reg * (
            np.sum(W_CONV1 * W_CONV1) + np.sum(W_CONV2 * W_CONV2) +
            np.sum(W_AFF1 * W_AFF1) + np.sum(W_LAST_AFF * W_LAST_AFF))

        # gradients, cool
        dout_last_aff, dw_last_aff, db_last_aff = affine_backward(dx, cache_affine_last)
        grads['W_LAST_AFF'] = dw_last_aff + self.reg * W_LAST_AFF
        grads['b_LAST_AFF'] = db_last_aff

        dout_first_affine_relu_aff1, dw_aff1, db_aff1 = affine_relu_backward(dout_last_aff, cache_affine_relu_1)
        grads['W_AFF1'] = dw_aff1 + self.reg * W_AFF1
        grads['b_AFF1'] = db_aff1

        dout_conv2, dw_conv2, db_conv2 = conv_relu_pool_backward(dout_first_affine_relu_aff1, cache_conv_relu_pool_2)
        grads['W_CONV2'] = dw_conv2 + self.reg * W_CONV2
        grads['b_CONV2'] = db_conv2

        dout_conv1, dw_conv1, db_conv1 = conv_relu_pool_backward(dout_conv2, cache_conv_relu_pool_1)
        grads['W_CONV1'] = dw_conv1 + self.reg * W_CONV1
        grads['b_CONV1'] = db_conv1
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads

Last part
# Train a really good model on CIFAR-10
from cs231n.classifiers.convnet import *
from cs231n.classifiers.convnet_fancier import *
model = ThreeLayerConvNetWithConvReluPoolXNNaive(num_filters_list = [32,32], filter_sizes=[3,3], weight_scale=0.001, hidden_dim=500, reg=0.0005)
solver = Solver(model, data,
                num_epochs=20, batch_size=100,
                update_rule='adam',
                optim_config={
                    'learning_rate': 1e-3,
                },
                verbose=True, print_every=20)
solver.train()

plt.subplot(2, 1, 1)
plt.plot(solver.train_acc_history)
plt.title('train_acc_history')

plt.subplot(2, 1, 2)
plt.plot(solver.val_acc_history)
plt.title('val_acc_history')
plt.show()
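
train() ends by swapping the best validation-epoch parameters back into the model, so held-out accuracy can be read off directly afterwards. A sketch, assuming the data dict also carries X_test / y_test:

test_acc = solver.check_accuracy(data['X_test'], data['y_test'])
print('test accuracy: %f' % test_acc)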



what's new today

Each neuron acts as a linear classifier; each layer tries to transform its input so that the data becomes linearly separable for the next layer.
For 2-dimensional input whose classes no single line can separate (e.g. points inside vs. outside a triangle), we need THREE neurons in the first layer to make the data separable for the next layer; a sketch follows.
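
A concrete sketch of that claim with a hand-built (not trained) first layer: take "inside a triangle" as the positive class; one neuron per edge turns the 2-D input into three half-plane indicators, and in that 3-D hidden space a single linear unit separates the classes:

import numpy as np

# triangle with vertices (0,0), (1,0), (0,1): inside means
# x >= 0, y >= 0 and x + y <= 1 -- one neuron per inequality
W1 = np.array([[1.0, 0.0],
               [0.0, 1.0],
               [-1.0, -1.0]])
b1 = np.array([0.0, 0.0, 1.0])

def hidden(p):
    # hard-threshold "neurons" for clarity; a trained net would use relu/sigmoid
    return (W1.dot(p) + b1 >= 0).astype(float)

for p in (np.array([0.2, 0.2]), np.array([0.8, 0.8])):
    score = hidden(p).sum() - 2.5    # linear classifier in the hidden space
    print('%s -> %s' % (p, 'inside' if score > 0 else 'outside'))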



assignment1

SVM part:
linear_svm.py

import numpy as np
from random import shuffle

def svm_loss_naive(W, X, y, reg):
  """
  Structured SVM loss function, naive implementation (with loops).

  Inputs have dimension D, there are C classes, and we operate on minibatches
  of N examples.

  Inputs:
  - W: A numpy array of shape (D, C) containing weights.
  - X: A numpy array of shape (N, D) containing a minibatch of data.
  - y: A numpy array of shape (N,) containing training labels; y[i] = c means
    that X[i] has label c, where 0 <= c < C.
  - reg: (float) regularization strength

  Returns a tuple of:
  - loss as single float
  - gradient with respect to weights W; an array of same shape as W
  """
  dW = np.zeros(W.shape) # initialize the gradient as zero

  # compute the loss and the gradient
  num_classes = W.shape[1]
  num_train = X.shape[0]
  loss = 0.0
  for i in xrange(num_train):
    scores = X[i].dot(W)
    correct_class_score = scores[y[i]]
    for j in xrange(num_classes):
      if j == y[i]:
        continue
      margin = scores[j] - correct_class_score + 1 # note delta = 1
      if margin > 0:
        loss += margin
        dW[:,j] += X[i,:]
        dW[:,y[i]] -= X[i,:]
  # Right now the loss is a sum over all training examples, but we want it
  # to be an average instead so we divide by num_train.
  loss /= num_train

  # Add regularization to the loss.
  loss += 0.5 * reg * np.sum(W * W)
  dW /= num_train  
  dW += reg * W    
  #############################################################################
  # TODO:                                                                     #
  # Compute the gradient of the loss function and store it dW.                #
  # Rather that first computing the loss and then computing the derivative,   #
  # it may be simpler to compute the derivative at the same time that the     #
  # loss is being computed. As a result you may need to modify some of the    #
  # code above to compute the gradient.                                       #
  #############################################################################


  return loss, dW


def svm_loss_vectorized(W, X, y, reg):
  """
  Structured SVM loss function, vectorized implementation.

  Inputs and outputs are the same as svm_loss_naive.
  """
  loss = 0.0
  dW = np.zeros(W.shape) # initialize the gradient as zero

  #############################################################################
  # TODO:                                                                     #
  # Implement a vectorized version of the structured SVM loss, storing the    #
  # result in loss.                                                           #
  #############################################################################
  num_classes = W.shape[1]
  num_train = X.shape[0]
  scores = np.dot(X, W)                                       # (N, C)
  correct_class_scores = scores[np.arange(num_train), y]      # (N,)
  margins = scores - correct_class_scores.reshape((num_train, 1)) + 1
  # note: the j == y[i] entries have margin exactly 1 each; they are summed
  # here and removed again after averaging (the "- 1" below)
  loss = np.sum(margins[margins > 0])

  # gradient: every violated margin adds X[i] to column j and subtracts it
  # from column y[i]
  mask = (margins > 0).astype(np.float64)         # includes the j == y[i] entry
  count_vec = np.sum(mask, axis=1)                # violations per example (+1)
  correct_mask = np.zeros(mask.shape)
  correct_mask[np.arange(num_train), y] = 1
  dW = np.dot(X.T, mask - correct_mask * count_vec.reshape((num_train, 1)))
  dW /= num_train
  dW += reg * W
  # Right now the loss is a sum over all training examples, but we want it
  # to be an average instead so we divide by num_train.
  loss /= num_train

  # Add regularization to the loss; the trailing "- 1" removes the j == y[i]
  # margins counted above (exactly 1 per example, so 1 after averaging).
  loss += 0.5 * reg * np.sum(W * W) - 1
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################


  #############################################################################
  # TODO:                                                                     #
  # Implement a vectorized version of the gradient for the structured SVM     #
  # loss, storing the result in dW.                                           #
  #                                                                           #
  # Hint: Instead of computing the gradient from scratch, it may be easier    #
  # to reuse some of the intermediate values that you used to compute the     #
  # loss.                                                                     #
  #############################################################################
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################

  return loss, dW
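
A quick sanity check that the vectorized path matches the naive loops on random data (the comparison cell in svm.ipynb does the same, plus timing):

import numpy as np

D, C, N = 10, 4, 6
W = np.random.randn(D, C) * 0.01
X = np.random.randn(N, D)
y = np.random.randint(C, size=N)

loss_naive, grad_naive = svm_loss_naive(W, X, y, 0.5)
loss_vec, grad_vec = svm_loss_vectorized(W, X, y, 0.5)
print('loss difference: %e' % abs(loss_naive - loss_vec))
print('gradient difference: %e' % np.linalg.norm(grad_naive - grad_vec))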

svm.ipynb


# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). You should experiment with different ranges for the learning
# rates and regularization strengths; if you are careful you should be able to
# get a classification accuracy of about 0.4 on the validation set.
# coarse ranges suggested by the assignment:
# learning_rates = [1e-7, 5e-6]
# regularization_strengths = [5e4, 8e4]
# finer grids actually used:
learning_rates = np.arange(1e-7, 1e-6, 1e-7)
regularization_strengths = np.arange(3e4, 8e4, 1e4)
# results is dictionary mapping tuples of the form
# (learning_rate, regularization_strength) to tuples of the form
# (training_accuracy, validation_accuracy). The accuracy is simply the fraction
# of data points that are correctly classified.
results = {}
best_val = -1   # The highest validation accuracy that we have seen so far.
best_svm = None # The LinearSVM object that achieved the highest validation rate.

################################################################################
# TODO:                                                                        #
# Write code that chooses the best hyperparameters by tuning on the validation #
# set. For each combination of hyperparameters, train a linear SVM on the      #
# training set, compute its accuracy on the training and validation sets, and  #
# store these numbers in the results dictionary. In addition, store the best   #
# validation accuracy in best_val and the LinearSVM object that achieves this  #
# accuracy in best_svm.                                                        #
#                                                                              #
# Hint: You should use a small value for num_iters as you develop your         #
# validation code so that the SVMs don't take much time to train; once you are #
# confident that your validation code works, you should rerun the validation   #
# code with a larger value for num_iters.                                      #
################################################################################
total=len(learning_rates)*len(regularization_strengths)
i=0
for lr in learning_rates:
    for rs in regularization_strengths:
        i += 1
        print str(i) + "/" + str(total)
        svm = LinearSVM()
        loss_hist = svm.train(X_train, y_train, learning_rate=lr, reg=rs,
                      num_iters=1500, verbose=False)
        y_train_pred = svm.predict(X_train)
        accu_train = np.mean(y_train == y_train_pred)
        #print 'training accuracy: %f' % (accu_train)
        y_val_pred = svm.predict(X_val)
        accu_val = np.mean(y_val == y_val_pred) 
        #print 'validation accuracy: %f' % (accu_val)
        results[(lr, rs)] = (accu_train,accu_val)
        if accu_val > best_val:
            best_val = accu_val
            print "best_val",best_val,"lr:",lr,"rs:",rs
            best_svm = svm
################################################################################
#                              END OF YOUR CODE                                #
################################################################################
    
# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print 'lr %e reg %e train accuracy: %f val accuracy: %f' % (
                lr, reg, train_accuracy, val_accuracy)
    
print 'best validation accuracy achieved during cross-validation: %f' % best_val

KNN part
k_nearest_neighbor.py

import numpy as np

class KNearestNeighbor(object):
  """ a kNN classifier with L2 distance """

  def __init__(self):
    pass

  def train(self, X, y):
    """
    Train the classifier. For k-nearest neighbors this is just 
    memorizing the training data.

    Inputs:
    - X: A numpy array of shape (num_train, D) containing the training data
      consisting of num_train samples each of dimension D.
    - y: A numpy array of shape (N,) containing the training labels, where
         y[i] is the label for X[i].
    """
    self.X_train = X
    self.y_train = y
    
  def predict(self, X, k=1, num_loops=0):
    """
    Predict labels for test data using this classifier.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data consisting
         of num_test samples each of dimension D.
    - k: The number of nearest neighbors that vote for the predicted labels.
    - num_loops: Determines which implementation to use to compute distances
      between training points and testing points.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].  
    """
    if num_loops == 0:
      dists = self.compute_distances_no_loops(X)
    elif num_loops == 1:
      dists = self.compute_distances_one_loop(X)
    elif num_loops == 2:
      dists = self.compute_distances_two_loops(X)
    else:
      raise ValueError('Invalid value %d for num_loops' % num_loops)

    return self.predict_labels(dists, k=k)

  def compute_distances_two_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a nested loop over both the training data and the 
    test data.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in xrange(num_test):
      for j in xrange(num_train):
        #####################################################################
        # TODO:                                                             #
        # Compute the l2 distance between the ith test point and the jth    #
        # training point, and store the result in dists[i, j]. You should   #
        # not use a loop over dimension.                                    #
        #####################################################################
        dists[i, j] = np.sqrt(np.sum(np.square(X[i,:] - self.X_train[j,:])))
        #####################################################################
        #                       END OF YOUR CODE                            #
        #####################################################################
    return dists

  def compute_distances_one_loop(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a single loop over the test data.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in xrange(num_test):
      #######################################################################
      # TODO:                                                               #
      # Compute the l2 distance between the ith test point and all training #
      # points, and store the result in dists[i, :].                        #
      #######################################################################
      # squared differences broadcast against all training points,
      # summed over the feature dimension
      dists[i, :] = np.sqrt(np.sum(np.square(X[i, :] - self.X_train), axis=1))
      #######################################################################
      #                         END OF YOUR CODE                            #
      #######################################################################
    return dists

  def compute_distances_no_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using no explicit loops.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train)) 
    
    #########################################################################
    # TODO:                                                                 #
    # Compute the l2 distance between all test points and all training      #
    # points without using any explicit loops, and store the result in      #
    # dists.                                                                #
    #                                                                       #
    # You should implement this function using only basic array operations; #
    # in particular you should not use functions from scipy.                #
    #                                                                       #
    # HINT: Try to formulate the l2 distance using matrix multiplication    #
    #       and two broadcast sums.                                         #
    #########################################################################
    # ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2, computed with one matrix
    # multiplication and two broadcast sums
    cross = np.dot(X, self.X_train.T)                        # (num_test, num_train)
    test_sq = np.sum(X * X, axis=1).reshape((num_test, 1))   # (num_test, 1)
    train_sq = np.sum(self.X_train * self.X_train, axis=1)   # (num_train,)
    dists = np.sqrt(test_sq - 2 * cross + train_sq)
    #########################################################################
    #                         END OF YOUR CODE                              #
    #########################################################################
    return dists

  def predict_labels(self, dists, k=1):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.

    Inputs:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance betwen the ith test point and the jth training point.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].  
    """
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test)
    for i in xrange(num_test):
      # A list of length k storing the labels of the k nearest neighbors to
      # the ith test point.
      closest_y = []
      #########################################################################
      # TODO:                                                                 #
      # Use the distance matrix to find the k nearest neighbors of the ith    #
      # testing point, and use self.y_train to find the labels of these       #
      # neighbors. Store these labels in closest_y.                           #
      # Hint: Look up the function numpy.argsort.                             #
      #########################################################################
      # indexes of the k nearest training points, mapped to their labels
      closest_y = self.y_train[np.argsort(dists[i, :])[:k]]
      #########################################################################
      # TODO:                                                                 #
      # Now that you have found the labels of the k nearest neighbors, you    #
      # need to find the most common label in the list closest_y of labels.   #
      # Store this label in y_pred[i]. Break ties by choosing the smaller     #
      # label.                                                                #
      #########################################################################
      # np.bincount + np.argmax breaks ties by choosing the smaller label
      counts = np.bincount(closest_y)
      y_pred[i] = np.argmax(counts)
      #########################################################################
      #                           END OF YOUR CODE                            # 
      #########################################################################

    return y_pred

knn.ipynb
cross-validation part

num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
################################################################################
# TODO:                                                                        #
# Split up the training data into folds. After splitting, X_train_folds and    #
# y_train_folds should each be lists of length num_folds, where                #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].     #
# Hint: Look up the numpy array_split function.                                #
################################################################################
indexes = np.arange(X_train.shape[0])
folds_index = np.array_split(indexes, num_folds)
for index in folds_index:
    X_train_folds.append(X_train[index])
    y_train_folds.append(y_train[index])
################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}


################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of k. For each        #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,   #
# where in each case you use all but one of the folds as training data and the #
# last fold as a validation set. Store the accuracies for all fold and all     #
# values of k in the k_to_accuracies dictionary.                               #
################################################################################
for k in k_choices:
    accu = []
    for i in range(0, num_folds):
        t = []
        y = []
        for j in range(0, num_folds):
            if j != i:
                t.append(X_train_folds[j])
                y.append(y_train_folds[j])
        tt = np.concatenate(t)
        yy = np.concatenate(y)
        te = X_train_folds[i]
        classifier = KNearestNeighbor()
        classifier.train(tt, yy)
        dists = classifier.compute_distances_no_loops(te)
        y_test_pred = classifier.predict_labels(dists, k)
        num_correct = np.sum(y_test_pred == y_train_folds[i])
        # divide by the size of the held-out fold, not by num_test
        accuracy = float(num_correct) / len(y_train_folds[i])
        accu.append(accuracy)
    k_to_accuracies[k] = accu
################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)

linear_classifier.py

import numpy as np
from cs231n.classifiers.linear_svm import *
from cs231n.classifiers.softmax import *

class LinearClassifier(object):

  def __init__(self):
    self.W = None

  def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
            batch_size=200, verbose=False):
    """
    Train this linear classifier using stochastic gradient descent.

    Inputs:
    - X: A numpy array of shape (N, D) containing training data; there are N
      training samples each of dimension D.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c
      means that X[i] has label 0 <= c < C for C classes.
    - learning_rate: (float) learning rate for optimization.
    - reg: (float) regularization strength.
    - num_iters: (integer) number of steps to take when optimizing
    - batch_size: (integer) number of training examples to use at each step.
    - verbose: (boolean) If true, print progress during optimization.

    Outputs:
    A list containing the value of the loss function at each training iteration.
    """
    num_train, dim = X.shape
    num_classes = np.max(y) + 1 # assume y takes values 0...K-1 where K is number of classes
    if self.W is None:
      # lazily initialize W
      self.W = 0.001 * np.random.randn(dim, num_classes)

    # Run stochastic gradient descent to optimize W
    loss_history = []
    for it in xrange(num_iters):
      X_batch = None
      y_batch = None

      #########################################################################
      # TODO:                                                                 #
      # Sample batch_size elements from the training data and their           #
      # corresponding labels to use in this round of gradient descent.        #
      # Store the data in X_batch and their corresponding labels in           #
      # y_batch; after sampling X_batch should have shape (dim, batch_size)   #
      # and y_batch should have shape (batch_size,)                           #
      #                                                                       #
      # Hint: Use np.random.choice to generate indices. Sampling with         #
      # replacement is faster than sampling without replacement.              #
      #########################################################################
      indexes = np.random.choice(X.shape[0],batch_size,replace=True)
      X_batch = X[indexes]
      y_batch = y[indexes]
      #########################################################################
      #                       END OF YOUR CODE                                #
      #########################################################################

      # evaluate loss and gradient
      loss, grad = self.loss(X_batch, y_batch, reg)
      loss_history.append(loss)

      # perform parameter update
      #########################################################################
      # TODO:                                                                 #
      # Update the weights using the gradient and the learning rate.          #
      #########################################################################
      self.W -= learning_rate * grad
      #########################################################################
      #                       END OF YOUR CODE                                #
      #########################################################################

      if verbose and it % 100 == 0:
        print 'iteration %d / %d: loss %f' % (it, num_iters, loss)

    return loss_history

  def predict(self, X):
    """
    Use the trained weights of this linear classifier to predict labels for
    data points.

    Inputs:
    - X: D x N array of training data. Each column is a D-dimensional point.

    Returns:
    - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
      array of length N, and each element is an integer giving the predicted
      class.
    """
    y_pred = np.zeros(X.shape[1])
    ###########################################################################
    # TODO:                                                                   #
    # Implement this method. Store the predicted labels in y_pred.            #
    ###########################################################################
    y_proba = np.dot(X, self.W)
    y_pred = np.argmax(y_proba, axis=1)
    ###########################################################################
    #                           END OF YOUR CODE                              #
    ###########################################################################
    return y_pred
  
  def loss(self, X_batch, y_batch, reg):
    """
    Compute the loss function and its derivative. 
    Subclasses will override this.

    Inputs:
    - X_batch: A numpy array of shape (N, D) containing a minibatch of N
      data points; each point has dimension D.
    - y_batch: A numpy array of shape (N,) containing labels for the minibatch.
    - reg: (float) regularization strength.

    Returns: A tuple containing:
    - loss as a single float
    - gradient with respect to self.W; an array of the same shape as W
    """
    pass


class LinearSVM(LinearClassifier):
  """ A subclass that uses the Multiclass SVM loss function """

  def loss(self, X_batch, y_batch, reg):
    return svm_loss_vectorized(self.W, X_batch, y_batch, reg)


class Softmax(LinearClassifier):
  """ A subclass that uses the Softmax + Cross-entropy loss function """

  def loss(self, X_batch, y_batch, reg):
    return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)

Softmax part
softmax.py

import numpy as np
from random import shuffle

def softmax_loss_naive(W, X, y, reg):
  """
  Softmax loss function, naive implementation (with loops)

  Inputs have dimension D, there are C classes, and we operate on minibatches
  of N examples.

  Inputs:
  - W: A numpy array of shape (D, C) containing weights.
  - X: A numpy array of shape (N, D) containing a minibatch of data.
  - y: A numpy array of shape (N,) containing training labels; y[i] = c means
    that X[i] has label c, where 0 <= c < C.
  - reg: (float) regularization strength

  Returns a tuple of:
  - loss as single float
  - gradient with respect to weights W; an array of same shape as W
  """
  # Initialize the loss and gradient to zero.
  loss = 0.0
  dW = np.zeros_like(W)
  num_train = X.shape[0]
  num_class = W.shape[1]
  num_dim = X.shape[1]

  #############################################################################
  # TODO: Compute the softmax loss and its gradient using explicit loops.     #
  # Store the loss in loss and the gradient in dW. If you are not careful     #
  # here, it is easy to run into numeric instability. Don't forget the        #
  # regularization!                                                           #
  #############################################################################
  for i in range(0, num_train):
      fs = np.dot(X[i,:], W)
      # shift scores by their max for numeric stability
      max_fs = max(fs)
      fs -= max_fs
      f_correct = fs[y[i]]
      loss += -np.log(np.exp(f_correct) / np.sum(np.exp(fs)))

      # gradient for this example: softmax probabilities minus the one-hot label
      p1 = np.exp(fs)/np.sum(np.exp(fs))
      correct_y = np.zeros(num_class)
      correct_y[y[i]] = 1
      d = p1 - correct_y
      dW += X[i,:].reshape((num_dim,1)) * d
  loss /= num_train
  loss += 0.5 * reg * np.sum(W * W)
  dW /= num_train   
  dW += reg * W   
  #############################################################################
  #                          END OF YOUR CODE                                 #
  #############################################################################

  return loss, dW


def softmax_loss_vectorized(W, X, y, reg):
  """
  Softmax loss function, vectorized version.

  Inputs and outputs are the same as softmax_loss_naive.
  """
  # Initialize the loss and gradient to zero.
  loss = 0.0
  dW = np.zeros_like(W)
  num_train = X.shape[0]
  num_class = W.shape[1]
  num_dim = X.shape[1]

  #############################################################################
  # TODO: Compute the softmax loss and its gradient using no explicit loops.  #
  # Store the loss in loss and the gradient in dW. If you are not careful     #
  # here, it is easy to run into numeric instability. Don't forget the        #
  # regularization!                                                           #
  #############################################################################
  XW = np.dot(X, W)
  # subtract the per-row max for numeric stability
  max_per_row = np.max(XW, axis=1)
  XW -= max_per_row.reshape((num_train, 1))
  denominator = np.sum(np.exp(XW), axis=1)
  XW_y = np.choose(y, XW.T)  # score of the correct class for each example
  numerator = np.exp(XW_y)

  # gradient: dW = X^T (P - Y), where P holds softmax probabilities and Y is one-hot
  p_matrix = np.exp(XW) / np.sum(np.exp(XW), axis=1).reshape((num_train, 1))
  y_matrix = np.zeros((num_train, num_class))
  y_matrix[np.arange(num_train), y] = 1
  d = p_matrix - y_matrix
  dW = np.dot(X.T, d)
  loss = np.sum(-np.log(numerator / denominator))
  loss /= num_train
  loss += 0.5 * reg * np.sum(W * W)
  dW /= num_train
  dW += reg * W
  #############################################################################
  #                          END OF YOUR CODE                                 #
  #############################################################################

  return loss, dW

softmax.ipynb

# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). You should experiment with different ranges for the learning
# rates and regularization strengths; if you are careful you should be able to
# get a classification accuracy of over 0.35 on the validation set.
from cs231n.classifiers import Softmax
results = {}
best_val = -1
best_softmax = None
learning_rates = [1e-7, 5e-7]
regularization_strengths = [5e4, 1e8]
# override the provided ranges with a finer sweep
learning_rates = np.arange(1e-7, 1e-6, 1e-7)
regularization_strengths = np.arange(3e4, 8e4, 1e4)
################################################################################
# TODO:                                                                        #
# Use the validation set to set the learning rate and regularization strength. #
# This should be identical to the validation that you did for the SVM; save    #
# the best trained softmax classifer in best_softmax.                          #
################################################################################
total=len(learning_rates)*len(regularization_strengths)
i=0
for lr in learning_rates:
    for rs in regularization_strengths:
        i += 1
        print str(i) + "/" + str(total)
        softmax = Softmax()
        loss_hist = softmax.train(X_train, y_train, learning_rate=lr, reg=rs,
                      num_iters=1500, verbose=False)
        y_train_pred = softmax.predict(X_train)
        accu_train = np.mean(y_train == y_train_pred)
        #print 'training accuracy: %f' % (accu_train)
        y_val_pred = softmax.predict(X_val)
        accu_val = np.mean(y_val == y_val_pred) 
        #print 'validation accuracy: %f' % (accu_val)
        results[(lr, rs)] = (accu_train,accu_val)
        if accu_val > best_val:
            best_val = accu_val
            print "best_val",best_val,"lr:",lr,"rs:",rs
            best_softmax = softmax
################################################################################
#                              END OF YOUR CODE                                #
################################################################################
    
# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print 'lr %e reg %e train accuracy: %f val accuracy: %f' % (
                lr, reg, train_accuracy, val_accuracy)
    
print 'best validation accuracy achieved during cross-validation: %f' % best_val

2 layer network
neural_net.py

import numpy as np
import matplotlib.pyplot as plt


class TwoLayerNet(object):
  """
  A two-layer fully-connected neural network. The net has an input dimension of
  N, a hidden layer dimension of H, and performs classification over C classes.
  We train the network with a softmax loss function and L2 regularization on the
  weight matrices. The network uses a ReLU nonlinearity after the first fully
  connected layer.

  In other words, the network has the following architecture:

  input - fully connected layer - ReLU - fully connected layer - softmax

  The outputs of the second fully-connected layer are the scores for each class.
  """

  def __init__(self, input_size, hidden_size, output_size, std=1e-4):
    """
    Initialize the model. Weights are initialized to small random values and
    biases are initialized to zero. Weights and biases are stored in the
    variable self.params, which is a dictionary with the following keys:

    W1: First layer weights; has shape (D, H)
    b1: First layer biases; has shape (H,)
    W2: Second layer weights; has shape (H, C)
    b2: Second layer biases; has shape (C,)

    Inputs:
    - input_size: The dimension D of the input data.
    - hidden_size: The number of neurons H in the hidden layer.
    - output_size: The number of classes C.
    """
    self.params = {}
    self.params['W1'] = std * np.random.randn(input_size, hidden_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['W2'] = std * np.random.randn(hidden_size, output_size)
    self.params['b2'] = np.zeros(output_size)

  def relu(self, x):
    r = np.copy(x)
    r[r<0] = 0
    return r 

  def loss(self, X, y=None, reg=0.0):
    """
    Compute the loss and gradients for a two layer fully connected neural
    network.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i] is
      an integer in the range 0 <= y[i] < C. This parameter is optional; if it
      is not passed then we only return scores, and if it is passed then we
      instead return the loss and gradients.
    - reg: Regularization strength.

    Returns:
    If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
    the score for class c on input X[i].

    If y is not None, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of training
      samples.
    - grads: Dictionary mapping parameter names to gradients of those parameters
      with respect to the loss function; has the same keys as self.params.
    """
    # Unpack variables from the params dictionary
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape

    # Compute the forward pass
    scores = None
    #############################################################################
    # TODO: Perform the forward pass, computing the class scores for the input. #
    # Store the result in the scores variable, which should be an array of      #
    # shape (N, C).                                                             #
    #############################################################################
    XW1 = np.dot(X,W1) + b1
    output_h1 = self.relu(XW1)
    output = np.dot(output_h1, W2) + b2
    scores = output
    
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################
    
    # If the targets are not given then jump out, we're done
    if y is None:
      return scores

    # Compute the loss
    loss = None
    #############################################################################
    # TODO: Finish the forward pass, and compute the loss. This should include  #
    # both the data loss and L2 regularization for W1 and W2. Store the result  #
    # in the variable loss, which should be a scalar. Use the Softmax           #
    # classifier loss. So that your results match ours, multiply the            #
    # regularization loss by 0.5                                                #
    #############################################################################
    XW = scores
    # subtract the per-row max for numeric stability
    max_per_row = np.max(XW, axis=1)
    XW -= max_per_row.reshape((N, 1))
    denominator = np.sum(np.exp(XW), axis=1)
    XW_y = np.choose(y, XW.T)  # score of the correct class for each example
    numerator = np.exp(XW_y)
    loss = np.sum(-np.log(numerator / denominator))
    loss /= N
    loss += 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    # Backward pass: compute gradients
    grads = {}
    #############################################################################
    # TODO: Compute the backward pass, computing the derivatives of the weights #
    # and biases. Store the results in the grads dictionary. For example,       #
    # grads['W1'] should store the gradient on W1, and be a matrix of same size #
    #############################################################################
    C = W2.shape[1]
    # gradient of the loss w.r.t. the scores: softmax probabilities minus one-hot labels
    yy = np.zeros((N, C))
    yy[np.arange(N), y] = 1
    p = np.exp(XW) / np.sum(np.exp(XW), axis=1).reshape((N, 1))
    p_y = p - yy

    grads['b2'] = np.sum(p_y, axis=0) / N
    grads['W2'] = output_h1.T.dot(p_y) / N + reg * W2

    # backprop through the second layer and the ReLU:
    # the ReLU passes gradient only where its input was positive
    drelu = p_y.dot(W2.T)
    relu_mask = (XW1 > 0).astype(XW1.dtype)
    dreluprime = relu_mask * drelu

    grads['b1'] = np.sum(dreluprime, axis=0) / N
    grads['W1'] = X.T.dot(dreluprime) / N + reg * W1
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    return loss, grads

  def train(self, X, y, X_val, y_val,
            learning_rate=1e-3, learning_rate_decay=0.95,
            reg=1e-5, num_iters=100,
            batch_size=200, verbose=False):
    """
    Train this neural network using stochastic gradient descent.

    Inputs:
    - X: A numpy array of shape (N, D) giving training data.
    - y: A numpy array f shape (N,) giving training labels; y[i] = c means that
      X[i] has label c, where 0 <= c < C.
    - X_val: A numpy array of shape (N_val, D) giving validation data.
    - y_val: A numpy array of shape (N_val,) giving validation labels.
    - learning_rate: Scalar giving learning rate for optimization.
    - learning_rate_decay: Scalar giving factor used to decay the learning rate
      after each epoch.
    - reg: Scalar giving regularization strength.
    - num_iters: Number of steps to take when optimizing.
    - batch_size: Number of training examples to use per step.
    - verbose: boolean; if true print progress during optimization.
    """
    num_train = X.shape[0]
    iterations_per_epoch = max(num_train / batch_size, 1)

    # Use SGD to optimize the parameters in self.model
    loss_history = []
    train_acc_history = []
    val_acc_history = []

    for it in xrange(num_iters):
      X_batch = None
      y_batch = None

      #########################################################################
      # TODO: Create a random minibatch of training data and labels, storing  #
      # them in X_batch and y_batch respectively.                             #
      #########################################################################
      indexes = np.random.choice(X.shape[0],batch_size,replace=True)
      X_batch = X[indexes]
      y_batch = y[indexes]
      #########################################################################
      #                             END OF YOUR CODE                          #
      #########################################################################

      # Compute loss and gradients using the current minibatch
      loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
      loss_history.append(loss)

      #########################################################################
      # TODO: Use the gradients in the grads dictionary to update the         #
      # parameters of the network (stored in the dictionary self.params)      #
      # using stochastic gradient descent. You'll need to use the gradients   #
      # stored in the grads dictionary defined above.                         #
      #########################################################################
      self.params['W1'] -= learning_rate * grads['W1']
      self.params['b1'] -= learning_rate * grads['b1']
      self.params['W2'] -= learning_rate * grads['W2']
      self.params['b2'] -= learning_rate * grads['b2']
      #########################################################################
      #                             END OF YOUR CODE                          #
      #########################################################################

      if verbose and it % 100 == 0:
        print 'iteration %d / %d: loss %f' % (it, num_iters, loss)

      # Every epoch, check train and val accuracy and decay learning rate.
      if it % iterations_per_epoch == 0:
        # Check accuracy
        train_acc = (self.predict(X_batch) == y_batch).mean()
        val_acc = (self.predict(X_val) == y_val).mean()
        train_acc_history.append(train_acc)
        val_acc_history.append(val_acc)

        # Decay learning rate
        learning_rate *= learning_rate_decay

    return {
      'loss_history': loss_history,
      'train_acc_history': train_acc_history,
      'val_acc_history': val_acc_history,
    }

  def predict(self, X):
    """
    Use the trained weights of this two-layer network to predict labels for
    data points. For each data point we predict scores for each of the C
    classes, and assign each data point to the class with the highest score.

    Inputs:
    - X: A numpy array of shape (N, D) giving N D-dimensional data points to
      classify.

    Returns:
    - y_pred: A numpy array of shape (N,) giving predicted labels for each of
      the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
      to have class c, where 0 <= c < C.
    """
    y_pred = None

    ###########################################################################
    # TODO: Implement this function; it should be VERY simple!                #
    ###########################################################################
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape
    XW1 = np.dot(X,W1) + b1
    output_h1 = self.relu(XW1)
    output = np.dot(output_h1, W2) + b2
    scores = output
    # softmax is monotonic, so taking argmax of the raw scores gives the
    # same prediction; the normalization is kept for clarity
    XW = scores
    max_per_row = np.max(XW, axis=1)
    XW -= max_per_row.reshape((N, 1))
    p = np.exp(XW) / np.sum(np.exp(XW), axis=1).reshape((N, 1))
    y_pred = np.argmax(p, axis=1)
    ###########################################################################
    #                              END OF YOUR CODE                           #
    ###########################################################################

    return y_pred

cross-validation part
two_layer_net.ipynb

best_net = None # store the best model into this 
best_val_acc = 0
#################################################################################
# TODO: Tune hyperparameters using the validation set. Store your best trained  #
# model in best_net.                                                            #
#                                                                               #
# To help debug your network, it may help to use visualizations similar to the  #
# ones we used above; these visualizations will have significant qualitative    #
# differences from the ones we saw above for the poorly tuned network.          #
#                                                                               #
# Tweaking hyperparameters by hand can be fun, but you might find it useful to  #
# write code to sweep through possible combinations of hyperparameters          #
# automatically like we did on the previous exercises.                          #
#################################################################################
input_size = 32 * 32 * 3
num_classes = 10
hidden_sizes = np.arange(50,500,50)
for hidden_size in hidden_sizes:
    net = TwoLayerNet(input_size, hidden_size, num_classes)

    # Train the network
    stats = net.train(X_train, y_train, X_val, y_val,
                num_iters=1000, batch_size=200,
                learning_rate=1e-3, learning_rate_decay=0.95,
                reg=0.5, verbose=True)

    # Predict on the validation set
    val_acc = (net.predict(X_val) == y_val).mean()
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_net = net
        print 'best validation accuracy: ', val_acc
# Use the validation set to tune the learning rate and regularization strength

from cs231n.classifiers.linear_classifier import LinearSVM

learning_rates = [1e-9, 1e-8, 1e-7]
regularization_strengths = [1e5, 1e6, 1e7]
# override the provided ranges with a finer sweep
learning_rates = np.arange(1e-7, 1e-6, 1e-7)
regularization_strengths = np.arange(3e4, 8e4, 1e4)
results = {}
best_val = -1
best_svm = None
i=0
total=len(learning_rates)*len(regularization_strengths)
print X_train_feats.shape
################################################################################
# TODO:                                                                        #
# Use the validation set to set the learning rate and regularization strength. #
# This should be identical to the validation that you did for the SVM; save    #
# the best trained classifer in best_svm. You might also want to play          #
# with different numbers of bins in the color histogram. If you are careful    #
# you should be able to get accuracy of near 0.44 on the validation set.       #
################################################################################
for lr in learning_rates:
    for rs in regularization_strengths:
        i += 1
        print str(i) + "/" + str(total)
        svm = LinearSVM()
        loss_hist = svm.train(X_train_feats, y_train, learning_rate=lr, reg=rs,
                      num_iters=1500, verbose=False)
        y_train_pred = svm.predict(X_train_feats)
        accu_train = np.mean(y_train == y_train_pred)
        #print 'training accuracy: %f' % (accu_train)
        y_val_pred = svm.predict(X_val_feats)
        accu_val = np.mean(y_val == y_val_pred) 
        #print 'validation accuracy: %f' % (accu_val)
        results[(lr, rs)] = (accu_train,accu_val)
        if accu_val > best_val:
            best_val = accu_val
            print "best_val",best_val,"lr:",lr,"rs:",rs
            best_svm = svm
################################################################################
#                              END OF YOUR CODE                                #
################################################################################

# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print 'lr %e reg %e train accuracy: %f val accuracy: %f' % (
                lr, reg, train_accuracy, val_accuracy)
    
print 'best validation accuracy achieved during cross-validation: %f' % best_val

from cs231n.classifiers.neural_net import TwoLayerNet
num_training=49000
num_validation=1000
num_test=1000
X_train = X_train.reshape(num_training, -1)
X_val = X_val.reshape(num_validation, -1)
X_test = X_test.reshape(num_test, -1)
input_dim = X_train_feats.shape[1]
hidden_dim = 500
num_classes = 10
best_net = None

################################################################################
# TODO: Train a two-layer neural network on image features. You may want to   #
# cross-validate various parameters as in previous sections. Store your best  #
# model in the best_net variable.                                              #
################################################################################
input_size = 32 * 32 * 3
num_classes = 10

max_count = 100
best_val_acc = 0
for i in xrange(max_count):
    # randomly sample hyperparameters on a log scale
    #hidden_size = np.random.uniform(50,500)
    reg = 10 ** np.random.uniform(-4, 0)
    lr = 10 ** np.random.uniform(-1, 0)

    net = TwoLayerNet(input_dim, hidden_dim, num_classes)
    # Train the network with the sampled learning rate and regularization
    stats = net.train(X_train_feats, y_train, X_val_feats, y_val,
                      num_iters=1000, batch_size=200,
                      learning_rate=lr, learning_rate_decay=0.95,
                      reg=reg, verbose=False)

    # Predict on the validation set
    val_acc = (net.predict(X_val_feats) == y_val).mean()
    print "(", i, "/", max_count, ")", "reg:", round(reg, 5), "lr:", round(lr, 5), "val_acc:", val_acc
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_net = net
        print '[Cool] best validation accuracy: ', val_acc
################################################################################
#                              END OF YOUR CODE                                #
################################################################################






mini batches

Minibatches have the following advantages:
a) more efficient
The standard error of the mean estimated from n samples is \frac{\sigma}{\sqrt{n}}, so growing the sample size gives less-than-linear returns in accuracy (a numpy sketch follows this list).

b) data redundancy
In the extreme case where the dataset consists of n copies of a single example, a minibatch of size 1 works just as well as the full data does. More generally, minibatches reduce the inefficiency caused by redundant data.
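
To see the \frac{\sigma}{\sqrt{n}} scaling concretely, here is a minimal numpy sketch (my own illustration; the "population" stands in for per-example gradients with \sigma = 1):

import numpy as np

np.random.seed(0)
population = np.random.randn(1000000)  # per-example "gradients", sigma = 1

for n in [1, 100, 10000]:
    # estimate the mean from 200 independent minibatches of size n
    means = [np.mean(np.random.choice(population, n)) for _ in xrange(200)]
    # the empirical standard error should be close to sigma / sqrt(n)
    print n, np.std(means), 1.0 / np.sqrt(n)

Going from n = 100 to n = 10000 costs 100 times the computation but improves the estimate only by a factor of 10.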

Factors that drive the minibatch size:
a) Larger sizes give more accurate (lower-variance) gradient estimates, but with less-than-linear returns

b) Sizes should not fall below a certain threshold, under which multicore architectures are underutilized and see no speed difference

c) Sizes that are powers of 2 tend to map better onto GPU architectures

d) The batch must fit in memory

e) Small batches can have a regularizing effect, likely due to the noise they add, which can improve generalization

Different algorithms react to minibatch size differently:
a) If the update formula involves only the gradient g, a batch size around 100 is usually enough
b) If it involves second-order terms, e.g. H^{-1}g, a much larger batch size such as 10000 is required to control the variance that a small batch would introduce

Random selection:
Minibatches should be selected randomly, both within one minibatch and across minibatches. In practice, shuffling the data once and storing the shuffled indexes works well.

Reduction of generalization error:
When multiple epochs are used, only the first epoch follows the unbiased gradient of the generalization error. The additional epochs usually provide enough benefit, through decreased training error, to offset the harm they cause by widening the gap between training error and test error.

Q1: epoch, minibatch, batch?
epoch: your algorithm sees the full data once
minibatch: a subset of your full data
batch: your full data

Example:
full data size: n
a) minibatch size \frac{n}{2}
in each epoch, the parameters are updated twice
b) minibatch size n
in each epoch, the parameters are updated once

Since updates happen more frequently in a) than in b), a) will often reach the same accuracy sooner than b).
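
A minimal sketch of this bookkeeping (the names n, batch_size, and run_epochs are my own, purely for illustration):

import numpy as np

np.random.seed(0)
n = 1000
X = np.random.randn(n, 5)

def run_epochs(batch_size, num_epochs=1):
    updates = 0
    indexes = np.arange(n)
    for epoch in xrange(num_epochs):
        np.random.shuffle(indexes)              # reshuffle between epochs
        for start in xrange(0, n, batch_size):
            batch = X[indexes[start:start + batch_size]]
            # ... compute the gradient on batch and update parameters here ...
            updates += 1
    return updates

print run_epochs(n / 2), "updates per epoch with minibatch size n/2"  # 2
print run_epochs(n), "updates per epoch with minibatch size n"        # 1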



spark ml

Parameters: algorithms take parameters, and in spark ml an algorithm must implement the Params trait to have parameters.

Params:
CDS (core data structure): two maps of type ParamMap.
CO (core operations): set/get for the CDS above, plus a copy method

trait Params extends Identifiable with Serializable {

  /** Internal param map for user-supplied values. */
  private val paramMap: ParamMap = ParamMap.empty

  /** Internal param map for default values. */
  private val defaultParamMap: ParamMap = ParamMap.empty

  /**
   * Sets a parameter in the embedded param map.
   */
  protected final def set[T](param: Param[T], value: T): this.type

  /**
   * Optionally returns the user-supplied value of a param.
   */
  final def get[T](param: Param[T]): Option[T]

  /**
   * Sets a default value for a param.
   */
  protected final def setDefault[T](param: Param[T], value: T): this.type
  
  /**
   * Gets the default value of a parameter.
   */
  final def getDefault[T](param: Param[T]): Option[T]
  
  /**
   * Creates a copy of this instance with the same UID and some extra params.
   */
  def copy(extra: ParamMap): Params
}

ParamMap:
CDS: a mutable map with keys of type Param[Any] and values of type Any.
CO: put/get for the CDS above

final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) {
  def put[T](param: Param[T], value: T): this.type = put(param -> value)
  def get[T](param: Param[T]): Option[T]
}

Param[T]: very much like a case class, with properties

class Param[T](val parent: String, val name: String, val doc: String, val isValid: T => Boolean)

A concrete example: a user-defined class + HasInputCol

HasInputCol: a meaningful parameter trait only needs to define its own Param[T],
CDS: inputCol: Param[String]
CO: getInputCol(), which is not strictly necessary, just a convenience

/**
 * Trait for shared param inputCol.
 */
private[ml] trait HasInputCol extends Params {

  /**
   * Param for input column name.
   * @group param
   */
  final val inputCol: Param[String] = new Param[String](this, "inputCol", "input column name")

  /** @group getParam */
  final def getInputCol: String = $(inputCol)
}

AlgoWithParameter: if your algo needs an input column, just extend HasInputCol. Defining setInputCol(value: String) is not mandatory; you can use set(key: Param[String], value: String) defined in the superclass Params instead.

class AlgoWithParameter extends HasInputCol {
  def setInputCol(value: String): this.type = set(inputCol, value)
}

It's cool, but where and when are those parameters validated?

case class ParamPair[T](param: Param[T], value: T) {
  // This is *the* place Param.validate is called.  Whenever a parameter is specified, we should
  // always construct a ParamPair so that validate is called.
  param.validate(value)
}

So only a ParamPair is validated? But we haven't constructed one explicitly yet. Let's look at the set method in Params:

/**
  * Sets a parameter in the embedded param map.
  */
protected final def set[T](param: Param[T], value: T): this.type = {
  set(param -> value)
}

And the inner set:

/**
  * Sets a parameter in the embedded param map.
  */
protected final def set(paramPair: ParamPair[_]): this.type = {
  shouldOwn(paramPair.param)
  paramMap.put(paramPair)
  this
}

OK, it turns out we do construct a ParamPair, just implicitly, via Param's -> method:

/** Creates a param pair with the given value (for Scala). */
def ->(value: T): ParamPair[T] = ParamPair(this, value)

Cool.

Metadata: every dataset comes with a description that gives a general idea of what the data looks like; in spark ml we use StructType to represent it.

StructType: a notion borrowed from spark sql, giving schema information about the data

case class StructType(fields: Array[StructField]) extends DataType with Seq[StructField] {

} 





Geometric interpretation of Ax and Eigen-decomposition of a diagonal matrix

Geometric interpretation of Ax:
Suppose A is a 2 \times 2 matrix, \lambda_1 and \lambda_2 are its two eigenvalues, and v_1 and v_2 are the corresponding eigenvectors.
Since v_1 and v_2 are independent, they form a basis of the 2-dimensional space, so any x can be written as x = a_1v_1 + a_2v_2.

Ax = A(a_1v_1 + a_2v_2)=a_1 \lambda_1 v_1 + a_2 \lambda_2 v_2=\lambda_1 a_1 v_1+\lambda_2 a_2 v_2

So we can conclude that Ax scales the components of x along the two directions v_1 and v_2 by \lambda_1 and \lambda_2 respectively.
Cool.
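
A small numpy sketch (my own check; the matrix is arbitrary but symmetric, so its eigenvectors are real) verifying this:

import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 3.0]])
lam, V = np.linalg.eig(A)     # columns of V are the eigenvectors v_1, v_2

x = np.array([1.0, -2.0])
a = np.linalg.solve(V, x)     # coordinates of x in the eigenbasis: x = a_1 v_1 + a_2 v_2

# Ax = lambda_1 a_1 v_1 + lambda_2 a_2 v_2
print np.allclose(np.dot(A, x), lam[0]*a[0]*V[:, 0] + lam[1]*a[1]*V[:, 1])  # True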

Eigen-decomposition of a diagonal matrix:

D=IDI^{-1}

so the eigenvectors are the standard basis vectors e_1, e_2, ... (the columns of I), and the eigenvalues are the diagonal entries of D.
Cool.
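
A one-liner sketch confirming it:

import numpy as np

lam, V = np.linalg.eig(np.diag([4.0, 5.0, 6.0]))
print lam   # the diagonal entries: [ 4.  5.  6.]
print V     # the identity matrix, whose columns are e_1, e_2, e_3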



Visualizing multivariable functions

This article is based on a khanacademy serie of articles.

A function is called a multivariable function if it takes multiple numbers as input and/or spits out multiple numbers as output.

A list of numbers can be seen as a point in a higher-dimensional space. For example, (1,2,3) can be viewed as a point in 3D space. So a multivariable function associates a point in one space with a point in another space. For example, f(x,y,z)=(xy,yz,zx) associates a point in 3D space with another point in the same space.

Another way to view a list of numbers is as a vector; the notation differs: the point (1,2) is written as \begin{pmatrix}1 \\ 2\end{pmatrix} when viewed as a vector. In the context of multivariable functions, we often treat the input as a point and the output as a vector. If the output is a vector, the function is called a vector-valued function; if it outputs a single number, we call it a scalar-valued function (engineering) or a real-valued function (math).

There are a couple of ways to visualize the multivariable function.

Graph: It has the benefit of showing the input space and the output space in one picture, but it is limited in dimensions: at most two for the input space and one for the output space. See the sketch below.
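
For instance, a minimal sketch (my own example, graphing f(x,y) = x^2 - y^2 as a surface):

from mpl_toolkits.mplot3d import Axes3D  # enables the 3d projection
import matplotlib.pyplot as plt
import numpy as np

x = np.linspace(-2, 2, 50)
y = np.linspace(-2, 2, 50)
X, Y = np.meshgrid(x, y)
Z = X**2 - Y**2   # two dimensions of input, one of output

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_surface(X, Y, Z)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('f(x, y)')
plt.show()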

Contour map: It only shows the input space and is suitable for 2-dimensional input and 1-dimensional output; a minimal sketch follows the list below.

a) the contour lines should be evenly spaced in z, for example z = ..., -1, 0, 1, 2, ...
b) each line should pass through every point in the xy plane that has the same z value
c) label the z value beside each line
d) the closeness of the lines indicates the steepness of the graph: the closer they are, the steeper the graph is
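
Here is a minimal matplotlib sketch (my own example, using f(x,y) = x^2 + y^2) of such a contour map with evenly spaced, labeled levels:

import matplotlib.pyplot as plt
import numpy as np

x = np.linspace(-2, 2, 100)
y = np.linspace(-2, 2, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + Y**2   # a scalar-valued function of 2-dimensional input

fig, ax = plt.subplots()
# evenly spaced z levels; the lines crowd together where the surface is steep
cs = ax.contour(X, Y, Z, np.arange(0, 9))
ax.clabel(cs, inline=True)   # label the z value beside each line
ax.axis("equal")
plt.show()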

Parametric curves/surfaces: suitable for 1-dimensional input and 2-dimensional output; this view loses most of the input information and focuses only on the output.

a) f(t)=\begin{pmatrix}cos(t) \\ sin(t)\end{pmatrix} draws a unit circle when t ranges from 0 to 2\pi
b) given a curve, we want to parameterize it (find a function to "draw" it)
c) to describe a curve, we need 1) f(t) and 2) the range of t

The following code illustrates it programmatically:

import matplotlib.pyplot as plt
import numpy as np


def preparation_axes(ax):
    """
        Center the left and bottom spines, eliminate upper and right spines, make x,y axes equally proportional
            Parameters
            ----------
            ax: a matplotlib.axes.Axes object

            Returns
            -------
            None
    """
    # center spines
    ax.spines['left'].set_position('center')
    ax.spines['bottom'].set_position('center')

    # make unnecessary spines disappear
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')

    # make x,y spines same size
    ax.axis("equal")


fig = plt.figure()

# data for plot1, a circle
t = np.linspace(0,2*np.pi,100)
x = np.cos(t)
y = np.sin(t)

ax1 = fig.add_subplot(2, 1, 1)
preparation_axes(ax1)
ax1.plot(x,y)

# data for plot2, a loopy curve
t = np.linspace(0,8*np.pi,200)
x = np.cos(t) - 3 + 1/(2*np.pi)*t
y = np.sin(t)

ax2 = fig.add_subplot(2, 1, 2)
preparation_axes(ax2)
ax2.plot(x,y)

# show two plots
plt.show()

The parametric curve is not restricted to 1-dimensional input; the input can be 2-dimensional, as long as its dimension is less than that of the output space.
The following code draws a "doughnut"; the corresponding input space is 2-dimensional and the output space is 3-dimensional.

import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.gca(projection='3d')

t = np.linspace(0,2*np.pi,100)
s = np.linspace(0,2*np.pi,100)

# grid input, note our input space is two dimensional
tt, ss = np.meshgrid(t,s)

# two radius for the doughnut
R=5
r=1

# output space is three dimensional
x = R*np.cos(tt) + r*np.cos(ss)*np.cos(tt)
y = R*np.sin(tt) + r*np.cos(ss)*np.sin(tt)
z = r*np.sin(ss)

ax.plot_surface(x, y, z, label='a doughnut')

# make axis proportional
ax.set_xlabel('X')
ax.set_xlim(-R, R)
ax.set_ylabel('Y')
ax.set_ylim(-R, R)
ax.set_zlabel('Z')
ax.set_zlim(-R, R)

plt.show()

Vector field: It can represent an input space and an output space of the same dimensionality in one graph.

a) think of particles with a 2D velocity in 2D space, or with a 3D velocity in 3D space
b) the lengths of the vectors are not absolute but relative; alternatively, all points can have vectors of the same length but different colors

Here is the code:

import matplotlib.pyplot as plt
import numpy as np

X, Y = np.meshgrid(np.arange(0, 2 * np.pi, .2), np.arange(0, 2 * np.pi, .2))
U = np.cos(X)
V = np.sin(Y)

plt.figure()
plt.title('Arrows scale with plot width, not view')
Q = plt.quiver(X, Y, U, V, units='width')
plt.show()

Transformation: Visualizes how the input space moves to the output space; a tool to see what a function does, though not an ideal one for seeing a function's properties.

The following code illustrates how the input space, just a horizontal line (1D), is transformed into a curve in the xy plane (2D); the corresponding parametric function is:

f(t)=\begin{pmatrix}cos(t) \\ \frac{t}{2}sin(t)\end{pmatrix}

Note how the red, blue, and green points change position when the transformation occurs.

import matplotlib.pyplot as plt
import numpy as np

def preparation_axes(ax):
    """
        Center the left and bottom spines, eliminate upper and right spines, make x,y axes equally proportional
            Parameters
            ----------
            ax: a matplotlib.axes.Axes object

            Returns
            -------
            None
    """
    # center spines
    ax.spines['left'].set_position('center')
    ax.spines['bottom'].set_position('center')

    # make unnecessary spines disappear
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')

    # make x,y spines same size
    ax.axis("equal")

fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)

x = np.linspace(0,12,100)
y = np.linspace(0,0,100)

preparation_axes(ax1)
ax1.plot(x,y,color='y')

x1 = 0
y1 = 0
ax1.scatter(x1,y1,color='r')
x2 = np.pi/2
y2 = 0
ax1.scatter(x2,y2,color='b')
x3 = np.pi
y3 = 0
ax1.scatter(x3,y3,color='g')

ax2 = fig.add_subplot(2,1,2)
t = np.linspace(0,12,100)
x = np.cos(t)
y = t/2*np.sin(t)
preparation_axes(ax2)
ax2.plot(x,y,color="y")
t1 = 0
x1 = np.cos(t1)
y1 = t1/2*np.sin(t1)
ax2.scatter(x1,y1,color='r')
t2 = np.pi/2
x2 = np.cos(t2)
y2 = t2/2*np.sin(t2)
ax2.scatter(x2,y2,color='b')
t3 = np.pi
x3 = np.cos(t3)
y3 = t3/2*np.sin(t3)
ax2.scatter(x3,y3,color='g')
plt.show()





Tangent plane and local linearization

Given f(x,y) and a point (x_0,y_0), there is a unique plane that just "kisses" the graph at (x_0,y_0,f(x_0,y_0)); in other words, it is tangent to the surface there.

This plane is

L(x,y)=f_x(x_0,y_0)(x-x_0) + f_y(x_0,y_0)(y-y_0) + f(x_0,y_0)

, where f_x and f_y are the partial derivatives with respect to x and y.

It is also called the local linearization. It's local because it is built around the point (x_0,y_0). It's a linearization because, if we write it with vectors,

X_0=(x_0,y_0)
X=(x,y)
\triangledown{f(X_0)} = \begin{pmatrix}f_x(X_0) \\ f_y(X_0)\end{pmatrix}

then

L(X)=\triangledown f(X_0) \cdot (X-X_0) + f(X_0)

which is linear in X.

This linear function L(X) can be much simpler than f(X), and is a good approximation of it in a neighborhood of X_0.
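
A quick numerical sanity check (f(x,y) = x^2 y is just an example function, with its partial derivatives written out by hand):

def f(x, y):
    return x**2 * y

def f_x(x, y):      # partial derivative with respect to x
    return 2 * x * y

def f_y(x, y):      # partial derivative with respect to y
    return x**2

x0, y0 = 1.0, 2.0

def L(x, y):
    return f_x(x0, y0) * (x - x0) + f_y(x0, y0) * (y - y0) + f(x0, y0)

print(f(1.01, 2.02), L(1.01, 2.02))   # nearly equal near (x0, y0)
print(f(3.0, 5.0), L(3.0, 5.0))       # far from (x0, y0), the approximation degrades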



deep learning code

http://math.stackexchange.com/questions/727030/find-a-change-in-variable-that-will-reduce-the-quadratic-form-to-a-sum-of-square

from my_neural_network.mylib.my_new_func import *

from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
import numpy as np
import time
np.set_printoptions(threshold=np.nan)
mnist = read_data_sets("data", one_hot=True, reshape=False, validation_size=0)

# This is for one image
# [ [[0.][0.]...[0.]], .... ,[[0.][0.]...[0.]] ]
# Note for [0.], we do put one number in an array; this won't look strange
# when the value of "depth" (the fourth dimension of the images, currently 1) is larger than 1
# print(mnist.train.images)
# print(mnist.train.labels)
M = 28 * 28
K = 10
N = 60000
TEST_SIZE = 10000
LEARNING_RATE=1
IT = 100000
BATCH_SIZE = 30
Ws = []
Bs = []
Zs = []
As = []
network_architecture = [784, 30, 10]
L = len(network_architecture)
XT = mnist.test.images.reshape(TEST_SIZE,M).transpose()
YT = np.transpose(mnist.test.labels)
Bs = [np.random.randn(y, 1) for y in network_architecture[1:]]
Ws = [np.random.randn(y, x) for x, y in zip(network_architecture[:-1], network_architecture[1:])]
print("L:", L)


for it in range(0, IT):
    if it % 100 == 0:
        print("IT:",it, "************************************")

    batch_X, batch_Y = mnist.train.next_batch(BATCH_SIZE)
    batch_X = batch_X.reshape((BATCH_SIZE,784)).transpose()
    batch_Y = np.transpose(batch_Y)
    #show_image(batch_X)

    # forward
    current_x = batch_X
    Zs = [batch_X]
    As = [batch_X]
    for l in range(0, L-1):
        z = linear(current_x, Ws[l], Bs[l])
        Zs.append(z)
        if l != L-2:
            a = sigmoid(z)
        else:
            a = softmax(z)
        As.append(a)
        current_x = a

    # backprop
    g = derivative_of_log_likelihood_cost_function_to_yhat(batch_Y, As[L-1])
    start_t = time.time()
    for l in range(L-1, 0, -1):
        if l != L-1:
            f_prime = derivative_of_sigmoid_function_to_activation2(As[l], BATCH_SIZE, network_architecture[l])
        else:
            f_prime = derivative_of_softmax_function_to_activation2(As[l], BATCH_SIZE, network_architecture[l])

        g = np.einsum('ndk,kn->dn', f_prime, g) #http://stackoverflow.com/questions/35786249/tensor-multiplication-with-numpy-tensordot
        delta_b = (np.sum(g, axis=1) / BATCH_SIZE).reshape((network_architecture[l],1))
        delta_w = np.matmul(g, np.transpose(As[l-1])) / BATCH_SIZE
        Ws[l-1] -= LEARNING_RATE * delta_w
        Bs[l-1] -= LEARNING_RATE * delta_b
        # propagate the gradients to the next lower layer
        g = np.matmul(np.transpose(Ws[l-1]),g)

    if it % 100 == 0:
        current_x  = XT
        for l in range(0, L-1):
            z = linear(current_x, Ws[l], Bs[l])
            if l != L-2:
                a = sigmoid(z)
            else:
                # use softmax for the output layer, matching the training forward pass
                a = softmax(z)
            current_x = a

        actual_max_index = np.argmax(YT, axis=0)
        predicting_max_index = np.argmax(a, axis=0)
        count_equal = np.sum(actual_max_index == predicting_max_index)
        print("accuracy:",count_equal * 1.0/predicting_max_index.size)
        cost_overall = cross_entropy_overall(a, YT, TEST_SIZE)
        print("cost_overall:", cost_overall)
import numpy as np
import math


def derivative_of_log_likelihood_cost_function_to_yhat(ys, yhats):
    return ys / yhats * (-1)


def derivative_of_squared_error_cost_function_to_yhat(ys, yhats):
    return (ys - yhats) * (-1)


def derivative_of_softmax_function_to_activation(a):
    return a*(1-a)


def derivative_of_softmax_function_to_activation2(a, N, K):
    r = np.full((N,K,K), 0.01, dtype="float32")
    for k in range(0, N):
        s = a[:,k]
        for j in range(0, K):
            for i in range(0, K):
                if j == i:
                    r[k,i,j] = s[i] * (1-s[i])
                else:
                    r[k,i,j] = -s[i]*s[j]
    return r


def linear(X, W_, b_):
    return np.matmul(W_, X) + b_


def sigmoid(X):
    return 1.0 / (1 + np.exp(-X))  # vectorized; math.exp would fail on arrays


def softmax(X):
    max_by_col_X = np.max(X, axis=0)
    X = X - max_by_col_X
    sum_exp_X = np.sum(np.exp(X), axis=0)
    X  = np.exp(X) / sum_exp_X
    return X


def compute_Y_(X, W_, b_, K, N):
    #print("Y_ before:",np.matmul(W_, X))
    Y_ = np.matmul(W_, X) + b_
    max_by_col_Y_ = np.max(Y_, axis=0)
    Y_ = Y_ - max_by_col_Y_
    sum_exp_Y_ = np.sum(np.exp(Y_), axis=0)
    for k in range(0, K):
       for i in range(0, N):
           Y_[k, i] = math.exp(Y_[k, i])/sum_exp_Y_[i]
    return Y_

def compute_Y_2(X, W_, b_, K, N):
    #print("Y_ before:",np.matmul(W_, X))
    Y_ = np.matmul(W_, X) + b_
    return 1.0 / (1.0 + np.exp(-Y_))


def cross_entropy(ys_, ys):
    s = 0.0
    for i in range(0, ys_.size):
        if(ys[i] != 0):
            if(ys_[i] != 0):
                s += ys[i] * np.log10(ys_[i]) / np.log10(2.0)  # log2(ys_[i]) via change of base

    return s * (-1.0)


def cross_entropy_overall(Y_, Y, N):
    s = 0.0
    for i in range(0, N):
        s += cross_entropy(Y_[:,i], Y[:,i])
    return s/N


def show_image(pixels):
    import matplotlib.pyplot as plt
    # Reshape the array into 28 x 28 array (2-dimensional array)
    pixels = pixels[:,0].reshape((28,28))

    # Plot
    plt.title("number")
    plt.imshow(pixels, cmap='gray')
    plt.show()


def sigmoid_python(x):
    return 1/( 1+ math.exp(-x))
sigmoid = np.frompyfunc(sigmoid_python,1,1)

'''
tests derivative_of_log_likelihood_cost_function_to_yhat
'''
ys = np.array([1,2,3])
assert (derivative_of_log_likelihood_cost_function_to_yhat(ys, ys) == -1).all()

'''
test sigmoid_python
'''
xs = np.array([[0,0],[0,0]])
print(sigmoid(xs))

print(1.0 / (1.0 + np.exp(-xs)))  # vectorized sigmoid for comparison

Optimized derivative computation: for a softmax output layer with cross-entropy loss, the gradient with respect to z simplifies to a - y, and for sigmoid hidden layers the factor is a(1 - a), so the explicit Jacobians above are unnecessary.

# backprop
    #g = derivative_of_log_likelihood_cost_function_to_yhat(batch_Y, As[L-1])
    start_t = time.time()
    for l in range(L-1, 0, -1):
        if l != L-1:
            g = g * (As[l] * (1 - As[l]))
        else:
            g = As[l] - batch_Y

        delta_b = (np.sum(g, axis=1) / BATCH_SIZE).reshape((network_architecture[l],1))
        delta_w = np.matmul(g, np.transpose(As[l-1])) / BATCH_SIZE
        Ws[l-1] -= LEARNING_RATE * delta_w
        Bs[l-1] -= LEARNING_RATE * delta_b
        # propagate the gradients to the next lower layer
        g = np.matmul(np.transpose(Ws[l-1]),g)

    # forward
    current_x = batch_X
    Zs = [batch_X]
    As = [batch_X]
    for l in range(0, L-1):
        #print("l:",l)
        z = linear(current_x, Ws[l], Bs[l])
        Zs.append(z)
        if l != L-2:
            a = relu(z)
        else:
            a = softmax(z)
        As.append(a)
        current_x = a

    # backprop
    #g = derivative_of_log_likelihood_cost_function_to_yhat(batch_Y, As[L-1])
    start_t = time.time()
    for l in range(L-1, 0, -1):
        if l != L-1:
            g = g * derivative_of_relu_function_to_activationz(As[l])
        else:
            g = derivative_of_log_likelihood_cost_function_to_activation(batch_Y, As[l])
        print("l:",l)
        print("g before:",g)
        delta_b = (np.sum(g, axis=1) / BATCH_SIZE).reshape((network_architecture[l],1))
        print("delta_b:", delta_b)
        delta_w = np.matmul(g, np.transpose(As[l-1])) / BATCH_SIZE
        Ws[l-1] -= LEARNING_RATE * delta_w
        Bs[l-1] -= LEARNING_RATE * delta_b
        # propagate the gradients to the next lower layer
        g = np.matmul(np.transpose(Ws[l-1]),g)
        print("g after:", g)

    if it % 100 == 0:
        current_x  = XT
        for l in range(0, L-1):
            z = linear(current_x, Ws[l], Bs[l])
            if l != L-2:
                a = relu(z)
            else:
                a = softmax(z)
            current_x = a
def derivative_of_log_likelihood_cost_function_to_activation(y, a):
    return a - y

def derivative_of_sigmoid_function_to_activation(a):
    return a * (1-a)

def derivative_of_relu_function_to_activation_python(a):
    if a > 0:
        return 1
    else:
        return 0

derivative_of_relu_function_to_activation = np.frompyfunc(derivative_of_relu_function_to_activation_python,1,1)
def derivative_of_relu_function_to_activationz(a):
    return derivative_of_relu_function_to_activation(a).astype("float32")

def linear(X, W_, b_):
    return np.matmul(W_, X) + b_

def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def relu(x):
    #print("x:",x[:,0])
    return np.maximum(x, 0)

things learned:
1. Input image pixels should be correctly placed in the matrix; it is better to display the pixel array to see whether it represents the desired picture. More generally, before processing any data, try to verify it first. Defensive programming is useful here because the next phase, the processing phase, is a complicated one; wipe out any uncertainties and doubts before entering it.
2. I'd better learn more about the theoretical aspects of deep learning rather than start coding from the middle of somewhere. Poorly mastered knowledge, especially the deductions and derivatives, did impede the progress of coding, especially when I couldn't tell whether a problem was in the code logic or in a derivative formula.
3. I have several years of experience in programming and machine learning, but I'm new to deep learning, and I implemented this simple deep learning algorithm using an unfamiliar framework, numpy. I admit that if the thing I want to build and the tool I choose are both new, things get messy.
4. I also followed two resources (resource1, resource2) at the same time to learn it; they are both quite good, but two at a time is not a good idea.
5. The idea of a tensor is marvelous, but it really took me a while to wrap my head around how it works.

architecture: [784,10]
cost function: cross entropy
output activation function: softmax
activation function: none
accuracy: 92%~93%

architecture: [784, 200, 10]
cost function: cross entropy
output activation function: softmax
activation function: sigmoid
accuracy: 96%~97%

Is initialization of the parameters very important in a NN?
Yes, it is. Not only does it influence the speed of convergence, it also influences the accuracy on the test data, since the cost function here is often not convex and may have multiple local minima.

why 100 images at a time (mini-batch)?
1. suitable for GPU computing
2. reaching a consensus among 100 images on where to go makes the descent path more stable

what is an epoch?
an iteration in which ALL images have been examined; one epoch may contain several mini-batches

what is dropout?
only a fraction of the neurons work each time (in that iteration, the weights of the dropped neurons don't update)
evaluation is always performed with the FULL set of neurons
implementation: it sets a percentage of the output activations to 0 (check the update equations to see why this makes the derivatives of the corresponding weights and biases 0); the remaining activations are boosted so that the average of the original vector is unchanged, as in the sketch below
don't do it for the input and output layers
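
A minimal sketch of this implementation (inverted dropout; keep_prob and the sizes are made up for illustration):

import numpy as np

def dropout(a, keep_prob=0.8):
    # keep each activation with probability keep_prob
    mask = np.random.rand(*a.shape) < keep_prob
    # zero out the dropped activations and boost the survivors,
    # so the expected value of each activation is unchanged
    return a * mask / keep_prob

a = np.random.randn(30, 10)   # activations of one hidden layer
a_train = dropout(a)          # applied at training time only
# at evaluation time the FULL set of neurons is used: no dropout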

What does relu bring us?
a slight speed-up in convergence at the beginning and a slight increase in accuracy

what does learning rate decay bring us?
the accuracy on the test data is more stable

what does dropout bring us?
less overfitting -> the costs on the training data and the test data don't diverge from a certain point on

Does the learning rate relate to the activation function?
In my experience relu needs a smaller learning rate than sigmoid, otherwise it blows up the weights and biases.

Gradient exploding?

Deeper architectures are more likely to experience exploding or vanishing gradients; one way to cure this is regularization




Thoughts after reading 《没有色彩的多崎作和他的巡礼之年》 (Colorless Tsukuru Tazaki and His Years of Pilgrimage)

This book shares the same source as 《挪威的森林》 (Norwegian Wood): the author created both stories from the same wellspring. If Norwegian Wood is a novel of a private nature, this book can be read as the author's re-examination of the past after the passage of years.

Tsukuru and "I", Shiro and Naoko, Sara (Kuro) and Midori, Haida and Watanabe, Aka (Ao) and Nagasawa.

After talking with Ao, Aka, and Kuro, Tsukuru obtained a measure of release, and so did the three of them.

"Not everything was lost in the flow of time. Back then we firmly believed in something, and we had a self capable of believing firmly in something. Such conviction will never simply vanish into smoke." This is the author's understanding of what has passed away and what has not.




maths in deep learning

One layer full connected network

A few notations:

M, number of neurons in the layer, only one layer here
K, number of outputs
W, a K \times M weight matrix, w_{k,j} for the kth row and jth column of this matrix
N, number of samples
X, an M \times N input matrix, x_{j,i} for the jth row and ith column of this matrix
b, a K-dimensional bias vector, b_k for the kth element of this vector
B, a K \times N bias matrix, all columns identical, with value b
\hat{Y}, a K \times N prediction output matrix, \hat{y}_{k,i} for the kth row and ith column of this matrix
Y, a K \times N actual output matrix, y_{k,i} for the kth row and ith column of this matrix

The softmax function:

softmax(x_j)=\frac{\exp(x_j)}{\sum_{l}\exp(x_l)}

The cross-entropy loss function:

H_y(\hat{y})=-\sum_{i}y_{i}\log \hat{y_i}

Using notations above, our predictions are given by:

\hat{Y}=softmax(WX+B)

Our loss function is given by:

L(W,b)=-\sum_{i=1}^N\sum_{k=1}^K y_{k,i} \log \frac{e^{\sum_{j=1}^M w_{k,j}x_{j,i}+b_k}}{\sum_{l=1}^{K}e^{\sum_{j=1}^M w_{l,j}x_{j,i}+b_l}}
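
A small numpy sketch of this forward pass and loss (the sizes M=4, K=3, N=5 are made up; b broadcast over columns stands in for B):

import numpy as np

M, K, N = 4, 3, 5
rng = np.random.RandomState(0)
W = rng.randn(K, M)
X = rng.randn(M, N)
b = rng.randn(K, 1)

Z = np.dot(W, X) + b                 # K x N
Z = Z - Z.max(axis=0)                # numerical stabilization
Y_hat = np.exp(Z) / np.exp(Z).sum(axis=0)

Y = np.eye(K)[rng.randint(0, K, N)].T   # one-hot targets, K x N

loss = -np.sum(Y * np.log(Y_hat))
print(loss)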





2017.1.9-2017.1.15

1.
Adaboost theory
《The Elements of Statistical Learning》, Ch10.1-10.4
@TODO
a) A summary on it(done)
b) adaboost for regression
c) adaboost for multiclass

2.
No free lunch theorem: there is no a priori distinction between any two ML algorithms; how well one performs is determined by how well its hypothesis fits reality, namely how well P(h|d) fits P(f|d), where d is the training data, h the hypothesis, and f the true relationship between input and output.

http://www.no-free-lunch.org/

3.
The decision boundary belongs to the hypothesis, not to the training data, because different hypotheses give different decision boundaries on the same training data.




dealing with overflow and underflow in the softmax function

This article reproduces the content about dealing with overflow and underflow problems in the softmax function from the online book Deep Learning, Part I, Numerical Computation.

A softmax function is:

softmax(x_i) = \frac{e^{x_i}}{\sum_{j=1}^{n}e^{x_j}}

Suppose x is a constant vector with value c; no matter what the actual value of c is, each softmax output should be \frac{1}{n}. The problem is that when c is a very negative scalar, e^c \rightarrow 0, so we get a division-by-zero exception, and when c is a very large number, e^c \rightarrow \infty: a typical underflow and overflow problem. The solution is, instead of using x, to use z=x-\max(x). Let's see why it doesn't change the original value:

softmax(x_i-\max(x))=\frac{\frac{e^{x_i}}{e^{\max(x)}}}{\frac{e^{x_1}}{e^{\max(x)}}+...+\frac{e^{x_n}}{e^{\max(x)}}}=\frac{e^{x_i}}{e^{x_1}+...+e^{x_n}}=softmax(x_i)

And let's see why it solves the overflow and underflow problems: subtracting \max(x) from each element leaves at least one term of value 1 in the denominator, so the underflow problem is solved; at the same time no overflow can occur, because no exponent can become large. The following code verifies it.

Overflow and underflow problems lurk.

import math

def softmax(x, xs):
    s = 0.0
    for i in xs:
        s += math.exp(i)
    return math.exp(x) / s

x = +999999999999
xs = [x] * 3
y = -999999999999
ys = [y] * 3

print softmax(x, xs) # OverflowError: math range error
print softmax(y, ys) # ZeroDivisionError: float division by zero

Overflow and underflow problems disappear.

def softmax_refined(x, xs):
    m = max(xs)
    x = x - m
    s = 0.0
    for i in xs:
        s += math.exp(i-m)
    return math.exp(x)/s

print softmax_refined(x, xs) # 0.333333333
print softmax_refined(y, ys) # 0.333333333

If we further use \log softmax(x), there is still a problem when softmax(x) \rightarrow 0, since then \log \rightarrow -\infty. But with the help of the logarithm property \log{\frac{a}{b}}=\log{a}-\log{b}, this problem is easily solved. Here is the code:

def logsoftmax(x, xs):
    m = max(xs)
    x = x - m
    s = 0.0
    for i in xs:
        s += math.exp(i - m)
    return x - math.log(s, math.e)

x = -999999999999
xs = [x, 1 , 1000]
print "log softmax", logsoftmax(x, xs) # -1.000000001e+12, a very negative number yet valid

Cool.



cross validation with oversampling

I came across this blog post two weeks ago and thought I would reproduce it when I had time.

The main idea of the article is that you must put the oversampling process inside the cross-validation loop, NOT before it, even though doing it before gives a 'much better' (but misleading) result.

Wrong way:

 
import pandas as pd;
import numpy as np;
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc

data_path = "E:\\python_workspace\\kaggle\\not_kaggle\\ehg\\data\\tpehgdb_features.csv"
data = pd.read_csv(data_path, header=0,
                   names=["recording_ID", "channel", "gestation", "recording_time", "group", "rms", "fmed", "fpeak",
                          "sample_entropy", "preterm", "early_recording"], sep=",")
x = np.array(data.iloc[:, 5:9])
y = np.array(data.iloc[:, 9])
from imblearn.over_sampling import SMOTE

encoder = LabelEncoder()

sm = SMOTE(kind='regular')
X_resampled, y_resampled = sm.fit_sample(x, y)
y_encoded = encoder.fit_transform(y_resampled)

n_folds = 2
cv = StratifiedKFold(y_resampled, n_folds=n_folds, shuffle=True)

for train_index, test_index in cv:
    weight = np.ones(len(X_resampled[train_index]))
    rf = RandomForestClassifier(n_estimators=20, max_depth=None,
                                bootstrap=True, n_jobs=4,
                                min_samples_leaf=1, verbose=0,
                                class_weight="balanced")
    model = rf.fit(X_resampled[train_index], y_resampled[train_index], weight)

    # proba
    probas = model.predict_proba(X_resampled[test_index])
    # prediction
    y_pred = model.predict(X_resampled[test_index])

    # y_pred is 'string', needs to be transformed to 'int' before evaluation
    y_pred_encoded = encoder.fit_transform(y_pred)
    target_names = ["class_0", "class_1"]
    print(classification_report(y_encoded[test_index], y_pred_encoded, target_names=target_names))

    fpr, tpr, thresholds = roc_curve(y_encoded[test_index], probas[:, 1])
    roc_auc = auc(fpr, tpr)
    print "roc_auc", roc_auc

Note that

sm = SMOTE(kind='regular')
X_resampled, y_resampled = sm.fit_sample(x, y)

is put before the cross validation loop

for train_index, test_index in cv:
   ...

Despite this, it gives a pretty good result:

precision recall f1-score support auc
class_0 0.77 0.71 0.74 393 0.83
class_1 0.73 0.78 0.76 393

Right way:

import pandas as pd;
import numpy as np;
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE


data_path = "E:\\python_workspace\\kaggle\\not_kaggle\\ehg\\data\\tpehgdb_features.csv"
data = pd.read_csv(data_path, header=0, names = ["recording_ID","channel","gestation","recording_time","group","rms","fmed","fpeak","sample_entropy","preterm","early_recording"], sep=",")
x = np.array(data.iloc[:, 5:9])
y = np.array(data.iloc[:, 9])
# transform y
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

n_folds=2
cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True)

for train_index, test_index in cv:
    sm = SMOTE(kind='regular')
    X_resampled, y_resampled = sm.fit_sample(x[train_index], y[train_index])
    weight = np.ones(len(X_resampled))

    rf = RandomForestClassifier(n_estimators=20, max_depth=None,
                                bootstrap=True, n_jobs=4,
                                min_samples_leaf=1, verbose=0,
                                class_weight="balanced")
    model = rf.fit(X_resampled, y_resampled, weight)
    # proba, for auc
    probas = model.predict_proba(x[test_index])
    # prediction, for precision and recall
    y_pred = model.predict(x[test_index])
    # y_pred is 'string', needs to be transformed to 'int' before evaluation
    y_pred_encoded = encoder.fit_transform(y_pred)
    target_names = ["class_0", "class_1"]
    print(classification_report(y_encoded[test_index], y_pred_encoded, target_names=target_names))

    fpr, tpr, thresholds = roc_curve(y_encoded[test_index], probas[:, 1])
    roc_auc = auc(fpr, tpr)
    print "roc_auc",roc_auc

Note that now the oversampling is in the cross validation loop. And the result is indeed poor:

precision recall f1-score support auc
class_0 0.87 0.81 0.84 393 0.54
class_1 0.11 0.16 0.13 57

While reproducing the result, I found that what I thought about several things was wrong:
1) I thought the blogger meant that oversampling inside the cross-validation loop produced the better result
2) I thought the max depth of a random forest was bounded by the number of features

The truth is:
1) the blogger really meant that oversampling inside the cross-validation loop produces the poorer (but honest) result
2) the max depth of a random forest can be larger than the total number of features, as one feature can be reused at deeper levels

This implies that even attentive reading does not necessarily mean you know what someone is trying to illustrate, or whether what you already "know" is true. The only way to verify is to do it on your own: see what it gives, whether it meets your expectations or breaks them.



2017.1.2-2017.1.8

1.
@TOREAD

http://www.alfredo.motta.name/cross-validation-done-wrong/

2.
pip runs behind a proxy: setting the http_proxy and https_proxy environment variables correctly enables it

PyPI stands for Python Package Index, used by pip

3.
[[
1.
Tags:cross validation, imbalanced data
An excellent article about how to properly do cross-validation on imbalanced data.
how to properly do cross-validation on imbalanced data
@TODO reproduce the result
]]
while trying to reproduce the result I found that what I thought about the 2 following things was wrong:
a) doing the oversampling properly doesn't increase the evaluation result at all; it's doing it wrong that produces a fake high result
b) the max value of the parameter 'max_depth' in a random forest isn't bounded by the total number of features, as a feature can be reused when constructing the tree

4.
reproduce Deep Learning Book -> Numerical Computation -> refinement of softmax function -> refinement of log softmax

5.
classifier tests

http://www.jmlr.org/papers/volume15/delgado14a/delgado14a.pdf

@TODO
A summary on it
@TODO python classifier comparison plot

http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html





2016 Personal Summary

2016, on the whole, was a year of ups and downs, of endings and new beginnings.

Work
Not much progress at work; I was mostly coasting, which feels bad. I hope to accomplish something in the new year.

Knowledge
Done well:
During some free time at the end of the year I reviewed linear algebra and wrote several summaries in English, which felt rewarding. I hope to keep studying fundamental courses and keep writing in English in 2017.
On the programming side, becoming a scikit-learn contributor gave me a real sense of achievement; I will keep going down the open-source road.

Regrets:
It's a pity I didn't keep up drawing and game programming.

Books
Overall I bought far more books than I read; in 2017 I hope to buy fewer books but read a few good ones.
Finished:
《梵高手稿》
《在宇宙间不易被飞吹散》
and so on

Many books I only read a little of:
《日本AV影像史》
《三体》
《Alone on the wall》
《或者为了讲述》
and so on

Hobbies
The biggest gain was starting to build Gundam models. I love it; it feels as if time settles down inside a model.

Love
At the end of the year I found love; nothing more needs to be said.

And so 2016 has passed, which feels a little unreal when I think about it. In 2017 I hope to get a deeper taste of settling down and focusing on one thing, I hope the lovers end up together, and I hope to become calmer, more composed, more in love with life and with thinking.



2016.12.26-2017.1.1

1.
Tags:cross validation, imbalanced data
An excellent article about how to properly do cross-validation on imbalanced data.
how to properly do cross-validation on imbalanced data
@TODO: reproduce the result

2.
Tags:life, health, food
Didn't prepare clothes on Sunday -> Monday morning was too busy and I forgot the food cart -> bought creamy bread instead -> caused diarrhea

3.
Tags:life, health
Going to sleep around 11pm makes my stomach feel good the next day

4.
Tags:psychology
One way to define character is to fix a specific situation and see what behavior it invokes. From this point of view, it is reckless to simply label someone as extroverted, since he or she may behave quite differently in some situations, say in an introverted way. I think situation-based thinking is not bad; anything we may later use as a means should be observed, learned, and perceived under a specific situation.



A function about death and the understanding of life

y = 1 - e^{\frac{1}{x-c}}

In this formula, y represents one's understanding of life and x represents one's current life force; when x = c, the person dies.

This function describes my understanding of the relationship between death and the comprehension of life because:
1. the function is undefined at x = c
2. \lim_{x \to c^-}y = 1
3. \lim_{x \to -\infty}y = 0
4. the function is monotonically increasing on (-\infty,c)



Deep Learning Book

Deep Learning Book Intro

Development of AI:
1. knowledge base: hard-coded knowledge
2. machine learning: a machine that learns from data
  a) depends heavily on the representation of the data, called features (Figure 1.1)
  b) cannot influence the features reciprocally
3. representation learning: learn the representation itself; encode the input data into a new representation, preserving as much information as possible (Figure 1.1 shows two representations of the same data, one more easily separated by a line than the other)

Feature design:
1. find factors that can separate different sources of influence
2. difficult because
  a) the factors are not always directly visible
  b) a factor may even be an abstract concept existing only in the human mind that simplifies the real-world data
  c) many factors influence every single part of the data: a red car and a black car both look "black" at night (light as a factor), and a car's silhouette depends on the viewing angle (angle as a factor)

Then deep learning:
1. allows the computer to build complex concepts out of simpler concepts
2. each layer can be thought of as a state of the computer's memory after executing instructions in parallel, e.g. when the n-th layer is done, the (n+1)-th layer starts, and at the same time the n-th layer starts on the next iteration => more depth allows more instructions to be executed at the same time

Two ways of seeing depth:
1. depth of the computational graph
2. depth of the probabilistic modeling graph

Historical trends in deep learning:
1. many names in history
  a) dates back to the 1940s
  b) inspired by the brain, because 1) the brain is proof that intelligent behavior is possible and 2) it is interesting to do research on the brain
2. modern deep learning goes beyond the neural network view; a network is just a composition of functions
3. the earliest model was the linear model f(x, w)

Because the training data does not show the desired output for each of these layers, these layers are called hidden layers

The weights in each layer are waiting to be trained, so they are not given at first; hence they are called hidden (weight) layers

In most cases, our parametric model defines a distribution p(y | x;θ) and we simply use the principle of maximum likelihood. This means we use the cross-entropy between the training data and the model’s predictions as the cost function.

see answer here to know more

The largest difference between the linear models we have seen so far and neural networks is that the nonlinearity of a neural network causes most interesting loss functions to become non-convex. This means that neural networks are usually trained by using iterative, gradient-based optimizers that merely drive the cost function to a very low value, rather than the linear equation solvers used to train linear regression models or the convex optimization algorithms with global convergence guarantees used to train logistic regression or SVMs. Convex optimization converges starting from any initial parameters (in theory; in practice it is very robust but can encounter numerical problems). Stochastic gradient descent applied to non-convex loss functions has no such convergence guarantee, and is sensitive to the values of the initial parameters.

The loss function used in a neural network is not convex, so

a. it is not guaranteed to reach the global minimum
b. it is sensitive to the initial parameters

J(\theta)=-E_{x,y \sim {\hat{p}}_{data}}\ \log{p_{model}}\ (y|x)


The E_{x,y \sim {\hat{p}}_{data}} part's explicit form is the average over the training samples, \frac{1}{N}\sum_{i=1}^{N}. Some interpretations:

a. Although our hypothesis h_{\theta} doesn't appear explicitly in the above cost function, it doesn't and can't disappear; in fact it resides inside p_{model}
b. If our hypothesis fits the data well, p_{model}(y|x) \rightarrow 1 and \log{p_{model}(y|x)} \rightarrow 0

One recurring theme throughout neural network design is that the gradient of the cost function must be large and predictable enough to serve as a good guide for the learning algorithm. Functions that saturate (become very flat) undermine this objective because they make the gradient become very small. In many cases this happens because the activation functions used to produce the output of the hidden units or the output units saturate. The negative log-likelihood helps to avoid this problem for many models. Many output units involve an exp function that can saturate when its argument is very negative. The log function in the negative log-likelihood cost function undoes the exp of some output units. We will discuss the interaction between the cost function and the choice of output unit in section 6.2.2.

The log function can transform a very small number into a much "bigger" one, e.g. \log_{10}(0.000001)=-6; we don't want tiny outputs, as they make convergence very slow.

One unusual property of the cross-entropy cost used to perform maximum likelihood estimation is that it usually does not have a minimum value when applied to the models commonly used in practice. For discrete output variables, most models are parametrized in such a way that they cannot represent a probability of zero or one, but can come arbitrarily close to doing so. Logistic regression is an example of such a model. For real-valued output variables, if the model can control the density of the output distribution (for example, by learning the variance parameter of a Gaussian output distribution) then it becomes possible to assign extremely high density to the correct training set outputs, resulting in cross-entropy approaching negative infinity. Regularization techniques described in chapter 7 provide several different ways of modifying the learning problem so that the model cannot reap unlimited reward in this way.

h_{\theta}=\frac{1}{1+e^{-\theta^{T}x}}


It's easy to see that h_{\theta} can't be 0 nor 1, but can come arbitrarily close, so it can't represent a probability of exactly zero or one. As for the part where high density leads to very negative cross-entropy: at first I thought low density leads to it, but the cost is -\log p_{model}(y|x), and for real-valued outputs p_{model}(y|x) is a density, which can exceed 1 (e.g. a Gaussian whose variance shrinks toward 0), so extremely high density at the correct outputs drives -\log p toward -\infty.

From this point of view, we can view the cost function as being a functional rather than just a function. A functional is a mapping from functions to real numbers. We can thus think of learning as choosing a function rather than merely choosing a set of parameters.

f^{*} = \underset{f}{argmin}\ E_{x,y \sim p_{data}}(y - f(x))^2 \quad (1)


f^{*}(x)=E_{y \sim p_{data}(y|x)}[y] \quad (2)

If we finally manage to minimize the squared error, we have also found a good predictor of E[y|x]. Different statistics can be obtained via different cost functions, e.g. squared error -> expected value, absolute error -> median, etc. From this point of view, we say we learned a predictor of a statistic.

The choice of cost function is tightly coupled with the choice of output unit. Most of the time, we simply use the cross-entropy between the data distribution and the model distribution. The choice of how to represent the output then determines the form of the cross-entropy function.

The sigmoid function, the softmax function, and the Gaussian function are not essentially different; they are just forms of the same "cross entropy" loss function under different probability distribution assumptions.

Satisfying this constraint requires some careful design effort. Suppose we were to use a linear unit, and threshold its value to obtain a valid probability:

P(y=1|x)=max(0, (min(1, w^Tx+b)))

Any time that w^Th+b strayed outside the unit interval, the gradient of the output of the model with respect to its parameters would be 0. A gradient of 0 is typically problematic because the learning algorithm no longer has a guide for how to improve the corresponding parameters.

the output for inputs outside the unit interval is constant, either 0 or 1, which makes the corresponding gradient 0, resulting in saturation.

\log{\tilde{P}(y)}=yz \quad (1)
\tilde{P}(y)=\exp(yz) \quad (2)
P(y)=\frac{\exp(yz)}{\sum_{y'=0}^1 \exp(y'z)} \quad (3)
P(y)=\sigma((2y-1)z) \quad (4)
where
\sigma(x)=\frac{1}{1+\exp(-x)}

While in many lectures we heard in plain words why P(y=1) equals the sigmoid function of z, the above deduction gives a rigorous reason. From it we can see that y indeed participates in the deduction of the formula, and it later disappears only when it is substituted by its binary values 0 and 1.
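
A quick numerical check of (3) and (4) (z = 1.7 is arbitrary):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

z = 1.7
# P(y) = exp(y*z) / (exp(0*z) + exp(1*z)) for y in {0, 1}
p1 = np.exp(1 * z) / (np.exp(0 * z) + np.exp(1 * z))
p0 = np.exp(0 * z) / (np.exp(0 * z) + np.exp(1 * z))
print(np.isclose(p1, sigmoid((2 * 1 - 1) * z)))   # True
print(np.isclose(p0, sigmoid((2 * 0 - 1) * z)))   # True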

\begin{equation}\begin{split}J(\theta) & = -\log{P(y)} \\ & = -\log{\sigma{((2y-1)z)}} \\ & =\zeta{((1-2y)z)}\end{split}\end{equation}
where
\zeta(x) = log (1 + exp(x))

This derivation makes use of some properties from section 3.10. By rewriting the loss in terms of the softplus function, we can see that it saturates only when (1−2y)z is very negative. Saturation thus occurs only when the model already has the right answer: when y = 1 and z is very positive, or y = 0 and z is very negative.

The softplus function is only slightly different from max(0,x): it is nearly linear for x > 0, and its derivative is the sigmoid function. The sigmoid saturates when z is very positive or very negative, so if the prediction is already correct, this is a fairly reasonable saturation.

When z has the wrong sign, the argument to the softplus function, (1−2y)z, may be simplified to |z|. As |z| becomes large while z has the wrong sign, the softplus function asymptotes toward simply returning its argument |z|. The derivative with respect to z asymptotes to sign(z), so, in the limit of extremely incorrect z, the softplus function does not shrink the gradient at all. This property is very useful because it means that gradient-based learning can act to quickly correct a mistaken z.

When wrong, z is corrected very quickly.

When we use other loss functions, such as mean squared error, the loss can saturate anytime σ(z) saturates. The sigmoid activation function saturates to 0 when z becomes very negative and saturates to 1 when z becomes very positive. The gradient can shrink too small to be useful for learning whenever this happens, whether the model has the correct answer or the incorrect answer. For this reason, maximum likelihood is almost always the preferred approach to training sigmoid output units.

This paragraph summarizes why the log-likelihood loss function plus the sigmoid activation function is always the best choice: it changes the situation in which the sigmoid function saturates whenever z is too positive or too negative, regardless of whether the classification is correct.

softmax(z)_i = \frac{\exp(z_i)}{\sum_j{\exp(z_j)}}
where
z=W^Tx+b
\log softmax(z)_i = z_i-\log{\sum_j{e^{z_j}}}

The intuition we can gain from this approximation is that the negative log-likelihood cost function always strongly penalizes the most active incorrect prediction. If the correct answer already has the largest input to the softmax, then the z_i term and the \log\sum_j \exp(z_j) \approx \max_j z_j = z_i term will roughly cancel.

In multiclass classification, the likelihood value (the softmax output) that "survives" in the cost function is the one paired with the actual y_i (the only nonzero one). If the model is not yet good, this value is far from 1; that happens when z_i is not \max(z), so the two terms don't cancel and the prediction gets penalized.
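
A tiny illustration of this cancellation (the z values are made up):

import numpy as np

z = np.array([0.1, 0.3, 8.0])        # class 2 already dominates
logsumexp = np.log(np.sum(np.exp(z)))
print(z[2] - logsumexp)              # close to 0: barely penalized
print(z[0] - logsumexp)              # very negative: strongly penalized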

Some of the hidden units included in this list are not actually differentiable at all input points. For example, the rectified linear function g(z) = max{0, z} is not differentiable at z = 0. This may seem like it invalidates g for use with a gradient-based learning algorithm. In practice, gradient descent still performs well enough for these models to be used for machine learning tasks. This is in part because neural network training algorithms do not usually arrive at a local minimum of the cost function, but instead merely reduce its value significantly, as shown in figure 4.3. These ideas will be described further in chapter 8. Because we do not expect training to actually reach a point where the gradient is 0, it is acceptable for the minima of the cost function to correspond to points with undefined gradient.

The reason we don't need the activation function to be differentiable at every point is that we DON'T expect the cost function to arrive at a local minimum, but rather to be reduced significantly.

Unless indicated otherwise, most hidden units can be described as accepting a vector of inputs x, computing an affine transformation z=W^Tx+b, and then applying an element-wise nonlinear function g(z). Most hidden units are distinguished from each other only by the choice of the form of the activation function g(z).

Linear transformation + activation transformation

Unlike piecewise linear units, sigmoidal units saturate across most of their domain—they saturate to a high value when z is very positive, saturate to a low value when z is very negative, and are only strongly sensitive to their input when z is near 0. The widespread saturation of sigmoidal units can make gradient-based learning very difficult. For this reason,their use as hidden units in feedforward networks is now discouraged. Their use as output units is compatible with the use of gradient-based learning when an appropriate cost function can undo the saturation of the sigmoid in the output layer.

The big problem with the sigmoid function is that it saturates across most of its range. It used to be the standard activation function, but not any more.

The concept of Borel measurability is beyond the scope of this book; for our purposes it suffices to say that any continuous function on a closed and bounded subset of R^n is Borel measurable and therefore may be approximated by a neural network.

Cool.

However, we are not guaranteed that the training algorithm will be able to learn that function. Even if the MLP is able to represent the function, learning can fail for two different reasons. First, the optimization algorithm used for training.Second, the training algorithm might choose the wrong function due to overfitting.

the bad-optimization case and the too-good-to-be-true (overfitting) case.

Recall from section 5.2.1 that the “no free lunch” theorem shows that there is no universally superior machine learning algorithm. Feedforward networks provide a universal system for representing functions, in the sense that, given a function, there exists a feedforward network that approximates the function. There is no universal procedure for examining a training set of specific examples and choosing a function that will generalize to points not in the training set.

No free lunch.

Barron (1993) provides some bounds on the size of a single-layer network needed to approximate a broad class of functions. Unfortunately, in the worst case, an exponential number of hidden units (possibly with one hidden unit corresponding to each input configuration that needs to be distinguished) may be required.

one unit per record, that is huge.

In summary, a feedforward network with a single layer is sufficient to represent any function, but the layer may be infeasibly large and may fail to learn and generalize correctly. In many circumstances, using deeper models can reduce the number of units required to represent the desired function and can reduce the amount of generalization error.

deeper, simpler.

Choosing a deep model encodes a very general belief that the function we want to learn should involve composition of several simpler functions. This can be interpreted from a representation learning point of view as saying that we believe the learning problem consists of discovering a set of underlying factors of variation that can in turn be described in terms of other, simpler underlying factors of variation. Alternately, we can interpret the use of a deep architecture as expressing a belief that the function we want to learn is a computer program consisting of multiple steps, where each step makes use of the previous step’s output. These intermediate outputs are not necessarily factors of variation, but can instead be analogous to counters or pointers that the network uses to organize its internal processing.

a. We believe the learning problem can be described by a set of underlying factors, which are in turn described by other, simpler factors
b. We believe the learning problem can be solved by sequential steps, where each step accepts the output of the previous step and passes its own output to the next step



TorchCraft

It is a bridge between Torch and StarCraft: Brood War. From the AI programmer's point of view, it is a library that provides:
1. connect()
2. receive()
3. send()

github:

https://github.com/TorchCraft/TorchCraft

paper:

https://arxiv.org/pdf/1611.00625v2.pdf

further readings:
[16, 17]: existing surveys on RTS and starcraft
[26, 23]: research involve simple models or limited experimental settings
[25, 18, 19, 21]: models trained on high skilled players
[1]: low level interface used by Torch to connect to Starcraft



one-hot encoding

a one-hot encoding example

one-hot encoding's objective is to make a string (categorical) field numerical

say a field has three categorical values {'red', 'blue', 'green'}

data with this field could be:
red
blue
blue
green

after the one-hot encoding process, the above data is transformed to
red blue green
1 0 0
0 1 0
0 1 0
0 0 1

Compared to a simple label encoder, which gives each categorical value
a numerical value, one-hot encoding loses no information, but it increases
the number of dimensions as a sacrifice.

import pandas as pd;
import numpy as np;
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

train_data_path = "E:\\python_workspace\\kaggle\\competitions\\ghostsgoblinsghouls\\data\\train\\train.csv"
train_data = pd.read_csv(train_data_path, header=0, names = ["id","bone_length","rotting_flesh","hair_length","has_soul","color","type"], sep=",")
x = np.array(train_data.iloc[:, 1:6])
y = np.array(train_data.iloc[:, 6])

# first we need to use a label encoder to transform strings to numeric values; it's sklearn's one-hot encoding input constraint
color_col = x[:, 4]
encoder = LabelEncoder()
color_col_encoded = encoder.fit_transform(color_col)
print color_col_encoded

# new color_col_encoded overwrites old one
x[:,4] = color_col_encoded

# then we use one-hot encoding, we specify the column that we want to transform
# and set sparse to False, this makes the final result more readable
# the new column will show at the beginning of the matrix
onehotencoder = OneHotEncoder(categorical_features = [4], sparse = False)
color_col_encoded_encoded = onehotencoder.fit_transform(x)
print onehotencoder.n_values_
print color_col_encoded_encoded[0,:]





a small git case

----------Updated 2016/11/25-----------
A better workflow should be:
git remote add upstream XXX.git
git checkout -b my_fix upstream/master
some modifications going on here
git add .
git commit
git push origin my_fix

-------------------------------------------

1. In GitHub, fork a project p
2. Download p
3. Import p in IDE
4. Modify one file of p, say f
5. Using GitBash
5.1.git add f
5.2 git commit -m "msg"
5.3 git push
6. In GitHub, in your forked project, click new 'New pull request' to submit the request
7. Wait patiently for the merge of your request

Notes
Step 5.3 is error-prone:
1.
Problem:
fatal: unable to access 'https://github.com/pldtc325/scikit-learn.git/': Failed to connect to your.proxy.com port 8080: Bad access
Solution:

http://stackoverflow.com/questions/34942310/git-push-failed-to-connect-to-gitlab-com-port-443-bad-access

2.
Problem:
Username for 'https://github.com': pldtc325
remote: Invalid username or password.
fatal: Authentication failed for 'https://github.com/pldtc325/scikit-learn.git/'
Solution:
First, your username is not the email address you use to log in; second, make sure you enter the right password



numpy

import numpy as np

# arange, starts from 0
print np.arange(3) # [0 1 2]

# dim
a = np.array([1])
print "a.ndim", a.ndim # 1

b = np.array([1,2]) # 1
print "b.ndim", b.ndim

c = np.array([[1, 2, 3], [3, 4, 5]]) # 2
print "c.ndim", c.ndim

d = np.array([[[1, 2, 3], [3, 4, 5]],[[6, 7, 8], [9, 10, 11]]]) # 3
print "d.ndim", d.ndim

# shape
print "c.shape", c.shape # (2L, 3L)
print "d.shape", d.shape # (2L, 2L, 3L)

# atleast_1d
# converts inputs to at least 1 dim
# a single input returns an array
# multiple inputs return a list of arrays
e = np.atleast_1d([1,2,3]) # single input, so e is an array
print "e", e, "e.shape", e.shape

e = np.atleast_1d([1, 2, 3], [4, 5, 6]) # multiple inputs, so e is a list of arrays
print "e", e

# reshape
f = np.arange(6) # [0,1,2,3,4,5]
print "f", f, "f.shape", f.shape
f = np.reshape(f, (2,3))
print "f", f, "f.shape", f.shape # (2L, 3L)
f = np.array([[1,2,3],[3,4,5]])
print f.reshape(-1).shape # (6L,)

# zeros
g = np.zeros(3, dtype=np.int16)
print "g", g # [0,0,0]
g = np.zeros((3,1), dtype=np.int16)
print "g", g # [[0]
              #  [0]
              #  [0]]
g = np.zeros((2,3), dtype=np.int16)
print "g", g

# unique
h1 = np.unique(np.array([11,12,13,11,13]))
print "h1", h1
h2 = np.unique(np.array([11,12,13,11,13]), return_inverse=True) # return_inverse gives each original value's index in the unique array, so the original values can be restored
print "h2", h2, "h2_restored", h1[h2[1]]
h3 = np.unique(np.array([11,12,13,11,13]), return_index=True) # return_index gives the indices of the first occurrences of the unique values
print "h3", h3 # indices [0, 1, 2]
h4 = np.unique(np.array([11,12,13,11,13]), return_counts=True)
print "h4", h4 # [2, 1, 2]

# prod
i = np.prod(np.array([2,3]))
print "i", i
i = np.prod(np.array([2,3]), axis=0)
print "i", i
#i = np.prod(np.array([2,3]), axis=1) # ValueError: 'axis' entry is out of bounds





scikit-learn randomforest

Download Anaconda and install it to have scikit-learn right away.

Download PyCharm and install it as the IDE. Note that the professional version is friendly to Cython.

In PyCharm, set the Project Interpreter path to the Anaconda path to make all the Python libs available.

#forest.py
trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                 backend="threading")(
    delayed(_parallel_build_trees)(
        t, self, X, y, sample_weight, i, len(trees),
        verbose=self.verbose, class_weight=self.class_weight)
    for i, t in enumerate(trees))

sampling procedure (utils.class_weight.py, ensemble.forest.py)

Before entering the individual tree-building process, if the class weight is neither "subsample" nor "balanced_subsample", or bootstrap is not used, there is a first sample-weight computation on the original data, in contrast to the later one, which is on the sampled data.

After entering each tree-building process, if bootstrap is used together with "subsample" or "balanced_subsample", there is a second sample-weight computation, this time of course on the sampled data.

Note that y doesn't change during the sample-weight computation; all the information is in the sample weight itself.

criterion computation (tree._criterion.pyx)

A couple of optimizations are going on here (see the sketch after this list):
start ----- pos ------------ end
1. since sum_total = sum_left + sum_right and sum_total can be computed easily, we only need to compute either sum_left or sum_right; if pos is nearer to start than to end, we only compute sum_left because it involves fewer computations, and vice versa

2. once the statistics for a given pos P have been computed, we set P as the new start for the next computation, since the sum_left and sum_right computed at P can be reused as pos moves from left to right
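
A toy sketch of this incremental bookkeeping (the y values are made up; the real code in _criterion.pyx tracks richer per-class statistics):

import numpy as np

y = np.array([3.0, 1.0, 4.0, 1.0, 5.0])   # targets of one node, ordered by the feature
sum_total = y.sum()                        # computed once per node
sum_left = 0.0
for pos in range(1, len(y)):
    sum_left += y[pos - 1]                 # only one new element per step
    sum_right = sum_total - sum_left       # derived, never recomputed from scratch
    print(pos, sum_left, sum_right)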

splitter
In the big picture, the splitter controls the position between start and end, and delegates the split threshold computation to the criterion.

It uses introsort to sort the samples, one of the best sorting algorithms around: a hybrid of quicksort, heapsort, and insertion sort.

sklearn does 2 remarkable things during the splitting process:
1. it maintains a constant-feature vector, which narrows down the candidate feature range well
2. it maintains a single sample array storing indices into X, and uses start, pos, and end to designate each child node's range. Note that after a node is split, the sample array can be reorganized so that the left part belongs to the left child and the right part to the right child

test on choices of parameters
number of trees: ROC increases logarithmically as the number of trees increases exponentially.
bootstrap or not: surprisingly, whether the data is bootstrapped or not doesn't have a significant impact on ROC; in fact, it even shows a slightly negative relationship
max_depth: theoretically, it is recommended to build many, many trees of small size; my data has only 5 fields, I set the number of trees to 10, and with the depth ranging from 1 to 5 the results show a growing trend of ROC as depth increases. Nevertheless, more fields and deeper depths need to be investigated.
min_samples_leaf: too few is not good, nor too many; in my example 5/333 = 1.5% seems good

Or you can use sklearn.grid_search to simplify the process of selecting best parameters.

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.5, random_state=0)

    tuned_parameters = [{'n_estimators': [1,5,10,20,30,100], 'max_depth': [None],
                         'bootstrap': [True, False], 'min_samples_leaf' :[0.01,0.02, 0.03, 0.04, 0.05]}]
    score = "f1"
    clf = GridSearchCV( RandomForestClassifier(), tuned_parameters, cv=30,
                           scoring=score,verbose=2)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

sklearn's GridSearch does nothing but exhaust every possible combination of parameters. Some notes on it:
don't expect it to give you a stable result very soon, because
1. parameters may affect each other; this needs to be considered from the beginning
2. like any data analysis, the result depends heavily on the data; try to increase cv (the number of cross-validation folds)
3. try to use a bigger stride when you design the candidate values, otherwise it may give similar results within a range
4. trying it on more data and more choices may give you a more general sense of parameter tuning




scikit-learn Parallel

scikit-learn provides its Parallel class to facilitate its parallel operations, for example building the trees in the random forest algorithm.

from time import sleep
from sklearn.externals.joblib import Parallel, delayed
if __name__ == '__main__':
    Parallel(n_jobs=2, verbose=10)(delayed(sleep)(1) for i in range(10))

We simply let each task sleep for 1 second and allocate 2 workers to do it.

When verbose = 50, the output is:
[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 0.9s
[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 0.9s
[Parallel(n_jobs=2)]: Done 3 tasks | elapsed: 1.9s
[Parallel(n_jobs=2)]: Done 4 tasks | elapsed: 1.9s
[Parallel(n_jobs=2)]: Done 5 tasks | elapsed: 3.0s
[Parallel(n_jobs=2)]: Done 6 tasks | elapsed: 3.0s
[Parallel(n_jobs=2)]: Done 7 tasks | elapsed: 4.0s
[Parallel(n_jobs=2)]: Done 11 out of 10 | elapsed: 4.0s remaining: 0.0s
[Parallel(n_jobs=2)]: Done 11 out of 10 | elapsed: 5.0s remaining: 0.0s
[Parallel(n_jobs=2)]: Done 11 out of 10 | elapsed: 5.0s remaining: 0.0s
[Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 5.0s finished

note that the lines are printed in pairs since there are 2 workers running at the same time

When verbose = 10, the output is more concise:
[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 0.9s
[Parallel(n_jobs=2)]: Done 4 tasks | elapsed: 1.9s
[Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 4.9s finished

@TODO
try more parameters of Parallel class

know more about Parallel class's root

reference
parallel.py in scikit-learn source code



Bootstrapping and Bagging

Bootstrapping

It is a sampling technique that draws repeated samples, with replacement, from the population of interest, n times.

Bagging

Bagging is an ensemble ML meta-algorithm using the bootstrapping sampling technique (see the sketch after this list):

1. use bootstrapping to sample the data, producing n samples
2. for each sample, fit a model
3. for new data, the n models vote or average their results, in the case of classification or regression respectively
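
A minimal sketch of these three steps for classification (DecisionTreeClassifier is just an example base model; integer class labels are assumed):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bagging_fit(X, y, n_models=10, seed=0):
    rng = np.random.RandomState(seed)
    models = []
    n = len(X)
    for _ in range(n_models):
        idx = rng.randint(0, n, size=n)   # bootstrap: n draws with replacement
        models.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
    return models

def bagging_predict(models, X):
    votes = np.array([m.predict(X) for m in models])   # one row of votes per model
    # majority vote per column, i.e. per test point (assumes integer labels)
    return np.apply_along_axis(lambda v: np.bincount(v).argmax(), 0, votes)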

