0

blackjack

import gym
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
'''
This code illustrates how to evaluate a fixed blackjack policy with
Monte Carlo methods.

The four figures corresponds to Figure 5.2 in the book:

http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf.

The simulator used is the one provided by OpenAI, 

https://github.com/openai/gym/blob/master/gym/envs/toy_text/blackjack.py

The policy used is that the player sticks on 20 or 21 and hits otherwise.
'''


class BlackJackMonteCaroleAgent(object):
    """Every-visit Monte Carlo evaluation of the fixed blackjack policy
    'stick on 20 or 21, otherwise hit'."""

    def __init__(self):
        # Dealer's shown card: ace through 10 (face cards count as 10).
        dealer_diff_cards_num = 10
        # Player sums 12-21; below 12 the player can always hit safely,
        # so those states are not tracked.
        player_diff_cards_num = 10
        # The player either has a usable ACE (counted as 11) or not;
        # note this status can change during the game.
        usable_num = 2
        self.stick = 0
        self.hit = 1
        # Running sum of returns and visit counts per state.
        self.state_value_functions = np.zeros((usable_num, dealer_diff_cards_num, player_diff_cards_num))
        self.state_value_counts = np.zeros((usable_num, dealer_diff_cards_num, player_diff_cards_num))
        # Per-state average return -- this is what we want to evaluate.
        self.avg_state_value_functions = np.zeros((usable_num, dealer_diff_cards_num, player_diff_cards_num))
        # Mappings from raw observation values to array indices.
        self.is_usable_to_index = {True: 1, False: 0}
        self.cur_num_to_index = {}
        for i in range(12, 22):
            self.cur_num_to_index[i] = i - 12

    def take_action(self, ob):
        """Fixed policy: stick on a sum of 20 or 21, hit otherwise."""
        cur_sum, dealer_num, is_usable = ob
        if cur_sum == 20 or cur_sum == 21:
            return self.stick
        return self.hit

    def update(self, obs, reward):
        """Credit the episode return to every visited state (every-visit MC)."""
        for ob in obs:
            cur_sum, dealer_num, is_usable = ob
            if cur_sum < 12:
                continue  # sums below 12 are not tracked
            cur_sum_index = self.cur_num_to_index[cur_sum]
            is_usable_index = self.is_usable_to_index[is_usable]
            dealer_num_index = dealer_num - 1
            self.state_value_counts[is_usable_index][dealer_num_index][cur_sum_index] += 1
            self.state_value_functions[is_usable_index][dealer_num_index][cur_sum_index] += reward

    def summarize(self):
        """Return the per-state average return.

        BUG FIX: the original divided unconditionally, producing NaN (and a
        RuntimeWarning) for states that were never visited; unvisited states
        are now reported as 0.0 instead.
        """
        self.avg_state_value_functions = np.divide(
            self.state_value_functions,
            self.state_value_counts,
            out=np.zeros_like(self.state_value_functions),
            where=self.state_value_counts > 0)
        return self.avg_state_value_functions


# helper function copied from OpenAI code, used for compute observation at the beginning of the game
def sum_hand(hand):
    """Return the current hand total, counting an ace as 11 when usable."""
    raw_total = sum(hand)
    # Inlined usable-ace check: an ace may count as 11 iff that stays <= 21.
    if 1 in hand and raw_total + 10 <= 21:
        return raw_total + 10
    return raw_total


# same as above
def usable_ace(hand):
    """Report whether the hand holds an ace that can count as 11 without busting."""
    if 1 not in hand:
        return False
    return sum(hand) + 10 <= 21


# helper function used to plot
# helper function used to plot
def plot(values):
    """Render a 2x2 grid of 3-D surfaces: (no usable / usable ace) x run length."""
    grid_x, grid_y = np.meshgrid(np.arange(10), np.arange(10))
    fig = plt.figure(figsize=(8, 8))
    episode_num_titles = {0: "10000 episodes", 1: "50000 episodes"}
    is_usable_titles = {0: 'Do Not Have Usable ACE', 1: 'Do Have Usable ACE'}
    subplot_index = 1
    for run_index in range(len(values)):
        for usable_index in range(2):
            axes = fig.add_subplot(2, 2, subplot_index, projection='3d')
            axes.set_title(is_usable_titles[usable_index] + ', ' + episode_num_titles[run_index])
            axes.plot_surface(grid_x, grid_y, values[run_index][usable_index])
            subplot_index += 1
    plt.show()

if __name__ == "__main__":
    # NOTE(review): 'Blackjack-v0' is the env id used by older gym releases;
    # newer gym/gymnasium versions renamed it -- confirm against the installed version.
    env = gym.make('Blackjack-v0')
    agent = BlackJackMonteCaroleAgent()
    # Two evaluation runs of increasing length (cf. Figure 5.2 in the book).
    # NOTE(review): the agent's statistics are NOT reset between the two runs,
    # so the second surface actually reflects 10000 + 50000 accumulated episodes.
    episode_nums = [10000, 50000]
    avgs_state_value_functions = []
    for episode_num in episode_nums:
        for i in range(episode_num):
            reward_sum = 0.0
            env.reset()
            # see what the value of the two cards the player already has at beginning
            cur_sum = sum_hand(env.player)
            # construct the first observation
            ob = cur_sum, env.dealer[0], usable_ace(env.player)
            done = False
            this_suite_obs = []
            # Play one episode under the fixed policy, recording every visited
            # observation and accumulating the (undiscounted) episode return.
            while not done:
                action = agent.take_action(ob)
                old_ob = ob
                this_suite_obs.append(old_ob)
                ob, reward, done, _ = env.step(action)
                reward_sum += reward
            # update state value function statistic using ob right before done
            agent.update(this_suite_obs, reward_sum)

        avgs_state_value_functions.append(agent.summarize())
    plot(avgs_state_value_functions)




xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

octave code

%%%%%%% CS 229 Machine Learning %%%%%%%%%%%
%%%%%%% Programming Assignment 4 %%%%%%%%%%
%%%
%%% Parts of the code (cart and pole dynamics, and the state
%%% discretization) are adapted from code available at the RL repository
%%% http://www-anw.cs.umass.edu/rlr/domains.html
%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% This file controls the pole-balancing simulation. You need to write
% code in places marked "CODE HERE" only.

% Briefly, the main simulation loop in this file calls cart_pole.m for
% simulating the pole dynamics, get_state.m for discretizing the
% otherwise continuous state space in discrete states, and show_cart.m
% for display.

% Some useful parameters are listed below.

% NUM_STATES: Number of states in the discretized state space
% You must assume that states are numbered 1 through NUM_STATES. The
% state numbered NUM_STATES (the last one) is a special state that marks
% the state when the pole has been judged to have fallen (or when the
% cart is out of bounds). However, you should NOT treat this state any
% differently in your code. Any distinctions you need to make between
% states should come automatically from your learning algorithm.

% After each simulation cycle, you are supposed to update the transition
% counts and rewards observed. However, you should not change either
% your value function or the transition probability matrix at each
% cycle.

% Whenever the pole falls, a section of your code below will be
% executed. At this point, you must use the transition counts and reward
% observations that you have gathered to generate a new model for the MDP
% (i.e., transition probabilities and state rewards). After that, you
% must use value iteration to get the optimal value function for this MDP
% model.

% TOLERANCE: Controls the convergence criteria for each value iteration
% run
% In the value iteration, you can assume convergence when the maximum
% absolute change in the value function at any state in an iteration
% becomes lower than TOLERANCE.

% You need to write code that chooses the best action according
% to your current value function, and the current model of the MDP. The
% action must be either 1 or 2 (corresponding to possible directions of
% pushing the cart).

% Finally, we assume that the simulation has converged when
% 'NO_LEARNING_THRESHOLD' consecutive value function computations all
% converged within one value function iteration. Intuitively, it seems
% like there will be little learning after this, so we end the simulation
% here, and say the overall algorithm has converged.

% Learning curves can be generated by calling plot_learning_curve.m (it
% assumes that the learning was just executed, and the array
% time_steps_to_failure that records the time for which the pole was
% balanced before each failure are in memory). num_failures is a variable
% that stores the number of failures (pole drops / cart out of bounds)
% till now.

% Other parameters in the code are described below:

% GAMMA: Discount factor to be used

% The following parameters control the simulation display; you dont
% really need to know about them:

% pause_time: Controls the pause between successive frames of the
% display. Higher values make your simulation slower.
% min_trial_length_to_start_display: Allows you to start the display only
% after the pole has been successfully balanced for at least this many
% trials. Setting this to zero starts the display immediately. Choosing a
% reasonably high value (around 100) can allow you to rush through the
% initial learning quickly, and start the display only after the
% performance is reasonable.

%%%%%%%%%% Simulation parameters %%%%%%%%%%

% Pause between display frames; larger values slow the simulation down.
pause_time = 0.001;
% Only start drawing once a trial has lasted at least this many steps
% (0 = display from the very first step).
min_trial_length_to_start_display = 0;
display_started=0;

% Size of the discretized state space; state NUM_STATES is the special
% "pole fell / cart out of bounds" failure state.
NUM_STATES = 163;

% Discount factor for the MDP.
GAMMA=0.995;

% Value iteration stops when the largest per-state change in one sweep
% falls below this.
TOLERANCE=0.01;

% The simulation ends after this many consecutive one-sweep convergences.
NO_LEARNING_THRESHOLD = 20;

%%%%%%%%%% End parameter list %%%%%%%%%%

% Time cycle of the simulation.
time=0;

% These variables perform bookkeeping (how many cycles was the pole
% balanced for before it fell). Useful for plotting learning curves.
time_steps_to_failure=[];
num_failures=0;
time_at_start_of_current_trial=0;

max_failures=500; % You should reach convergence well before this.

% Starting state is (0 0 0 0).
% x, x_dot, theta, theta_dot represent the actual continuous state vector.
x = 0.0; x_dot = 0.0; theta = 0.0; theta_dot = 0.0;

% state is the discrete state number assigned to this continuous state --
% the learning code only ever uses this representation.
state = get_state(x, x_dot, theta, theta_dot);

% Draw the initial cart position if the display is enabled from the start.
if min_trial_length_to_start_display==0 || display_started==1
show_cart(x, x_dot, theta, theta_dot, pause_time);
end

%%% CODE HERE: Perform all your initializations here %%%

% Assume no transitions or rewards have been observed
% Initialize the value function array to small random values (0 to 0.10,
% say)
% Initialize the transition probabilities uniformly (ie, probability of
% transitioning for state x to state y using action a is exactly
% 1/NUM_STATES). Initialize all state rewards to zero.

% Small random initial values break ties between actions early on.
value_function = rand(1,NUM_STATES) / 10;
% mdp = ones(NUM_STATES, NUM_STATES, 2)/ NUM_STATES;
% NOTE(review): the transition model starts at zero, not uniform as the
% comment above requests; the model-rebuild code later reserves probability
% mass 1/NUM_STATES per unobserved successor but leaves those entries zero.
mdp = zeros(NUM_STATES, NUM_STATES, 2);
rewards = zeros(1, NUM_STATES);
% total_count(s0, s, a): number of observed transitions s0 -> s under action a.
total_count = zeros(NUM_STATES, NUM_STATES, 2);

%for s = 1:NUM_STATES
% if s != NUM_STATES
% rewards(s) = 0;
% else
% rewards(s) = -1;
% endif
%endfor
%%%% END YOUR CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%% CODE HERE (while loop condition) %%%
% The simulation ends once 'NO_LEARNING_THRESHOLD' consecutive value
% function computations have all converged within a single value
% iteration sweep; intuitively there is little learning after that.

% BUG FIX: no_learning_count was only mentioned inside a comment, so the
% while condition below read an undefined variable and the script errored
% on its first evaluation. Initialize the counter before the loop.
no_learning_count = 0;
while (no_learning_count < NO_LEARNING_THRESHOLD)

%%% CODE HERE: Write code to choose action (1 or 2) %%%

% This action choice algorithm is just for illustration. It may
% convince you that reinforcement learning is nice for control
% problems! Replace it with your code to choose an action that is
% optimal according to the current value function, and the current MDP
% model.
% if (theta<0)
% action=1;
% else
% action=2;
% end

% Greedy one-step lookahead: pick the action whose expected next-state
% value (under the current model estimate) is larger. Ties go to action 1.
cur_state = get_state(x, x_dot, theta, theta_dot);
value_action_1 = dot(mdp(cur_state,:,1), value_function);
value_action_2 = dot(mdp(cur_state,:,2), value_function);
if value_action_1 >= value_action_2
action = 1;
else
action = 2;
end
%if num_failures<-20
% if (rand(1) < 0.5)
% action=1;
% else
% action=2;
% end
%end

%%% END YOUR CODE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Get the next state by simulating the dynamics
[x, x_dot, theta, theta_dot] = cart_pole(action, x, x_dot, theta, theta_dot);

% Increment simulation time
time = time + 1;

% Get the state number corresponding to new state vector
new_state = get_state(x, x_dot, theta, theta_dot);

if display_started==1
show_cart(x, x_dot, theta, theta_dot, pause_time);
end

% Reward function to use - do not change this!
% -1 on entering the failure state, 0 everywhere else.
if (new_state==NUM_STATES)
R=-1;
else
%R=-abs(theta)/2.0;
R=0;
end

%%% CODE HERE: Perform updates %%%%%%%%%

% A transition from 'state' to 'new_state' has just been made using
% 'action'. The reward observed in 'new_state' (note) is 'R'.
% Write code to update your statistics about the MDP - i.e., the
% information you are storing on the transitions and on the rewards
% observed. Do not change the actual MDP parameters, except when the
% pole falls (the next if block)!

% Tally the observed transition; the reward is a deterministic function
% of the landing state, so a plain overwrite suffices.
total_count(state, new_state, action) = total_count(state, new_state, action) + 1;
rewards(new_state) = R;
% Recompute MDP model whenever pole falls
% Compute the value function V for the new model
if (new_state==NUM_STATES)

% Update MDP model using the current accumulated statistics about the
% MDP - transitions and rewards.
% Make sure you account for the case when total_count is 0, i.e., a
% state-action pair has never been tried before, or the state has
% never been visited before. In that case, you must not change that
% component (and thus keep it at the initialized uniform distribution).

% NOTE(review): 'dim=2' relies on Octave's inline-assignment-in-argument
% syntax; in MATLAB this is a parse error -- sum(total_count, 2) is the
% portable form.
sum_action = sum(total_count,dim=2);
% NOTE(review): this clobbers the cart position 'x'; it only works because
% the failure branch below always reinitializes x before the next cycle.
% x(s0,a) accumulates probability mass 1/NUM_STATES for every successor
% state never observed from (s0,a).
x = zeros(NUM_STATES, 2);
for s0 = 1:NUM_STATES
for s = 1:NUM_STATES
for a = 1:2
if total_count(s0,s,a) == 0
x(s0,a) += 1/NUM_STATES;
endif
endfor
endfor
endfor
% Scale the empirical frequencies of observed transitions by the leftover
% mass (1 - x). NOTE(review): entries for never-observed successors stay
% zero here even though the comment above asks them to remain uniform, so
% partially observed rows sum to 1-x(s0,a), not 1 -- confirm intent.
for s0 = 1:NUM_STATES
for s = 1:NUM_STATES
for a = 1:2
if total_count(s0,s,a) > 0
mdp(s0, s, a) = (1-x(s0,a)) * total_count(s0,s,a) / sum_action(s0, a);
endif
endfor
endfor
endfor
% Perform value iteration using the new estimated model for the MDP
% The convergence criterion should be based on TOLERANCE as described
% at the top of the file.
% If it converges within one iteration, you may want to update your
% variable that checks when the whole simulation must end
% Value iteration on the newly re-estimated MDP model. Convergence is
% declared when the largest ABSOLUTE change of the value function in one
% sweep drops below TOLERANCE; if that takes a single sweep, bump the
% counter that eventually ends the whole simulation.
max_diff_abs = 100;
iter = 0;
while (max_diff_abs > TOLERANCE)
% Plain assignment copies in Octave (value semantics); repmat was unneeded.
value_function_old = value_function;
for s = 1:NUM_STATES
value_action_1 = dot(mdp(s,:,1), value_function_old);
value_action_2 = dot(mdp(s,:,2), value_function_old);
max_value_action = max(value_action_1, value_action_2);
value_function(s) = rewards(s) + GAMMA * max_value_action;
endfor
% BUG FIX: the original used abs(max(...)), which measures only the largest
% SIGNED change and can report near-zero while large negative changes are
% still occurring; the criterion is the maximum absolute change. Also
% replaced the deprecated Octave-only '.-' operator with plain '-'.
max_diff_abs = max(abs(value_function - value_function_old));
iter = iter + 1;
endwhile
% One-sweep convergence means the model update barely moved the values.
if (iter == 1)
no_learning_count = no_learning_count + 1;
endif

% Brief pause after each model rebuild so the rebuild is observable.
pause(0.2); % You can use this to stop for a while!

end

%%% END YOUR CODE %%%%%%%%%%%%%%%%%%%

% Dont change this code: Controls the simulation, and handles the case
% when the pole fell and the state must be reinitialized
if (new_state == NUM_STATES)
% The unterminated statements below deliberately echo progress to the console.
num_failures = num_failures+1
time_steps_to_failure(num_failures) = time - time_at_start_of_current_trial;
time_at_start_of_current_trial = time;

time_steps_to_failure(num_failures)

% Turn the display on once a trial lasts long enough.
if (time_steps_to_failure(num_failures)> ...
min_trial_length_to_start_display)
display_started=1;
end

% Reinitialize state
% Restart from a uniformly random cart position in [-1.1, 1.1].
x = -1.1 + rand(1)*2.2
%x=0.0;
x_dot = 0.0; theta = 0.0; theta_dot = 0.0;
state = get_state(x, x_dot, theta, theta_dot);
else
state=new_state;
end
end

% Echo the full learning history, then plot it.
time_steps_to_failure
% Plot the learning curve (time balanced vs trial)
plot_learning_curve

xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

assignment3

rnn_layers.py

import numpy as np


"""
This file defines layer types that are commonly used for recurrent neural
networks.
"""


def rnn_step_forward(x, prev_h, Wx, Wh, b):
  """
  Single timestep of a vanilla RNN with a tanh activation.

  Shapes: N = minibatch size, D = input dimension, H = hidden dimension.

  Inputs:
  - x: Input data for this timestep, of shape (N, D)
  - prev_h: Hidden state from previous timestep, of shape (N, H)
  - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
  - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
  - b: Biases of shape (H,)

  Returns a tuple of:
  - next_h: Next hidden state, of shape (N, H)
  - cache: Tuple of values needed for the backward pass.
  """
  affine = x.dot(Wx) + prev_h.dot(Wh) + b
  next_h = np.tanh(affine)
  cache = (affine, Wx, x, prev_h, Wh)
  return next_h, cache


def rnn_step_backward(dnext_h, cache):
  """
  Backward pass for a single timestep of a vanilla RNN.

  Inputs:
  - dnext_h: Gradient of loss with respect to next hidden state, shape (N, H)
  - cache: Cache object from the forward pass

  Returns a tuple of:
  - dx: Gradients of input data, of shape (N, D)
  - dprev_h: Gradients of previous hidden state, of shape (N, H)
  - dWx: Gradients of input-to-hidden weights, of shape (D, H)
  - dWh: Gradients of hidden-to-hidden weights, of shape (H, H)
  - db: Gradients of bias vector, of shape (H,)

  (Docstring fix: dWx was previously documented as (N, H).)
  """
  xWx_plus_hWh, Wx, x, prev_h, Wh = cache
  # tanh'(a) = 1 - tanh(a)^2. BUG FIX: the original 1/cosh(a)**2 overflows
  # (cosh -> inf, with a RuntimeWarning) for |a| beyond ~710; the tanh form
  # is numerically stable and mathematically identical.
  dtanx = dnext_h * (1.0 - np.tanh(xWx_plus_hWh) ** 2)
  db = np.sum(dtanx, axis=0)
  dx = dtanx.dot(Wx.T)
  dWx = x.T.dot(dtanx)
  dWh = prev_h.T.dot(dtanx)
  dprev_h = dtanx.dot(Wh.T)
  return dx, dprev_h, dWx, dWh, db


def rnn_forward(x, h0, Wx, Wh, b):
  """
  Run a vanilla RNN forward over an entire sequence. The input sequence has
  T vectors of dimension D, the hidden size is H, and the minibatch holds N
  sequences; the hidden states of every timestep are returned.

  Inputs:
  - x: Input data for the entire timeseries, of shape (N, T, D).
  - h0: Initial hidden state, of shape (N, H)
  - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
  - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
  - b: Biases of shape (H,)

  Returns a tuple of:
  - h: Hidden states for the entire timeseries, of shape (N, T, H).
  - cache: Values needed in the backward pass (one entry per timestep)
  """
  N, T, D = x.shape
  H = h0.shape[1]
  h = np.zeros((N, T, H))
  cache = []
  prev = h0
  # Single tanh recurrence, unrolled over time (step logic inlined).
  for step in range(T):
    xt = x[:, step, :]
    affine = xt.dot(Wx) + prev.dot(Wh) + b
    nxt = np.tanh(affine)
    cache.append((affine, Wx, xt, prev, Wh))
    h[:, step, :] = nxt
    prev = nxt
  return h, cache


def rnn_backward(dh, cache):
  """
  Backward pass for a vanilla RNN over an entire sequence of data.

  Inputs:
  - dh: Upstream gradients of all hidden states, of shape (N, T, H)
  - cache: Per-timestep caches produced by rnn_forward

  Returns a tuple of:
  - dx: Gradient of inputs, of shape (N, T, D)
  - dh0: Gradient of initial hidden state, of shape (N, H)
  - dWx: Gradient of input-to-hidden weights, of shape (D, H)
  - dWh: Gradient of hidden-to-hidden weights, of shape (H, H)
  - db: Gradient of biases, of shape (H,)
  """
  N, T, _ = dh.shape
  _, Wx, _, prev_h, Wh = cache[-1]
  D = Wx.shape[0]
  dx = np.zeros((N, T, D))
  dWx = np.zeros_like(Wx)
  dWh = np.zeros_like(Wh)
  db = np.zeros(Wx.shape[1])
  dcarry = np.zeros_like(prev_h)
  # Walk the timesteps in reverse, threading the hidden-state gradient
  # backwards and accumulating the shared parameter gradients.
  for t in reversed(range(T)):
    dxt, dcarry, dWx_t, dWh_t, db_t = rnn_step_backward(dh[:, t, :] + dcarry, cache[t])
    dx[:, t, :] = dxt
    dWx += dWx_t
    dWh += dWh_t
    db += db_t
  # h0 itself receives no upstream dh; its gradient is the final carry.
  dh0 = dcarry
  return dx, dh0, dWx, dWh, db


def word_embedding_forward(x, W):
  """
  Forward pass for word embeddings. We operate on minibatches of size N where
  each sequence has length T, over a vocabulary of V words each mapped to a
  vector of dimension D.

  Inputs:
  - x: Integer array of shape (N, T) giving indices of words. Each element idx
    of x must be in the range 0 <= idx < V.
  - W: Weight matrix of shape (V, D) giving word vectors for all words.

  Returns a tuple of:
  - out: Array of shape (N, T, D) giving word vectors for all input words.
  - cache: Values needed for the backward pass
  """
  V, D = W.shape
  # Fancy indexing gathers one row of W per index in x.
  out = W[x]
  cache = (x, W, V, D)
  return out, cache


def word_embedding_backward(dout, cache):
  """
  Backward pass for word embeddings. We cannot back-propagate into the word
  indices since they are integers, so we only return the gradient of the
  word embedding matrix.

  Inputs:
  - dout: Upstream gradients of shape (N, T, D)
  - cache: Values from the forward pass

  Returns:
  - dW: Gradient of word embedding matrix, of shape (V, D).
  """
  x, W, V, D = cache
  dW = np.zeros((V, D))
  # np.add.at performs an unbuffered scatter-add, so repeated indices in x
  # accumulate correctly (plain dW[x] += dout would drop duplicates). This
  # replaces the original O(N*T) Python double loop, as the forward-pass
  # hint suggested.
  np.add.at(dW, x, dout)
  return dW


def sigmoid(x):
  """
  Numerically stable logistic sigmoid.

  Evaluates exp only on negative arguments so it can never overflow:
  for x >= 0, sigmoid(x) = 1 / (1 + exp(-x));
  for x <  0, sigmoid(x) = exp(x) / (1 + exp(x)).
  """
  nonneg = x >= 0
  neg = x < 0
  exp_term = np.zeros_like(x)
  exp_term[nonneg] = np.exp(-x[nonneg])
  exp_term[neg] = np.exp(x[neg])
  numer = np.ones_like(x)
  numer[neg] = exp_term[neg]
  return numer / (1 + exp_term)


def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
  """
  Forward pass for a single timestep of an LSTM.

  The input data has dimension D, the hidden state has dimension H, and we
  use a minibatch size of N.

  Inputs:
  - x: Input data, of shape (N, D)
  - prev_h: Previous hidden state, of shape (N, H)
  - prev_c: previous cell state, of shape (N, H)
  - Wx: Input-to-hidden weights, of shape (D, 4H)
  - Wh: Hidden-to-hidden weights, of shape (H, 4H)
  - b: Biases, of shape (4H,)

  Returns a tuple of:
  - next_h: Next hidden state, of shape (N, H)
  - next_c: Next cell state, of shape (N, H)
  - cache: Tuple of values needed for backward pass.
  """
  N, H = prev_h.shape
  # One fused affine transform produces all four gate pre-activations.
  gates = x.dot(Wx) + prev_h.dot(Wh) + b
  ai = gates[:, 0:H]
  af = gates[:, H:2 * H]
  ao = gates[:, 2 * H:3 * H]
  ag = gates[:, 3 * H:4 * H]

  # input / forget / output gates squash through sigmoid; candidate via tanh.
  i, f, o, g = sigmoid(ai), sigmoid(af), sigmoid(ao), np.tanh(ag)

  f_mul_prev_c = f * prev_c
  i_mul_g = i * g
  next_c = f_mul_prev_c + i_mul_g
  tanh_next_c = np.tanh(next_c)
  next_h = o * tanh_next_c

  cache = (next_h, next_c, tanh_next_c, next_c, i_mul_g, f_mul_prev_c,
           i, f, o, g, prev_c, prev_h, ai, af, ao, ag, x, Wx, Wh)
  return next_h, next_c, cache


def lstm_step_backward(dnext_h, dnext_c, cache):
  """
  Backward pass for a single timestep of an LSTM.

  Inputs:
  - dnext_h: Gradients of next hidden state, of shape (N, H)
  - dnext_c: Gradients of next cell state, of shape (N, H)
  - cache: Values from the forward pass

  Returns a tuple of:
  - dx: Gradient of input data, of shape (N, D)
  - dprev_h: Gradient of previous hidden state, of shape (N, H)
  - dprev_c: Gradient of previous cell state, of shape (N, H)
  - dWx: Gradient of input-to-hidden weights, of shape (D, 4H)
  - dWh: Gradient of hidden-to-hidden weights, of shape (H, 4H)
  - db: Gradient of biases, of shape (4H,)
  """
  N, H = dnext_h.shape
  (next_h, next_c, tanh_next_c, next_c, i_mul_g, f_mul_prev_c, i, f, o, g,
   prev_c, prev_h, ai, af, ao, ag, x, Wx, Wh) = cache

  # Total cell gradient: upstream dnext_c plus the path through
  # next_h = o * tanh(next_c). BUG FIX: using the cached tanh values,
  # tanh'(z) = 1 - tanh(z)^2, instead of 1/cosh(z)**2 which overflows
  # (with a RuntimeWarning) for large |z|; mathematically identical.
  dnext_c_all = dnext_c + dnext_h * o * (1.0 - tanh_next_c ** 2)

  # Gate pre-activation gradients; sigmoid'(a) = s*(1-s) in terms of output.
  dai = dnext_c_all * g * (i * (1 - i))
  daf = dnext_c_all * prev_c * (f * (1 - f))
  dao = dnext_h * tanh_next_c * (o * (1 - o))
  dag = dnext_c_all * i * (1.0 - g ** 2)

  da = np.zeros((N, 4 * H))
  da[:, 0:H] = dai
  da[:, H:2 * H] = daf
  da[:, 2 * H:3 * H] = dao
  da[:, 3 * H:4 * H] = dag

  dWx = x.T.dot(da)
  dWh = prev_h.T.dot(da)
  db = np.sum(da, axis=0)
  dprev_c = dnext_c_all * f
  dprev_h = da.dot(Wh.T)
  dx = da.dot(Wx.T)

  return dx, dprev_h, dprev_c, dWx, dWh, db


def lstm_forward(x, h0, Wx, Wh, b):
  """
  Forward pass for an LSTM over an entire sequence of data. The input
  sequence has T vectors of dimension D, the hidden size is H, and the
  minibatch holds N sequences; hidden states for every timestep are returned.

  Note that the initial cell state is not passed as input: it is set to
  zero. The cell state is also not returned; it is an internal variable of
  the LSTM and is not accessed from outside.

  Inputs:
  - x: Input data of shape (N, T, D)
  - h0: Initial hidden state of shape (N, H)
  - Wx: Weights for input-to-hidden connections, of shape (D, 4H)
  - Wh: Weights for hidden-to-hidden connections, of shape (H, 4H)
  - b: Biases of shape (4H,)

  Returns a tuple of:
  - h: Hidden states for all timesteps of all sequences, of shape (N, T, H)
  - cache: Values needed for the backward pass (one entry per timestep).
  """
  N, T, D = x.shape
  H = h0.shape[1]
  h = np.zeros((N, T, H))
  cache = []
  hidden = h0
  cell = np.zeros((N, H))  # initial cell state is all zeros
  for step in range(T):
    hidden, cell, step_cache = lstm_step_forward(x[:, step, :], hidden, cell, Wx, Wh, b)
    h[:, step, :] = hidden
    cache.append(step_cache)
  return h, cache


def lstm_backward(dh, cache):
  """
  Backward pass for an LSTM over an entire sequence of data.

  Inputs:
  - dh: Upstream gradients of hidden states, of shape (N, T, H)
  - cache: Values from the forward pass

  Returns a tuple of:
  - dx: Gradient of input data of shape (N, T, D)
  - dh0: Gradient of initial hidden state of shape (N, H)
  - dWx: Gradient of input-to-hidden weight matrix of shape (D, 4H)
  - dWh: Gradient of hidden-to-hidden weight matrix of shape (H, 4H)
  - db: Gradient of biases, of shape (4H,)
  """
  N, T, _ = dh.shape
  # Pull shape carriers out of the last step's cache tuple
  # (indices: 10 = prev_c, 11 = prev_h, 17 = Wx, 18 = Wh).
  last = cache[-1]
  prev_c, prev_h, Wx, Wh = last[10], last[11], last[17], last[18]
  D = Wx.shape[0]
  dx = np.zeros((N, T, D))
  dWx = np.zeros_like(Wx)
  dWh = np.zeros_like(Wh)
  db = np.zeros(Wx.shape[1])
  dcarry_h = np.zeros_like(prev_h)
  dcarry_c = np.zeros_like(prev_c)
  # Walk timesteps in reverse, threading both hidden- and cell-state
  # gradients backwards while accumulating shared parameter gradients.
  for t in reversed(range(T)):
    dxt, dcarry_h, dcarry_c, dWx_t, dWh_t, db_t = lstm_step_backward(
        dh[:, t, :] + dcarry_h, dcarry_c, cache[t])
    dx[:, t, :] = dxt
    dWx += dWx_t
    dWh += dWh_t
    db += db_t
  # h0 itself receives no upstream dh; its gradient is the final carry.
  dh0 = dcarry_h
  return dx, dh0, dWx, dWh, db


def temporal_affine_forward(x, w, b):
  """
  Forward pass for a temporal affine layer. The input is a set of D-dimensional
  vectors arranged into a minibatch of N timeseries, each of length T. We use
  an affine function to transform each of those vectors into a new vector of
  dimension M.

  Inputs:
  - x: Input data of shape (N, T, D)
  - w: Weights of shape (D, M)
  - b: Biases of shape (M,)

  Returns a tuple of:
  - out: Output data of shape (N, T, M)
  - cache: Values needed for the backward pass
  """
  N, T, D = x.shape
  M = b.shape[0]
  # Collapse the batch and time axes, apply the affine map, then restore them.
  flat = x.reshape(N * T, D)
  out = (flat.dot(w) + b).reshape(N, T, M)
  cache = x, w, b, out
  return out, cache


def temporal_affine_backward(dout, cache):
  """
  Backward pass for temporal affine layer.

  Input:
  - dout: Upstream gradients of shape (N, T, M)
  - cache: Values from forward pass

  Returns a tuple of:
  - dx: Gradient of input, of shape (N, T, D)
  - dw: Gradient of weights, of shape (D, M)
  - db: Gradient of biases, of shape (M,)
  """
  x, w, b, out = cache
  N, T, D = x.shape
  M = b.shape[0]

  # Work on 2-D views with batch and time collapsed.
  dout2d = dout.reshape(N * T, M)
  x2d = x.reshape(N * T, D)

  dx = dout2d.dot(w.T).reshape(N, T, D)
  dw = x2d.T.dot(dout2d)      # (D, M)
  db = dout2d.sum(axis=0)     # (M,)

  return dx, dw, db


def temporal_softmax_loss(x, y, mask, verbose=False):
  """
  A temporal version of softmax loss for use in RNNs. We assume that we are
  making predictions over a vocabulary of size V for each timestep of a
  timeseries of length T, over a minibatch of size N. The input x gives scores
  for all vocabulary elements at all timesteps, and y gives the indices of the
  ground-truth element at each timestep. We use a cross-entropy loss at each
  timestep, summing the loss over all timesteps and averaging across the
  minibatch.

  As an additional complication, we may want to ignore the model output at some
  timesteps, since sequences of different length may have been combined into a
  minibatch and padded with NULL tokens. The optional mask argument tells us
  which elements should contribute to the loss.

  Inputs:
  - x: Input scores, of shape (N, T, V)
  - y: Ground-truth indices, of shape (N, T) where each element is in the range
       0 <= y[i, t] < V
  - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not
    the scores at x[i, t] should contribute to the loss.
  - verbose: If True, print the shape of the flat gradient (debugging aid).

  Returns a tuple of:
  - loss: Scalar giving loss
  - dx: Gradient of loss with respect to scores x.
  """

  N, T, V = x.shape

  x_flat = x.reshape(N * T, V)
  y_flat = y.reshape(N * T)
  mask_flat = mask.reshape(N * T)

  # Shift by the per-row max before exponentiating for numerical stability.
  probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))
  probs /= np.sum(probs, axis=1, keepdims=True)
  # Cross-entropy at the ground-truth entries; masked (NULL-padded) timesteps
  # contribute nothing. The sum is averaged over the minibatch only, not T.
  loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
  # Softmax + cross-entropy gradient: probs with 1 subtracted at the correct
  # class, scaled by 1/N, zeroed where the mask is False.
  dx_flat = probs.copy()
  dx_flat[np.arange(N * T), y_flat] -= 1
  dx_flat /= N
  dx_flat *= mask_flat[:, None]

  if verbose:
    # Call form works under both Python 2 and Python 3; the original
    # `print 'dx_flat: ', dx_flat.shape` statement is a SyntaxError on Py3.
    print('dx_flat: %s' % (dx_flat.shape,))

  dx = dx_flat.reshape(N, T, V)

  return loss, dx


rnn.py

import numpy as np

from cs231n.layers import *
from cs231n.rnn_layers import *
from cs231n.layer_utils import *

class CaptioningRNN(object):
  """
  A CaptioningRNN produces captions from image features using a recurrent
  neural network.

  The RNN receives input vectors of size D, has a vocab size of V, works on
  sequences of length T, has an RNN hidden dimension of H, uses word vectors
  of dimension W, and operates on minibatches of size N.

  Note that we don't use any regularization for the CaptioningRNN.
  """
  
  def __init__(self, word_to_idx, input_dim=512, wordvec_dim=128,
               hidden_dim=128, cell_type='rnn', dtype=np.float32):
    """
    Construct a new CaptioningRNN instance.

    Inputs:
    - word_to_idx: A dictionary giving the vocabulary. It contains V entries,
      and maps each string to a unique integer in the range [0, V).
    - input_dim: Dimension D of input image feature vectors.
    - wordvec_dim: Dimension W of word vectors.
    - hidden_dim: Dimension H for the hidden state of the RNN.
    - cell_type: What type of RNN to use; either 'rnn' or 'lstm'.
    - dtype: numpy datatype to use; use float32 for training and float64 for
      numeric gradient checking.
    """
    if cell_type not in {'rnn', 'lstm'}:
      raise ValueError('Invalid cell_type "%s"' % cell_type)
    
    self.cell_type = cell_type
    self.dtype = dtype
    self.word_to_idx = word_to_idx
    # Python 2 dict API (iteritems); builds the inverse index -> word map.
    self.idx_to_word = {i: w for w, i in word_to_idx.iteritems()}
    self.params = {}
    
    vocab_size = len(word_to_idx)

    # NOTE(review): the special-token names appear to have been stripped by
    # the blog export (presumably '<NULL>', '<START>', '<END>' originally);
    # as written all three look up the empty string '' -- confirm against
    # the original assignment source before relying on this.
    self._null = word_to_idx['']
    self._start = word_to_idx.get('', None)
    self._end = word_to_idx.get('', None)
    
    # Initialize word vectors
    self.params['W_embed'] = np.random.randn(vocab_size, wordvec_dim)
    self.params['W_embed'] /= 100
    
    # Initialize CNN -> hidden state projection parameters
    self.params['W_proj'] = np.random.randn(input_dim, hidden_dim)
    self.params['W_proj'] /= np.sqrt(input_dim)
    self.params['b_proj'] = np.zeros(hidden_dim)

    # Initialize parameters for the RNN
    # LSTM needs 4H columns (one slice per gate); vanilla RNN needs H.
    dim_mul = {'lstm': 4, 'rnn': 1}[cell_type]
    self.params['Wx'] = np.random.randn(wordvec_dim, dim_mul * hidden_dim)
    self.params['Wx'] /= np.sqrt(wordvec_dim)
    self.params['Wh'] = np.random.randn(hidden_dim, dim_mul * hidden_dim)
    self.params['Wh'] /= np.sqrt(hidden_dim)
    self.params['b'] = np.zeros(dim_mul * hidden_dim)
    
    # Initialize output to vocab weights
    self.params['W_vocab'] = np.random.randn(hidden_dim, vocab_size)
    self.params['W_vocab'] /= np.sqrt(hidden_dim)
    self.params['b_vocab'] = np.zeros(vocab_size)
      
    # Cast parameters to correct dtype
    for k, v in self.params.iteritems():
      self.params[k] = v.astype(self.dtype)


  def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN (or LSTM) to compute
    loss and gradients on all parameters.
    
    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T) where
      each element is in the range 0 <= y[i, t] < V
      
    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Cut captions into two pieces: captions_in has everything but the last word
    # and will be input to the RNN; captions_out has everything but the first
    # word and this is what we will expect the RNN to generate. These are offset
    # by one relative to each other because the RNN should produce word (t+1)
    # after receiving word t. The first element of captions_in will be the START
    # token, and the first element of captions_out will be the first word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]
    #print "caption", printCaption(captions,self.idx_to_word)
    #print "caption_in", printCaption(captions_in,self.idx_to_word)
    #print "caption_out", printCaption(captions_out,self.idx_to_word)

    # You'll need this 
    # True wherever the expected output is a real word (not NULL padding).
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to initial
    # hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    
    # Word embedding matrix
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    
    loss, grads = 0.0, {}

    ############################################################################
    # TODO: Implement the forward and backward passes for the CaptioningRNN.   #
    # In the forward pass you will need to do the following:                   #
    # (1) Use an affine transformation to compute the initial hidden state     #
    #     from the image features. This should produce an array of shape (N, H)#
    # (2) Use a word embedding layer to transform the words in captions_in     #
    #     from indices to vectors, giving an array of shape (N, T, W).         #
    # (3) Use either a vanilla RNN or LSTM (depending on self.cell_type) to    #
    #     process the sequence of input word vectors and produce hidden state  #
    #     vectors for all timesteps, producing an array of shape (N, T, H).    #
    # (4) Use a (temporal) affine transformation to compute scores over the    #
    #     vocabulary at every timestep using the hidden states, giving an      #
    #     array of shape (N, T, V).                                            #
    # (5) Use (temporal) softmax to compute loss using captions_out, ignoring  #
    #     the points where the output word is  using the mask above.     #
    #                                                                          #
    # In the backward pass you will need to compute the gradient of the loss   #
    # with respect to all model parameters. Use the loss and grads variables   #
    # defined above to store loss and gradients; grads[k] should give the      #
    # gradients for self.params[k].                                            #
    ############################################################################

    # START LOSS COMPUTATION
    # features (N, D)
    #print "features",features.shape

    #print "W_proj",W_proj.shape,"b_proj",b_proj.shape
    # (1)
    # Inputs
    # features: (N, D)
    # W_proj: (D, H)
    # b_proj: (H,)

    # Output
    # init_h: (N, H)
    # cache_inith: features, W_proj, b_proj
    init_h,cache_inith = affine_forward(features, W_proj, b_proj)

    #print "captions_in", captions_in.shape
    #print "W_embed", W_embed.shape
    # (2)
    # Inputs
    # captions_in: (N, T-1), last being cut
    # W_embed: (V, W)

    # Output
    # captions_in_vec: (N, T, W)
    # cache_captions_in_vec: x, W_embed, V, W
    captions_in_vec, cache_captions_in_vec = word_embedding_forward(captions_in, W_embed)

    #print "Wx", Wx.shape
    #print "Wh",Wh.shape
    #print "b",b.shape
    # (3)
    # Inputs
    # x: (N, T, W)
    # h0: init_h (N, H)
    # Wx: (W, H)
    # Wh: (H, H)
    # b: (H,)

    # Outputs
    # hs: (N, T, H)
    # cache_hs: list of dx, dprev_h, dWx, dWh, db
    if self.cell_type == "rnn":
      hs, cache_hs = rnn_forward(captions_in_vec, init_h, Wx, Wh, b)
    if self.cell_type == "lstm":
      hs, cache_hs = lstm_forward(captions_in_vec, init_h, Wx, Wh, b)

    #print "hs",hs.shape
    #print "W_vocab",W_vocab.shape
    #print "b_vocab",b_vocab.shape
    # (4)
    # Inputs
    # hs: (N, T, H)
    # W_vocab: (H, V)
    # b_vocab: (V)

    # Output
    # vocab_scores: (N ,T ,V)
    # vocab_scores_cache: hs, W_vocab, b_vocab, vocab_scores
    vocab_scores, vocab_scores_cache = temporal_affine_forward(hs, W_vocab, b_vocab)

    # (5)
    # Inputs
    # vocab_scores: (N ,T ,V)
    # captions_out: (N ,T ,V)

    # Output
    # loss: float
    # dout: (N ,T ,V)
    loss, dout = temporal_softmax_loss(vocab_scores, captions_out, mask)

    # START GRADIENT COMPUTATION
    # Backward passes run in reverse order of the forward steps above.
    # (4)
    dhs, dW_vocab, db_vocab = temporal_affine_backward(dout, vocab_scores_cache)

    # (3)
    if self.cell_type == "rnn":
      dcaptions_in_vec, dinit_h, dWx, dWh, db = rnn_backward(dhs, cache_hs)
    if self.cell_type == "lstm":
      dcaptions_in_vec, dinit_h, dWx, dWh, db = lstm_backward(dhs, cache_hs)

    # (2)
    dW_embed = word_embedding_backward(dcaptions_in_vec, cache_captions_in_vec)

    # (1)
    dfeatures, dW_proj, db_proj = affine_backward(dinit_h, cache_inith)

    grads['W_vocab'] = dW_vocab
    grads['b_vocab'] = db_vocab

    grads['Wx'] = dWx
    grads['Wh'] = dWh
    grads['b'] = db

    grads['W_embed'] = dW_embed
    grads['W_proj'] = dW_proj
    grads['b_proj'] = db_proj

    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    
    return loss, grads


  def sample(self, features, max_length=30):
    """
    Run a test-time forward pass for the model, sampling captions for input
    feature vectors.

    At each timestep, we embed the current word, pass it and the previous hidden
    state to the RNN to get the next hidden state, use the hidden state to get
    scores for all vocab words, and choose the word with the highest score as
    the next word. The initial hidden state is computed by applying an affine
    transform to the input image features, and the initial word is the 
    token.

    For LSTMs you will also have to keep track of the cell state; in that case
    the initial cell state should be zero.

    Inputs:
    - features: Array of input image features of shape (N, D).
    - max_length: Maximum length T of generated captions.

    Returns:
    - captions: Array of shape (N, max_length) giving sampled captions,
      where each element is an integer in the range [0, V). The first element
      of captions should be the first sampled word, not the  token.
    """
    N = features.shape[0]
    # Pre-fill every caption slot with the NULL token.
    captions = self._null * np.ones((N, max_length), dtype=np.int32)
    print "N",N
    print "max_length",max_length
    print "captions",captions

    # set first word 'START'
    for i in range(0, N):
        captions[i, 0]= self._start

    # Unpack parameters
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    W_embed = self.params['W_embed']
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    
    ###########################################################################
    # TODO: Implement test-time sampling for the model. You will need to      #
    # initialize the hidden state of the RNN by applying the learned affine   #
    # transform to the input image features. The first word that you feed to  #
    # the RNN should be the  token; its value is stored in the         #
    # variable self._start. At each timestep you will need to do to:          #
    # (1) Embed the previous word using the learned word embeddings           #
    # (2) Make an RNN step using the previous hidden state and the embedded   #
    #     current word to get the next hidden state.                          #
    # (3) Apply the learned affine transformation to the next hidden state to #
    #     get scores for all words in the vocabulary                          #
    # (4) Select the word with the highest score as the next word, writing it #
    #     to the appropriate slot in the captions variable                    #
    #                                                                         #
    # For simplicity, you do not need to stop generating after an  token #
    # is sampled, but you can if you want to.                                 #
    #                                                                         #
    # HINT: You will not be able to use the rnn_forward or lstm_forward       #
    # functions; you'll need to call rnn_step_forward or lstm_step_forward in #
    # a loop.                                                                 #
    ###########################################################################
    # image features to initial hidden state

    init_h, cache_inith = affine_forward(features, W_proj, b_proj)
    # NOTE(review): embeds the whole captions buffer up-front, so timestep t
    # uses the embedding of whatever token was written at column t on the
    # previous iteration only because columns are filled in order.
    captions_in_vec, cache_captions_in_vec = word_embedding_forward(captions, W_embed)
    prev_h = init_h
    # LSTM cell state starts at zero; unused for the vanilla RNN.
    prev_c = np.zeros(prev_h.shape)
    for t in range (0, max_length-1):
      if self.cell_type == "rnn":
        h, _ = rnn_step_forward(captions_in_vec[:,t,:], prev_h, Wx, Wh, b)
      if self.cell_type == "lstm":
        h, next_c, _ = lstm_step_forward(captions_in_vec[:,t,:], prev_h, prev_c, Wx, Wh, b)
      N, D = h.shape
      # Reshape to (N, 1, D) so the temporal affine layer can score one step.
      h = h.reshape((N, 1, D))
      vocab_scores, _ = temporal_affine_forward(h, W_vocab, b_vocab)
      # Greedy decoding: pick the highest-scoring word for the next slot.
      highest_prob_vocab_scores = np.argmax(vocab_scores, axis=2)
      captions[:, t+1] = highest_prob_vocab_scores.reshape(N)
      prev_h = h.reshape((N,D))
      if self.cell_type == "lstm":
        prev_c = next_c
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    return captions

Saliency map

def compute_saliency_maps(X, y, model):
  """
  Compute a class saliency map using the model for images X and labels y.
  
  Input:
  - X: Input images, of shape (N, 3, H, W)
  - y: Labels for X, of shape (N,)
  - model: A PretrainedCNN that will be used to compute the saliency map.
  
  Returns:
  - saliency: An array of shape (N, H, W) giving the saliency maps for the input
    images.
  """
  saliency = None
  ##############################################################################
  # TODO: Implement this function. You should use the forward and backward     #
  # methods of the PretrainedCNN class, and compute gradients with respect to  #
  # the unnormalized class score of the ground-truth classes in y.             #
  ##############################################################################
  # Forward pass: unnormalized class scores, shape (N, C).
  unnormalized_class_score, cache = model.forward(X)
  # Backprop a one-hot dout selecting each image's ground-truth class, so dX
  # is the gradient of the true-class score with respect to the input pixels.
  dout = np.zeros(unnormalized_class_score.shape)
  dout[np.arange(dout.shape[0]), y] = 1
  dX, grads = model.backward(dout, cache)
  # Saliency (Simonyan et al. 2014) is the max over color channels of the
  # ABSOLUTE gradient. The original took np.max on signed values, which
  # discards pixels whose strongest channel gradient is negative.
  saliency = np.max(np.abs(dX), axis=1)
  ##############################################################################
  #                             END OF YOUR CODE                               #
  ##############################################################################
  return saliency
def make_fooling_image(X, target_y, model):
  """
  Generate a fooling image that is close to X, but that the model classifies
  as target_y.
  
  Inputs:
  - X: Input image, of shape (1, 3, 64, 64)
  - target_y: An integer in the range [0, 100)
  - model: A PretrainedCNN
  
  Returns:
  - X_fooling: An image that is close to X, but that is classifed as target_y
    by the model.
  """
  X_fooling = X.copy()
  ##############################################################################
  # TODO: Generate a fooling image X_fooling that the model will classify as   #
  # the class target_y. Use gradient ascent on the target class score, using   #
  # the model.forward method to compute scores and the model.backward method   #
  # to compute image gradients.                                                #
  #                                                                            #
  # HINT: For most examples, you should be able to generate a fooling image    #
  # in fewer than 100 iterations of gradient ascent.                           #
  ##############################################################################
  print 'y',y
  print 'target_y',target_y
  X1 = X.copy()
  for i in range(0,100):
    print "i =",i,
    unnormalized_class_score, cache = model.forward(X1)
    print "cur score =",unnormalized_class_score[0][target_y]," max =",np.max(unnormalized_class_score)
    # increasing the target_score while decreasing the rest 
    dout = np.full(unnormalized_class_score.shape,-1)
    dout[np.arange(dout.shape[0]),target_y] = 1
    
    dX, grads = model.backward(dout, cache)
    # do not forget the learning rate!
    X1 += 1000 * dX
    if np.argmax(unnormalized_class_score,axis=1) == target_y:
        print "fooling success at i =",i
        X_fooling = X1
        break
  ##############################################################################
  #                             END OF YOUR CODE                               #
  ##############################################################################
  return X_fooling

Class visualization

def create_class_visualization(target_y, model, **kwargs):
  """
  Perform optimization over the image to generate class visualizations.
  
  Inputs:
  - target_y: Integer in the range [0, 100) giving the target class
  - model: A PretrainedCNN that will be used for generation
  
  Keyword arguments:
  - learning_rate: Floating point number giving the learning rate
  - blur_every: An integer; how often to blur the image as a regularizer
  - l2_reg: Floating point number giving L2 regularization strength on the image;
    this is lambda in the equation above.
  - max_jitter: How much random jitter to add to the image as regularization
  - num_iterations: How many iterations to run for
  - show_every: How often to show the image

  Returns:
  - X: The generated image of shape (1, 3, 64, 64).
  """
  
  learning_rate = kwargs.pop('learning_rate', 10000)
  blur_every = kwargs.pop('blur_every', 1)
  l2_reg = kwargs.pop('l2_reg', 1e-6)
  max_jitter = kwargs.pop('max_jitter', 4)
  num_iterations = kwargs.pop('num_iterations', 100)
  show_every = kwargs.pop('show_every', 25)
  
  X = np.random.randn(1, 3, 64, 64)
  for t in xrange(num_iterations):
    # As a regularizer, add random jitter to the image
    ox, oy = np.random.randint(-max_jitter, max_jitter+1, 2)
    X = np.roll(np.roll(X, ox, -1), oy, -2)

    dX = None
    ############################################################################
    # TODO: Compute the image gradient dX of the image with respect to the     #
    # target_y class score. This should be similar to the fooling images. Also #
    # add L2 regularization to dX and update the image X using the image       #
    # gradient and the learning rate.                                          #
    ############################################################################
    # Gradient of the target class score w.r.t. the image: backprop a one-hot
    # dout selecting target_y. np.zeros gives a float dout (the original's
    # np.full(..., 0) produced an integer array).
    unnormalized_class_score, cache = model.forward(X)
    dout = np.zeros(unnormalized_class_score.shape)
    dout[np.arange(dout.shape[0]),target_y] = 1
    dX, grads = model.backward(dout, cache)
    # We ASCEND on s_y - l2_reg * ||X||^2, so the penalty's gradient must be
    # SUBTRACTED; the original added it, which inflates pixel values instead
    # of regularizing them.
    dX -= 2 * l2_reg * X
    X += learning_rate * dX
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    
    # Undo the jitter
    X = np.roll(np.roll(X, -ox, -1), -oy, -2)
    
    # As a regularizer, clip the image
    X = np.clip(X, -data['mean_image'], 255.0 - data['mean_image'])
    
    # As a regularizer, periodically blur the image
    if t % blur_every == 0:
      X = blur_image(X)
    
    # Periodically show the image
    if t % show_every == 0:
      plt.imshow(deprocess_image(X, data['mean_image']))
      plt.gcf().set_size_inches(3, 3)
      plt.axis('off')
      plt.show()
  return X

Feature Inversion
shallow

def invert_features(target_feats, layer, model, **kwargs):
  """
  Perform feature inversion in the style of Mahendran and Vedaldi 2015, using
  L2 regularization and periodic blurring.
  
  Inputs:
  - target_feats: Image features of the target image, of shape (1, C, H, W);
    we will try to generate an image that matches these features
  - layer: The index of the layer from which the features were extracted
  - model: A PretrainedCNN that was used to extract features
  
  Keyword arguments:
  - learning_rate: The learning rate to use for gradient descent
  - num_iterations: The number of iterations to use for gradient descent
  - l2_reg: The strength of L2 regularization to use; this is lambda in the
    equation above.
  - blur_every: How often to blur the image as implicit regularization; set
    to 0 to disable blurring.
  - show_every: How often to show the generated image; set to 0 to disable
    showing intermediate results.
    
  Returns:
  - X: Generated image of shape (1, 3, 64, 64) that matches the target features.
  """
  learning_rate = kwargs.pop('learning_rate', 10000)
  num_iterations = kwargs.pop('num_iterations', 500)
  l2_reg = kwargs.pop('l2_reg', 1e-7)
  blur_every = kwargs.pop('blur_every', 1)
  show_every = kwargs.pop('show_every', 50)
  
  X = np.random.randn(1, 3, 64, 64)
  for t in xrange(num_iterations):
    ############################################################################
    # TODO: Compute the image gradient dX of the reconstruction loss with      #
    # respect to the image. You should include L2 regularization penalizing    #
    # large pixel values in the generated image using the l2_reg parameter;    #
    # then update the generated image using the learning_rate from above.      #
    ############################################################################
    out, cache = model.forward(X, end=layer)
    # For L = ||out - target_feats||^2, dL/dout = 2*(out - target_feats);
    # backpropping the NEGATED value makes dX point downhill, so the
    # `X += learning_rate * dX` below is gradient descent on L.
    dout = 2*(target_feats - out)
    dX, _ = model.backward(dout, cache)
    # Since dX is the negated loss gradient, the L2 penalty gradient must be
    # SUBTRACTED here; the original added it, which amplifies pixel values
    # rather than penalizing them.
    dX -= 2 * l2_reg * X
    X += learning_rate * dX
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    
    # As a regularizer, clip the image
    X = np.clip(X, -data['mean_image'], 255.0 - data['mean_image'])
    
    # As a regularizer, periodically blur the image
    if (blur_every > 0) and t % blur_every == 0:
      X = blur_image(X)

    if (show_every > 0) and (t % show_every == 0 or t + 1 == num_iterations):
      plt.imshow(deprocess_image(X, data['mean_image']))
      plt.gcf().set_size_inches(3, 3)
      plt.axis('off')
      plt.title('t = %d' % t)
      plt.show()
  # The docstring promises the generated image; the original never returned it.
  return X

deep

def invert_features(target_feats, layer, model, **kwargs):
  """
  Perform feature inversion in the style of Mahendran and Vedaldi 2015, using
  L2 regularization and periodic blurring.
  
  Inputs:
  - target_feats: Image features of the target image, of shape (1, C, H, W);
    we will try to generate an image that matches these features
  - layer: The index of the layer from which the features were extracted
  - model: A PretrainedCNN that was used to extract features
  
  Keyword arguments:
  - learning_rate: The learning rate to use for gradient descent
  - num_iterations: The number of iterations to use for gradient descent
  - l2_reg: The strength of L2 regularization to use; this is lambda in the
    equation above.
  - blur_every: How often to blur the image as implicit regularization; set
    to 0 to disable blurring.
  - show_every: How often to show the generated image; set to 0 to disable
    showing intermediate results.
    
  Returns:
  - X: Generated image of shape (1, 3, 64, 64) that matches the target features.
  """
  learning_rate = kwargs.pop('learning_rate', 10000)
  num_iterations = kwargs.pop('num_iterations', 500)
  l2_reg = kwargs.pop('l2_reg', 1e-7)
  blur_every = kwargs.pop('blur_every', 1)
  show_every = kwargs.pop('show_every', 50)
  
  X = np.random.randn(1, 3, 64, 64)
  for t in xrange(num_iterations):
    ############################################################################
    # TODO: Compute the image gradient dX of the reconstruction loss with      #
    # respect to the image. You should include L2 regularization penalizing    #
    # large pixel values in the generated image using the l2_reg parameter;    #
    # then update the generated image using the learning_rate from above.      #
    ############################################################################
    out, cache = model.forward(X, end=layer)
    # For L = ||out - target_feats||^2, dL/dout = 2*(out - target_feats);
    # backpropping the NEGATED value makes dX point downhill, so the
    # `X += learning_rate * dX` below is gradient descent on L.
    dout = 2*(target_feats - out)
    dX, _ = model.backward(dout, cache)
    # Since dX is the negated loss gradient, the L2 penalty gradient must be
    # SUBTRACTED here; the original added it, which amplifies pixel values
    # rather than penalizing them.
    dX -= 2 * l2_reg * X
    X += learning_rate * dX
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    
    # As a regularizer, clip the image
    X = np.clip(X, -data['mean_image'], 255.0 - data['mean_image'])
    
    # As a regularizer, periodically blur the image
    if (blur_every > 0) and t % blur_every == 0:
      X = blur_image(X)

    if (show_every > 0) and (t % show_every == 0 or t + 1 == num_iterations):
      plt.imshow(deprocess_image(X, data['mean_image']))
      plt.gcf().set_size_inches(3, 3)
      plt.axis('off')
      plt.title('t = %d' % t)
      plt.show()
  # The docstring promises the generated image; the original never returned it.
  return X

deep dream

def deepdream(X, layer, model, **kwargs):
  """
  Generate a DeepDream image by amplifying the activations of a chosen layer.
  
  Inputs:
  - X: Starting image, of shape (1, 3, H, W)
  - layer: Index of layer at which to dream
  - model: A PretrainedCNN object
  
  Keyword arguments:
  - learning_rate: How much to update the image at each iteration
  - max_jitter: Maximum number of pixels for jitter regularization
  - num_iterations: How many iterations to run for
  - show_every: How often to show the generated image
  """
  
  # Work on a copy so the caller's image is left untouched.
  X = X.copy()
  
  learning_rate = kwargs.pop('learning_rate', 5.0)
  max_jitter = kwargs.pop('max_jitter', 16)
  num_iterations = kwargs.pop('num_iterations', 100)
  show_every = kwargs.pop('show_every', 25)
  
  for t in xrange(num_iterations):
    # Jitter regularization: shift the image by a random offset before the
    # gradient step, then shift it back afterwards.
    ox, oy = np.random.randint(-max_jitter, max_jitter+1, 2)
    X = np.roll(np.roll(X, ox, -1), oy, -2)

    # DeepDream trick: feed the layer's own activations back as the upstream
    # gradient, which amplifies whatever the layer already responds to, then
    # take an ascent step on the image.
    activations, cache = model.forward(X, end=layer)
    dX, _ = model.backward(activations, cache)
    X += learning_rate * dX
    
    # Undo the jitter
    X = np.roll(np.roll(X, -ox, -1), -oy, -2)
    
    # Keep pixels inside the valid range around the per-channel mean pixel.
    mean_pixel = data['mean_image'].mean(axis=(1, 2), keepdims=True)
    X = np.clip(X, -mean_pixel, 255.0 - mean_pixel)
    
    # Periodically show the image
    if t == 0 or (t + 1) % show_every == 0:
      img = deprocess_image(X, data['mean_image'], mean='pixel')
      plt.imshow(img)
      plt.title('t = %d' % (t + 1))
      plt.gcf().set_size_inches(8, 8)
      plt.axis('off')
      plt.show()
  return X




xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

what's new today

Each neuron serves as a linear classifier; each layer tries to transform the input data so that it becomes separable for the next layer.
For 2-dimensional input, we need THREE neurons in the first layer to make the data separable for the next layer.


xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

Geometric interpretation of Ax and Eigen-decomposition of a diagonal matrix

Geometric interpretation of Ax:
Suppose A is a 2 \times 2 matrix, \lambda_1 and \lambda_2 are two eigenvalues and v_1 and v_2 are corresponding eigenvectors.
Since v_1 and v_2 are independent, it forms a base of 2 dimensional space, so x = a_1v_1 + a_2v_2.

Ax = A(a_1v_1 + a_2v_2)=a_1 \lambda_1 v_1 + a_2 \lambda_2 v_2=\lambda_1 a_1 v_1+\lambda_2 a_2 v_2


So we can conclude that x is scaled in two direction v_1 and v_2 with magnitude \lambda_1 and \lambda_2.
Cool.

Eigen-decomposition of a diagonal matrix:

D=IDI^{-1}

, so the eigenvectors are the columns of the identity matrix, i.e. the standard basis vectors e_1, e_2, ...
Cool.


xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

2017.1.9-2017.1.15

1.
Adaboost theory
《The Elements of Statistical Learning》, Ch10.1-10.4
@TODO
a) A summary on it(done)
b) adaboost for regression
c) adaboost for multiclass

2.
No free lunch theorem: there is no priori distinction between any ml algo, how well it performs is determined by how well the hypothesis fits the reality, namely P(h|d) fits P(f|d), where d is training data and h the hypothesis and f the relationship between input and output.

http://www.no-free-lunch.org/

3.
Decision Boundary belongs to the hypothesis, not the training data, because different hypothesis will give different decision boundary on the same training data.



xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

2016.1.2-2017.1.8

1.
@TOREAD

http://www.alfredo.motta.name/cross-validation-done-wrong/

2.
pip runs behind a proxy
. set http_proxy and https_proxy environment correctly enables it

PyPI stands for the Python Package Index, the package repository that pip installs from

3.
[[
1.
Tags:cross validation, imbalanced data
An excellent article about how to properly do cross-validation on imbalanced data.
how to properly do cross-validation on imbalanced data
@TODO reproduce the result
]]
while trying to reproduce the result I find what I thought about the 2 following things are wrong:
a) doing oversampling properly doesn't increase the evaluations result at all, it's doing it wrong produces a faked high result
b) the max value of the parameter 'max-depth' in random forest isn't the total feature number as the feature can be reused when constructing the tree

4.
reproduce Deep Learning Book -> Numerical Computation -> refinement of softmax function -> refinement of log softmax

5.
classifier tests

http://www.jmlr.org/papers/volume15/delgado14a/delgado14a.pdf

@TODO
A summary on it
@TODO python classifier comparison plot

http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html




xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

2016.12.26-2017.1.1

1.
Tags:cross validation, imbalanced data
An excellent article about how to properly do cross-validation on imbalanced data.
how to properly do cross-validation on imbalanced data
@TODO: reproduce the result

2.
Tags:life, health, food
Didn't prepare much on closes at Sunday -> Monday morning too busy to forget the food cart -> Buy creamy bread instead -> Cause diarrhea

3.
Tags:life, health
Sleep at night around 11pm makes stomach feel good the next day

4.
Tags:psychology
One way to define character is given a specific situation and see what behavior is invoked. From this point of view, it is reckless to simply label someone with extroverted as he or she may behave quite differently in some situation, say in a introverted way. I think situation-based thinking is not bad, anything that we may use as a mean later should be observed, learned and perceived under a specific situation.


xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

test

object Ch2 extends App {
  /** Exercise 2.1: return the n-th Fibonacci number (fib(0)=0, fib(1)=1).
    *
    * The original naive double recursion is O(2^n); this version uses a
    * tail-recursive accumulator pair so it runs in O(n) time and constant
    * stack space. The public signature and results are unchanged.
    */
  def e_2_1(n: Int): Int = {
    @annotation.tailrec
    def go(k: Int, prev: Int, cur: Int): Int =
      if (k == 0) prev
      else go(k - 1, cur, prev + cur)
    go(n, 0, 1)
  }

  println(e_2_1(0))
  println(e_2_1(1))
  println(e_2_1(2))
  println(e_2_1(3))
  println(e_2_1(4))
}




xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

kafka design(未完成)

翻译一下kafka官网文档下设计这一章节。括号内是我的补充。

原文链接

4.设计
4.1 动机
我们设计kafka是为了使其能成为一个可以处理大公司量级实时数据的统一平台。因此我们考虑了比较广泛的使用场景。

它需要有高吞吐来支持诸如实时日志聚合这样的大容量事件流。

它需要优雅的处理大量存量数据以支持离线系统发起的周期性的数据加载请求。

它也需要以低延迟来支撑传统消息处理场景。

我们想(让消费者)可以分区的、分布式的、实时处理这些数据,并由此生产新的衍生数据。由此才有分区的概念以及消费者模型。

最后,在这些流被其他数据系统服务使用时,我们希望它可以在机器出现异常的时候兼具容错性。

为了支持这些场景,我们以一些全新的元素设计了这个系统,相较于传统的消息系统,它更像一个日志数据库。我们将在下面的几章简要描述其中的一些设计思想。

4.2 持久化

不要畏惧文件系统!

Kafka在存储和缓存消息方面非常依赖文件系统。人们一般都觉得“磁盘慢”,所以对这样的持久化方式是否能给出一个具有竞争力的性能存疑。事实上,磁盘可以说是既快又慢,关键取决于你怎么使用。一个设计良好的磁盘结构可以和网络一样快。

过去十年中,磁盘性能的关键点在于它的吞吐量受磁盘寻道的延迟影响很大。这导致对于一块7200转 SATA RAID-5阵列的磁盘来说,线性写的速度可以达到600MB/s,但随机写只有100K/s,6000倍的差距。这些线性的读写是所有使用模式里最可预测的(可以覆盖大部分使用场景),也是被操作系统深度优化了的。现代操作系统可以做到预读和后写(read-ahead and write-behind),这种技术可以对大量磁盘块进行预读并可以将许多琐屑的逻辑写合并成一个更大的物理写。更多关于这个话题的讨论可以参看ACM Queue article,事实上他们发现顺序磁盘访问在某些情况下比内存访问还要快。

为了弥补这种性能上的差异,现代操作系统越来越多的使用主内存来缓存磁盘。操作系统可以愉快地将所有空闲内存用于磁盘缓存,这些内存不再空闲时(被使用)时性能代价也很小。磁盘所有的读和写都通过这个统一的缓存。这个特性在不使用direct I/O的情况下很难被关掉,即使进程里已经缓存了一份数据,这份数据在操作系统缓存页里也会有一份,事实上每个东西都存了两份。

进一步说,我们的系统构建在JVM上,任何稍微在Java内存上花过时间的人都知道以下两点

1. 对象的memory overhead很高,经常高达2倍于数据本身的大小

2. 随着heap上数据越来越多,GC变的越来越需要调教和缓慢

因此,使用文件系统并依赖于缓存页就比在维护一个内存中缓存要好——通过天生的对所有空闲内存的访问能力我们至少使可用内存翻了倍,通过使用更紧致的字节结构而不是每个对象(本身的结构)我们可能又(使可用内存的数量)翻了倍。这么做在32G的机器上我们有28-30G的可用内存,而且GC的副作用。而且这些缓存即便在服务重启后也仍然是温暖的(可用的),但假如是进程中的缓存的话则需要重新构建,10GB需要花费10分钟,或者以一片空的缓存开始,这样的话可能开始的(命中)效率会非常低。相较于进程中内存的处理方法,这大大简化了维护缓存与文件系统一致性代码的复杂程度。如果你的磁盘倾向于线性读,那么read-ahead实际上就是把每次磁盘读中的有用的数据提前读进缓存的过程。

这就提供了一个很简单的设计思路:与其维护一个尽可多的内存并在它不足时悲催地写入文件系统,我们把它反过来。所有数据无需写入磁盘而是立即写入文件系统中的持久化日志文件。实际上就是说数据直接被转移到了系统核管理的缓存页。

在这篇Varish设计(适度傲娇)中描述了这种集中式缓存页的设计。

常量时间让你满意

消息系统中的用于持久化的数据结构通常是给每一个消费者一个队列并关联一个BTree或者是其它普通的支持随机访问的数据结构用以存储消息元数据。BTree是可用的最风骚的数据结构,有了它,消息系统中事务、非事务的不同的语义都能够被支持。虽然代价不菲:BTree上的操作是O(logN).一般来说O(logN)被认为和常量等同(在内存里),但对于磁盘操作来说不是这样。一个磁盘寻道的操作一般耗时10ms,一个磁盘同一时间只能做一次寻道操作,因此并行程度很有限。所以即便一小撮磁盘寻道操作仍然会导致很高性能负担。因为存储系统是混合了很快的缓存操作和很慢的物理磁盘操作,所以在缓存一定的情况下,观测到的性能经常不是线性的,比如数据翻倍导致耗时远远大于2倍。

像写日志那样,直觉上持久化队列应该建立在简单读和文件追加上。这种结构的优势是所有操作都是O(1)的复杂度,并且读也不会阻塞写,相互之间也不会。这就带来明显的优势,因为性能完全和数据大小无关了,一台服务器现在可以充分利用一些价格低、转速慢,1TB左右的SATA盘了。虽然这些硬盘寻道性能不行,但在大量数据的读写方面表现还是令人接受的,同时只要三分之一的钱和3倍的容量(相对于内存)。

在没有任何性能惩罚的条件下拥有对几乎无尽的磁盘的访问能力意味着我们可以提供一般消息系统并不具备的特性。比如,在Kafka里,我们能够将消息在相对较长的一段时间内保存起来,而不是在它被消费者使用后就尝试删掉。这给消费者带来了不小的便利,正如下面我们描述的那样。

4.3 效率

我们在性能上花了大力气。我们主要的使用场景之一就是处理网页活动数据:每次页面浏览可能产生不少写操作。我们也假设每条消息至少被一个消费者读到,所以我们致力于让消费过程尽量简单。

从搭建和运行一些类似的系统中获取的经验中我们也发现,效率是多租户操作的关键。如果下游基础设施程序(这里指kafka)轻易就因为小问题而成为瓶颈,这些小问题就变成大问题了。所以通过将系统设计的高效,我们保证在(高)负载的情况下应用程序会比基础设施程序先奔溃。这对于一个需要同时支撑几百个使用模式几乎每天都变化的应用程序的中心化服务来说是特别重要的。

我们在前面几部分讨论了磁盘的效率,一旦低效的磁盘操作模式被消灭,那么在此类系统中导致低效的就只剩下两个原因了:过多的小I/O操作核过度的字节拷贝。

过多的小I/O操作不仅发生在客户端和服务器之间也发生在服务器自身持久化操作的时候。

为了避免上述问题,我们的协议构建在把消息聚合的“消息集合”这个抽象之上。这让网络可以一次请求一个消息组,这相比于一次只发送一条消息减少了网络上来回的开销。服务器因此也是一次把一整块消息追加到日志里,消费者也是一次获取一大块线性块。

这种简单的优化产生了数量级上的速度提升。批处理模式带来了更大的网络数据包,更大的顺序磁盘操作,相邻的内存快等等,所有这些都帮助kafka把井喷式的流数据写转化为流向消费者的线性写。

另一个低效来自于字节拷贝。如果消息发送的频率较慢这没什么问题,但在高负载下影响就很大了。为了避免这个问题出现,我们在生产者、中介者和消费者之间使用了标准化的消息格式,这样数据在它们之间无需修改就可以传输了。

中介者维护的消息日志自身只是一个包含文件的目录,每个文件由一个已经写入磁盘的消息集合序列填充,这些消息的格式与生产者和消费者使用的没有区别。维护这么一个共用的格式使得一个最重要的操作优化成为可能:网络传输持久化的日志块。现代Unix操作系统为数据从缓存页到套接字的传输提供了一个高度优化的代码路径,在Linux中它是通过sendfile系统调用来完成的。

要想了解sendfile的影响,了解数据从文件到套接字的一般传输路径就很重要了:

1. 操作系统从磁盘读取数据到内核空间的缓存页

2. 应用程序从内核空间将数据都到用户空间的缓存中

3. 应用程序将数据写回内核空间最终写道套接字的缓存中

4. 操作系统从套接字的缓存中将数据拷贝到NIC缓存中准备发送

这明显不高效,存在4次拷贝以及2次系统函数调用,使用sendfile, 这样的重复拷贝通过让操作系统直接将数据从缓存页拷贝到网络避免。所以在这个优化路径上,只有最后到NIC缓存的拷贝是必须的。

我们预期一个主题多个消费者是一个常见的使用场景。通过使用上述的零拷贝优化,数据只被精确地拷贝到缓存页1次,可以被重复使用,而不是存在内存中、每次被读时都再拷贝到用户空间。这样使得消息的消费速度几乎和它在网络的传输速度一样快。

缓存页与sendfile的组合使用意味着在Kafka集群里,在消费者的集中区,你也看不到磁盘上有任何活动迹象,因为数据直接从缓存走了。

更多关于Java中sendfile与零拷贝的背景信息,参考这里。

点对点的批压缩

某些情况下瓶颈实际上不是来自内存或磁盘而是由带宽导致的。这对于需要将数据通过广域网传输的数据处理流程来显得尤其正确。诚然用户总是可以在不借助Kafka的情况对数据进行压缩,但这会导致很低的压缩率,因为相同类型消息的重复的部分的被重复压缩了(比如JSON中的字段名或者网页日志中的用户代理或者公用的字符串)。高效的压缩要求对多条数据整体压缩而不是逐条压缩。

Kafka通过允许循环的消息集合来支持上述压缩。消息可以按批压缩并以这种形式传到服务器,再以这种形式写到磁盘上并保持这种状态,只有被消费者消费时才会解压。

Kafka支持GZip,Snappy和LZ4压缩协议。更多关于压缩的细节可以在这里被找到。

4.4  生产者

负载均衡

生产者在没有任何路由层介入的情况下直接将数据发送到当前分区的指挥者上。为了帮助生产者做到这点,Kafka节点可以随时响应针对诸如哪些服务器还在工作或是某个分区的指挥者是谁这样的元数据的请求。

客户端可以控制消息要发布到哪个分区上。可以随机,实现某种随机的负载均衡,或者给定某个带有语义的分区函数。我们暴露一个带有语义的分区接口给客户,这个接口允许用户指定用作哈西的分区键(如果需要也有重写分区函数的选择)例如如果用用户ID作为分区键,那么相同的用户就会被分到同一个分区上。这反过来也允许消费对它们消费行为作出位置假设。这样的分区设计就是为了显示支持位置敏感的消费者的。

异步发送

批处理是高效的一个主要动力源,为了能够批处理,Kafka的生产者会试图在内存中积累数据然后在一次请求中发送出更多的数据。批处理可以被设置为不超过某个指定的消息条数或是不超过某个指定的延时,比如64K或者10ms。这可以积累更多要发送的字节,以及在服务端更少的I/O操作。这种缓冲是可配置的并提供一种机制可以用稍微多一点的延迟去换取更高的吞吐。

关于生产者配置和接口的细节可以在这篇文档的其它地方找到。

4.5 消费者

Kafka消费者通过向中介者发起“获取”某个分区的请求来工作。消费者在请求中指定了(要获取的内容在)日志的偏移位置,接受从那个偏移位置开始之后的一块数据。因此,消费者通过这个偏移位置(对怎么消费数据)有很大的掌控度,在必要的时候可以倒回这个位置重新消费数据。

推 vs. 拉

我们考虑的一个初始问题是到底是让消费者从中介者那里拉数据呢还是中介者往消费者那里推送数据。这点上Kafka遵循了更为传统的设计思想,这样的思想被大多数消息系统所共享,即数据是被生产者推送至中介者又被消费者从中介者拉出来的。一些集中式的日志系统,如Scribe和Apache Flume,遵循了一种非常不同的基于推送的设计,这种设计里数据是被推送到下游的。这两种方法各有利弊。然而,基于推送的系统在处理不同类型的消费者时会有困难因为中介者控制数据的传输速率。对于消费者来说目标一般是能以最快的速度进行消费,不幸的是,在推送系统中,这意味着当消费速度低于生产速度时(本质上是一种DOS)消费者倾向于奔溃。一个基于拉的系统有一个很好的特性,消费者落后之后只要在有能力时赶上即可。这样的情况可以通过某种标明消费者已经崩溃的退避协议减轻,但要达到充分使用消费者那样的传输速率要比看起来复杂许多。之前用这种方式(基于推送)构建系统的尝试促使我们选择更传统的拉模型。

基于拉的系统的另一个优点是使其适合将要发送给消费者的数据批量化。基于推送的系统必须选择要么立即发送消息,要么积累更多的数据然后在对消费者是否有能力立即处理这些数据毫不知情的情况下过一会再发。如果将延时调低,这会导致一次只发一条消息也会以缓存结束,太浪费。基于拉的系统纠正了这个问题因为消费者总是会将所有偏移位置之后的可用的消息(或是在某个可配置的最大值之内的)一次拉出。因此人们在没有引入没必要的延迟的情况下就可以获得较优批量数据。

一个原始的基于拉的系统的一个缺点是如果中介者没数据,消费者可能会以一个密集循环的轮询告终,事实上就是在忙碌等待数据的到来。为了避免这种情况,在拉请求中我们加入了一些参数,允许消费者在漫长的轮询等待中阻塞直到有数据到达(也可以等待到一个给定数量的字节到达以确保一个大的传输大小)。

你可以想象其它端到端只包含拉的设计。生产者在本地写日志,中介者从中拉数据,消费者再从前者拉数据。一种类似的“先存再推”的生产者经常被提出。这挺迷人但我们觉得它并不适用于我们有几千个生产者的场景。我们运行大规模数据持久化系统的经历使我们觉得,在系统中存在许多应用程序、涉及几千块磁盘的情况下,这事实上并不会使事情变得更可靠,运营却变成了噩梦。实践中我们发现不需要生产侧的持久化,我们可以运行大规模有着可靠的服务水平保证的数据处理流程。

消费者位置

出人意料的,记录哪些已经被消费是消息系统中关键的性能点。

大多数消息系统保存了消息是否在中介者上已经被消费的元数据。也就是说,随着消息分发给了消费者,中介者要么立刻在本地记录这个事实要么等待消费者的收到确认。这是相当直觉的选择,对于单节点的服务器来说这样的状态(消息是否被消费)也没。因为许多消息系统中的数据层都不容易扩展,这也是实用的选择,既然中介者知道哪些被消费了,它就可以立即将其删除,维持一个较小的数据规模。

4.6 消息传递语义

既然我们已经知道生产者和消费者如何工作了,我们来讨论一下Kafka提供的在生产者与消费者之间的语义保证。显然,存在几种可能的保证:

最多一次——消息可能丢失但绝不会重发

至少一次——消息从不丢失但可能重发

只有一次——每条消息会被传递且只会传递一次

值得注意的是这分解为两个问题:发布消息的耐久性保证与消费一条消息的保证

许多系统声称提供“只有一次”的传递语义,但重要的是去读小号字体的印刷条款,这些声称中许多带有误导性(如它们不能被用在消费者和生产者失效的场景、多个消费者的场景,或者写到磁盘上的数据可能丢失的场景)。

Kafka的语义很直接。当发布消息时我们有一个消息被提交到日志的概念。一旦发布的消息被提交了,只要一个写入这条消息的分区所在的中介者还活着它就不会丢失。关于活着的定义以及我们能处理哪类故障的细节我们将在下节讨论。当前我们假设一个完美的不会丢失的中介者,尝试去理解生产者与消费者之间的保证。如果生产者尝试发布一条消息但遇到网络故障,我们无法判断错误是在消息被提交前还是提交后发生。这和往数据库表里插入一条带自动生成键的记录的语义类似。

对于发布者来说,这不是最强的语义。尽管我们不能确定在发生网络故障的时候会发生什么,但还是可能让生产者生成某种主键让其重新尝试的生成请求是幂等的。对于一个具备冗余的系统来说这个特性不是无意义的因为即便(尤其)在服务器挂掉的时候系统应该仍能工作。有了这个特性,只要生产者不断重试直到收到消息以被提交的确认,与此同时我们就保证了消息只被发送了一次。我们希望在Kafka未来的版本中加入它(还没加入你说个毛线)。

不是所有的场景都要求有这么强的保证。对于延时敏感的用户我们允许生产者指定它们需要的耐久性等级。如果生产者指定它想等着消息被提交,这会花费10ms数量级。然而生产者也可以指定它希望消息可以完全异步发送或者它希望等到统筹者(leader)得到消息。

现在让我们站在消费者的角度来描述语义。所有荣誉都有完全一样的日志与偏移量。消费者控制日志里的偏移量。如果消费者从没崩溃过它可以将这个位置存于内存,但如果消费者故障了而我们希望这个主题分区被另一个进程接管,这个进程需要选择一个适当的位置以便开始处理。我们假设消费者读了一些消息——关于处理消息以及更新位置它有几种选择。

1。它可以读消息,在日志里存储位置,最后处理消息。这种情况下存在这样的可能,消费者处理流程在位置被存储后崩溃了,但消息处理后的输出还没有保存。这种情况下,接管的进程从被保存的位置开始处理即使这个位置之前的一些消息还没被处理。这与“最多一次”语义一致,因为在消费者崩溃的情况下消息没有被处理。

2。它可以读消息,处理消息,最后存储位置。这种情况下存在这样的可能,消费者在处理完消息后崩溃了,但还没来的及存储位置。此时接管的进程接收到的前几条消息都已经被处理过了。这和消费者崩溃情况下的“至少一次”语义一致。多数情况下消息有一个主键所以更新是幂等的(接受一条相同的消息两次只会用自身的拷贝来覆写记录)

3。那么“只发一次”呢(就是你们事实上想要的)?这里的限制事实上不是一个消息系统的特性而是协同消费者位置与实际上作为输出的存储内容的必要性。做到这点的经典做法是在消费者位置存储与消费者输出存储之间引入一个双阶段的提交。但这个通过让消费者将位置与输出存在一个地方可以更容易地被搞定。这更好因为许多消费想要写入的输出系统不支持双阶段提交。举个例子,将数据吐到HDFS上的Hadoop ETL将它读的数据和位置都存在HDFS上以便确保要么两者一起被读要么没有一个被读。我们对其他许多数据系统都遵循了类似的模式,这些系统要求更强的语义,对于写入的消息也要求没有主键来支持去重复。

所以实际上Kafka默认保证“至少一次”传递,并允许用户通过在生产者侧禁用重试并在批处理消息前提交位置实现“最多一次”。“只有一次”传递需要与目的地存储系统协同但Kafka提供了让实现这个直接的偏移量。

4.7 冗余

Kafka为主题的每个分区都在一个可配数量的服务器上做了备份(你可以为每个主题配置这个冗余因子)。这允许在一台服务器出现故障时自动使用这些备份以便消息仍然是可用的。

其它消息系统提供了一些冗余相关的特性,但是在我们(完全偏见)看来,它像是临时的产物,没有被着重使用,伴随着很大的副作用:从机是不活跃的,吞吐量受到了严重的影响,它需要高超手工调试,等等。Kafka默认伴随冗余使用——事实上我门实现无冗余的方式是设置冗余因子为一。

备份的基本单元是主题分区。在“无故障”条件下,Kafka的每个分区有一个leader以及零或者多个followers。包含leader的备份总数就是冗余因子。所有的读写都会发生在leader的partition上。很多时候,partition的数量比broker多很多,leaders均匀分布在brokers上。在follows上的log和leader上的一模一样——所有都拥有相同offsets和顺序一致的messages(虽然任意给定时间点下在log末尾leader会有一些还没有备份的messages).

和普通的Kafka消费者一样,followers从leader那里消费messages并将它们应用到自己的log上。让followers从leader那里拉(messages)有一个好处,允许followers自然的把log输入收集起来以便处理。

和大多数分布式系统一样自动故障处理要求对nodes“alive”是什么意思有一个精确的定义。对Kafka来说node的活性有两个条件

1.node必须能与Zookeeper保持session(经由Zookeeper的heartbeat机制)

2.如果它是slave,它必须要replicate在leader发生的writing而且不能“落后”太多

我们称满足这两个条件的nodes为"同步的",避免了"alive"或者"failed"中的空洞。leader时刻记录着一个同步nodes的集合。如果follower挂了,阻塞或者落后了,leader会将它从同步列表中移除。阻塞和落后的判定由replica.time.max.ms参数控制。

在分布式系统的术语里,我们只处理节点突然停止工作并且一会之后又恢复工作(可能不知道它出现过故障)这种“故障/恢复”,Kafka不能处理所谓的“拜占庭”式的故障,节点产生任意或恶意的回复(可能由错误或是恶作剧导致)。

一个message被认为是“committed”,当这个partition的所有同步备份都已将它应用于自己(followers)的log里。只有committed的messages才会被分发给consumer。这意味着consumer无须担心发现一个因为leader故障而可能丢失的message。Producers,另一方面,可以根据它们在latency和durability之间的权衡偏好选择是否等待message被committed。这个偏好被producer使用的acks setting控制。

Kafka提供的保证是committed messages不会丢失,只要始终至少有一个sync replica alive。

在出现node故障时,在一小段fail-over时间之后,Kafka将保持可用,但在出现网络分割时可能不能保证可用。


xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

1米长绳,分3段,最短最长期望各多少问题

先看看2段

先看看分2段期望是多少。
首先是连续值的期望公式:

[math]E[x]=\int_a^bxf(x)dx [/math]

因为随机截取,所以截取点的[math]f(x)[/math]服从Uniform distribution。以短的那段为例,我们有:

[math]E[x_{min}]=\int_0^\frac12 x\frac1{\frac12-0}dx=\frac14[/math]

长的那段可以如法炮制。

 

3段

给个链接吧http://www.jlao.net/technology/1419/

花了一些时间看明白解法,一些关键点:
1. 三角形内的积分是[math]\int\int{y}f(x,y)dxdy[/math]
2. [math]E(L_{min})=\int_0^{1/n}x\,{\rm d}P(L_{min}\le x)[/math],其中 [math]{\rm d}P(L_{min}\le x)=F'(x){\rm d}x=f(x){\rm d}x[/math]
3.[math]\int_a^b x \rm df(x)=xf(x)|_a^b - \int_a^bf(x) \rm dx[/math],也就是分部积分

 

写个程序可以很快求出近似解

package tests.expectationofropelength;
 
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
 
/**
 * 一根长1米的绳子 随机分3段 求最长那一段和最短那一段的期望值
 */
/**
 * 一根长1米的绳子 随机分3段 求最长那一段和最短那一段的期望值
 * (Monte-Carlo estimate of the expected lengths of the shortest and longest
 * pieces when a rope of length N units is cut at x-1 random points.)
 *
 * Fixes over the pasted original: HTML-escaped {@code &lt;} restored to
 * {@code <}, and the raw {@code List} fields parameterized as
 * {@code List<Integer>} so {@code sum += lst.get(i)} compiles (unboxing).
 */
public class ExpectationOfRopeLength
{
	public static class Rope
	{
		// Rope length in integer units; cut points are drawn in [0, N).
		private int N;
		/**
		 * 如果绳子一端为原点,存的是x值 (cut positions from one end, plus N)
		 */
		private List<Integer> pieces;
		/**
		 * 存的是绳子段的长度 (lengths of the pieces, sorted ascending)
		 */
		private List<Integer> pieces2;
		/**
		 * 保存每次随机出的长的那段 (longest piece of each trial)
		 */
		private List<Integer> longs;
		/**
		 * 保存每次随机出的短的那段 (shortest piece of each trial)
		 */
		private List<Integer> shorts;

		// Fixed seed keeps every run reproducible.
		private Random r;

		public Rope(int n)
		{
			N = n;
			pieces = new ArrayList<Integer>();
			pieces2 = new ArrayList<Integer>();
			longs = new ArrayList<Integer>();
			shorts = new ArrayList<Integer>();
			r = new Random(2L);
		}

		/**
		 * Draw x-1 random cut positions in [0, N) and append the endpoint N,
		 * yielding x pieces once sorted. Clears previous trial state.
		 */
		public void xPartsRandom(int x)
		{
			pieces.clear();
			pieces2.clear();
			for (int i = 1; i < x; i++)
			{
				pieces.add(Math.abs(r.nextInt() % N));
			}
			pieces.add(N);
		}

		/**
		 * Convert cut positions to piece lengths and record the current
		 * trial's longest and shortest piece.
		 */
		public void longsAndShorts()
		{
			Collections.sort(pieces);
			pieces2.add(pieces.get(0));
			for (int i = 1; i < pieces.size(); i++)
			{
				pieces2.add((pieces.get(i) - pieces.get(i - 1)));
			}
			Collections.sort(pieces2);
			longs.add(pieces2.get(pieces2.size() - 1));
			shorts.add(pieces2.get(0));
		}

		public String toString()
		{
			return
			/* pieces.toString() + "\n" + pieces2.toString() + "\n" + */shorts
					.toString() + "\n" + longs.toString();
		}

		// Sum of all recorded lengths; long accumulator avoids int overflow
		// over many trials.
		private long sum(List<Integer> lst)
		{
			long sum = 0;
			for (int i = 0; i < lst.size(); i++)
			{
				sum += lst.get(i);
			}
			return sum;
		}

		/** Empirical mean of the longest piece over all trials so far. */
		public double getAverageLong()
		{
			return sum(longs) / (double) longs.size();
		}

		/** Empirical mean of the shortest piece over all trials so far. */
		public double getAverageShort()
		{
			return sum(shorts) / (double) shorts.size();
		}
	}

	public static void main(String[] args)
	{
		int M = 1000;
		int x = 3;
		Rope rope = new Rope(1000);
		for (int i = 0; i < M; i++)
		{
			rope.xPartsRandom(x);
			rope.longsAndShorts();
		}
		System.out.println(rope);
		System.out.println("short avg:" + rope.getAverageShort());
		System.out.println("long avg:" + rope.getAverageLong());
	}
}




xiaolizi
转载须以超链接形式标明文章原始出处和作者信息
0

【数学】证明素数无穷多的两个方法

方法一
反证法。假设存在最大素数p,那么令q=2*3*5*...*p+1,则q不能被任何<=p的素数整除。于是q要么本身是素数,要么含有一个大于p的素因子,两种情况都与“p是最大素数”矛盾,所以假设不成立。

方法二
易证明相邻两个数n,n+1互质。则n(n+1)至少含有2个不同的质因子。同理n(n+1),n(n+1)+1也互质,n(n+1)*(n(n+1)+1)至少有3个不同质因子。以此类推,这样的相邻的数可以一直写下去,其不同的质因子的也在随之增多,没有上限。


xiaolizi
转载须以超链接形式标明文章原始出处和作者信息