Hi guys,
In opencfr, there is a portion of the code that I don't understand: when the algorithm reaches a terminal node and the EV has been computed, how is that EV propagated back to the previous node?
Any ideas?
It seems the EVs are propagated back through the ev function parameter, but is that actually possible? (There is a short sketch of what I think is happening after the code below.)
Here is the update_regret function:
Code:
static void update_regret(leduc::sequence u, int buckets[2][2], int hole[2], int board,
                          int result, double reach[2], double chance, double ev[2], double cfr[2],
                          regret_strategy strat[2]) {

  if (u.is_terminal()) {
    /* sequence is terminal: compute the counterfactual EV of both players
       and write it into ev[], which the caller reads after this call returns */
    int amount = u.win_amount();
    if (u.is_fold()) {
      if (u.who_folded() == 0) {
        ev[0] = -amount*reach[1]*chance;
        ev[1] = amount*reach[0]*chance;
      } else {
        ev[0] = amount*reach[1]*chance;
        ev[1] = -amount*reach[0]*chance;
      }
    } else {
      /* sequence is a showdown */
      ev[0] = result*reach[1]*amount*chance;
      ev[1] = -result*reach[0]*amount*chance;
    }
  } else if (reach[0] < EPSILON && reach[1] < EPSILON) {
    /* cutoff: neither player reaches this node, do nothing */
    ev[0] = ev[1] = 0;
  } else {
    /* some convenience variables */
    int player = u.whose_turn();
    int opponent = leduc::opposite_player(player);
    int round = u.get_round();

    /* player is using the regret-minimizing strategy:
       get the accumulators for the average strategy and for the regrets */
    double * average_probability = strat[player].get_average_probability(u, buckets[player][round]);
    double * regret = strat[player].get_regret(u, buckets[player][round]);

    /* get the current (regret-matching) probability tuple for the acting player */
    double probability[3];
    strat[player].get_probability(u, buckets[player][round], probability);

    /* first accumulate the player's strategy into the average strategy, weighted by reach */
    for (int i = 0; i < 3; ++i) {
      average_probability[i] += reach[player]*probability[i];
    }

    /* now compute the regret on each of our actions */
    double expected = 0, sum = 0;
    double old_reach = reach[player];
    double delta_regret[3];
    for (int i = 0; i < 3; ++i) {
      if (u.can_do_action(i)) {
        reach[player] = old_reach*probability[i];
        /* recurse: the child call writes its EVs into ev[], so after it returns
           ev[player] holds the EV of taking action i */
        update_regret(u.do_action(i), buckets, hole, board, result, reach, chance, ev, cfr, strat);
        delta_regret[i] = ev[player];
        /* accumulate the EV of the current strategy */
        expected += ev[player]*probability[i];
        sum += ev[opponent];
      }
    }

    /* restore reachability value */
    reach[player] = old_reach;

    /* subtract off the expectation to get the regret for each action */
    for (int i = 0; i < 3; ++i) {
      if (u.can_do_action(i)) {
        delta_regret[i] -= expected;
        /* regret for each action */
        regret[i] += delta_regret[i];
        cfr[player] += max(0., delta_regret[i]);
      }
    }

    /* set the return value: overwrite ev[] with this node's EVs
       so the parent call can read them */
    ev[player] = expected;
    ev[opponent] = sum;
  }
}
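
If I understand it correctly, the mechanism is just C/C++ array-parameter semantics: `double ev[2]` decays to a pointer, so the recursive call to update_regret writes directly into the caller's ev array. When the call returns, ev[player] already holds the child's EV (that is the `delta_regret[i] = ev[player]` line), and the parent then overwrites ev[] with its own values (`ev[player] = expected; ev[opponent] = sum;`) before returning to its own caller. Here is a minimal, self-contained toy sketch of that pattern (the functions child/parent are only for illustration, they are not from opencfr):
Code:
#include <cstdio>

/* toy recursion: the callee writes its result into ev[],
   and the caller reads it after the call returns */
static void child(double ev[2]) {
  /* "terminal node": fill in the EVs */
  ev[0] = 1.5;
  ev[1] = -1.5;
}

static void parent(double ev[2]) {
  child(ev);  /* ev decays to a pointer, so the child's writes are visible here */
  std::printf("parent sees ev[0]=%f ev[1]=%f\n", ev[0], ev[1]);
  /* the parent then overwrites ev[] with its own values before returning,
     just like ev[player] = expected; ev[opponent] = sum; in update_regret */
  ev[0] = 0.5*ev[0];
  ev[1] = 0.5*ev[1];
}

int main() {
  double ev[2] = {0, 0};
  parent(ev);
  std::printf("main sees   ev[0]=%f ev[1]=%f\n", ev[0], ev[1]);
  return 0;
}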