From the paper Efficient Monte Carlo Counterfactual Regret Minimization in Games with Many Player Actions.
http://poker.cs.ualberta.ca/publications/NIPS12.pdf

In the same format as Amax's posts:
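The sampling probabilities below follow the paper's average strategy sampling scheme: at the traversing player's nodes, each action a is explored independently with probability min(1, rho(I,a)), where rho(I,a) = max(epsilon, (beta + tau * sbar(I,a)) / (beta + sum_b sbar(I,b))). The constants e, t and b in the code correspond to epsilon, tau and beta.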
Code:
public override double TrainAverageStrategySampling(int trainplayer, Iteration iteration, double q)
{
    int hole = iteration.GetBucket(street, player);

    // Note: terminal nodes (showdown and fold) must also divide the utility by q.
    const double e = 0.5;      // exploration floor (epsilon)
    const double t = 1000;     // tau
    const double b = 100000;   // beta

    var s = GetStrategy(hole); // current strategy from regret matching

    if (player == trainplayer)
    {
        var u = new double[children.Length];
        double ev = 0;

        double cs_sum = 0;
        for (int i = 0; i < children.Length; i++)
            cs_sum += cumulativeStrategy[hole, i];

        for (int i = 0; i < children.Length; i++)
        {
            // Explore action i with probability min(1, ap); unsampled actions keep u[i] = 0.
            double ap = Math.Max(e, (b + t * cumulativeStrategy[hole, i]) / (b + cs_sum));
            if (rnd.Value.NextDouble() < ap)
            {
                u[i] = children[i].TrainAverageStrategySampling(trainplayer, iteration, q * Math.Min(1, ap));
                ev += u[i] * s[i];
            }
        }

        // Regret update for the traversing player.
        for (int i = 0; i < children.Length; i++)
            regret[hole, i] += u[i] - ev;

        return ev;
    }
    else
    {
        // Opponent node: accumulate the average strategy, weighted by 1/q, then sample one action.
        for (int i = 0; i < children.Length; i++)
            cumulativeStrategy[hole, i] += s[i] / q;

        int a = SampleStrategy(s);
        return children[a].TrainAverageStrategySampling(trainplayer, iteration, q);
    }
}
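For context, a minimal driver-loop sketch. The root node, the Iteration constructor, numPlayers and iterationCount are all assumptions (they are not shown in this post); each iteration deals/buckets the cards once and runs one traversal per player, starting with q = 1.
Code:
// Minimal training-loop sketch; root, Iteration(), numPlayers and iterationCount are hypothetical.
for (long it = 0; it < iterationCount; it++)
{
    var iteration = new Iteration();   // assumption: deals cards / fills the buckets for this iteration
    for (int p = 0; p < numPlayers; p++)
        root.TrainAverageStrategySampling(p, iteration, 1.0);  // q starts at 1
}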
Or, Average Strategy Sampling combined with probing, where an unsampled action is evaluated by a single on-policy probe to a terminal node instead of being assigned zero utility.
http://poker.cs.ualberta.ca/publications/AAAI12-generalmccfr.pdf

Code:
public override double TrainAverageStrategyProbing(int trainplayer, Iteration iteration, double q, bool probe)
{
    int hole = iteration.GetBucket(street, player);
    const double e = 0.5;      // exploration floor (epsilon)
    var s = GetStrategy(hole);

    if (probe)
    {
        // Probe: roll out to a terminal node following the current strategy; no updates are made.
        int a = SampleStrategy(s);
        return children[a].TrainAverageStrategyProbing(trainplayer, iteration, q, true);
    }
    else if (player == trainplayer)
    {
        var u = new double[children.Length];
        double ev = 0;

        double cs_sum = 0;
        for (int i = 0; i < children.Length; i++)
            cs_sum += cumulativeStrategy[hole, i];

        for (int i = 0; i < children.Length; i++)
        {
            // Epsilon-floored average-strategy probability; sample every action on the first visit.
            double ap = cs_sum <= 0 ? 1 : Math.Max(e, cumulativeStrategy[hole, i] / cs_sum);
            if (rnd.Value.NextDouble() <= ap)
            {
                // Sampled: recurse with the weight corrected by the sampling probability.
                u[i] = children[i].TrainAverageStrategyProbing(trainplayer, iteration, q / ap, probe);
            }
            else
            {
                // Not sampled: estimate the action's value with a single probe instead of zero.
                u[i] = children[i].TrainAverageStrategyProbing(trainplayer, iteration, q, true);
            }
            ev += u[i] * s[i];
        }

        for (int i = 0; i < children.Length; i++)
            regret[hole, i] += (u[i] - ev) * q;

        return ev;
    }
    else
    {
        // Opponent node: accumulate the average strategy, then sample one action.
        for (int i = 0; i < children.Length; i++)
            cumulativeStrategy[hole, i] += s[i] * q;

        int a = SampleStrategy(s);
        return children[a].TrainAverageStrategyProbing(trainplayer, iteration, q, probe);
    }
}
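Both methods rely on a SampleStrategy helper that isn't shown above. A straightforward sketch, assuming s sums to 1 and rnd is the same thread-local Random used in the code:
Code:
int SampleStrategy(double[] s)
{
    // Draw one action index from the distribution s by walking the cumulative sum.
    double r = rnd.Value.NextDouble();
    double acc = 0;
    for (int i = 0; i < s.Length; i++)
    {
        acc += s[i];
        if (r < acc)
            return i;
    }
    return s.Length - 1; // guard against floating-point rounding
}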