import {choleskyDecomposition} from "@sw1227/cholesky-decomposition"
import {boxMuller} from "@sw1227/box-muller-transform"
math = require("mathjs")
// Mean of an array
mean = array => (array.reduce((a, b) => a + b) / array.length);
// Round to specified digits
g = (x, digits = 3) => x.toFixed(digits);
// Sum positive values in an array
sumPositives = (arr = []) => {
const isPositive = num => typeof num === 'number' && num > 0;
const res = arr.reduce((acc, val) => {
if(isPositive(val)){
acc += 1;
};
return acc;
}, 0);
return res;
};
// Linear regression
linearRegression = (y,x) => {
var lr = {};
var n = y.length;
var sum_x = 0;
var sum_y = 0;
var sum_xy = 0;
var sum_xx = 0;
var sum_yy = 0;
for (var i = 0; i < y.length; i++) {
sum_x += x[i];
sum_y += y[i];
sum_xy += (x[i]*y[i]);
sum_xx += (x[i]*x[i]);
sum_yy += (y[i]*y[i]);
}
lr['slope'] = (n * sum_xy - sum_x * sum_y) / (n*sum_xx - sum_x * sum_x);
lr['intercept'] = (sum_y - lr.slope * sum_x) / n;
lr['r2'] = Math.pow((n*sum_xy - sum_x*sum_y) / Math.sqrt((n*sum_xx-sum_x*sum_x)*(n*sum_yy-sum_y*sum_y)),2);
return lr;
}
// Multivariate normal distribution with the given mean vector and
// covariance matrix (plain nested array), providing pdf, differential
// entropy, and a Cholesky-based sampler.
multivariateNormal = (mean, covArray) => {
  const n = mean.length;
  const cov = math.matrix(covArray);
  // Cholesky factor (chol * chol^T = cov), computed once up front.
  // BUG FIX: the previous version recomputed the decomposition for
  // every sample and then multiplied z by `cov` instead of the factor,
  // which produces samples with covariance cov^2 rather than cov.
  const chol = choleskyDecomposition(cov);
  return {
    // Probability Density Function evaluated at point x
    pdf: x => {
      const c = 1 / (math.sqrt(2*math.PI)**n * math.sqrt(math.det(cov)));
      return c * math.exp(
        -(1/2) * math.multiply(
          math.subtract(math.matrix(x), math.matrix(mean)),
          math.inv(cov),
          math.subtract(math.matrix(x), math.matrix(mean))
        )
      );
    },
    // Differential entropy
    entropy: 0.5*math.log(math.det(cov)) + 0.5*n*(1 + math.log(2*math.PI)),
    // Draw n_samples vectors via x = mean + chol * z, z ~ N(0, I)
    sample: n_samples => Array(n_samples).fill().map(() => {
      const z = boxMuller(n);
      return math.add(
        math.matrix(mean),
        math.multiply(chol, math.matrix(z))
      ).toArray();
    }),
  };
}
pcorr = (x, y) => {
let sumX = 0,
sumY = 0,
sumXY = 0,
sumX2 = 0,
sumY2 = 0;
const minLength = x.length = y.length = Math.min(x.length, y.length),
reduce = (xi, idx) => {
const yi = y[idx];
sumX += xi;
sumY += yi;
sumXY += xi * yi;
sumX2 += xi * xi;
sumY2 += yi * yi;
}
x.forEach(reduce);
return (minLength * sumXY - sumX * sumY) / Math.sqrt((minLength * sumX2 - sumX * sumX) * (minLength * sumY2 - sumY * sumY));
};
// Monte Carlo simulation linking the IC, decile spreads, hit rate, and
// linear-regression output for a simulated factor.
//   trials   - number of simulation runs
//   assets   - length of each simulated returns/exposures series
//   rho      - correlation between returns and exposures (the "true" IC)
//   mu1, mu2 - means of the returns and exposures series
//   s1, s2   - std deviations of the returns and exposures series
// Returns an object of averaged, formatted summary statistics.
icsim = (trials, assets, rho, mu1, mu2, s1, s2) => {
  const buckets = 10; // number of rank buckets (deciles)
  const Sigma = math.matrix([[s1 * s1, s1 * s2 * rho], [s1 * s2 * rho, s2 * s2]]);
  // Per-trial result arrays, one entry per run. BUG FIX: these were
  // previously sized Array(assets) while being indexed by i < trials,
  // which skewed every average whenever trials !== assets.
  const spread = Array(trials).fill(0);
  const longSpread = Array(trials).fill(0);
  const shortSpread = Array(trials).fill(0);
  const icPearson = Array(trials).fill(0);
  const rsq = Array(trials).fill(0);
  const r = Array(trials).fill(0);
  const coef = Array(trials).fill(0);
  const coefTest = Array(trials).fill(0);
  for (let i = 0; i < trials; i++) {
    const norm = multivariateNormal([mu1, mu2], Sigma);
    const sims = norm.sample(assets);
    // Convert array rows to objects with keys 'returns' and 'exposures'
    const simsData = sims.map(row => ({ returns: row[0], exposures: row[1] }));
    // Rank exposures into deciles (1 = lowest exposure, 10 = highest)
    const rankedSims = _.orderBy(simsData, ['exposures'], ['asc']);
    rankedSims.forEach((item, index) => item.rank = Math.ceil((index + 1) / assets * buckets));
    // Average return per rank bucket
    const decile = _.chain(rankedSims)
      .groupBy('rank')
      .map((value, key) => ({
        rank: parseInt(key, 10),
        avgReturn: _.meanBy(value, 'returns')
      }))
      .orderBy('rank', 'asc')
      .value();
    const univ = _.meanBy(decile, 'avgReturn');
    const decileMap = _.keyBy(decile, 'rank');
    spread[i] = decileMap[buckets].avgReturn - decileMap[1].avgReturn;
    longSpread[i] = decileMap[buckets].avgReturn - univ;
    shortSpread[i] = univ - decileMap[1].avgReturn;
    icPearson[i] = pcorr(rankedSims.map(item => item.returns), rankedSims.map(item => item.exposures));
    // Linear regression of returns on exposures
    const tc = toColumns(rankedSims);
    const lr = linearRegression(tc.returns, tc.exposures);
    rsq[i] = lr.r2;
    r[i] = Math.sqrt(lr.r2);
    coef[i] = lr.slope;
    // Scaled IC: (sigma_returns / sigma_exposures) * IC, an estimate
    // of the regression coefficient.
    coefTest[i] = math.std(rankedSims.map(item => item.returns)) / math.std(rankedSims.map(item => item.exposures)) * icPearson[i];
  }
  const out = {};
  out['Trials'] = trials;
  out['Assets'] = assets;
  out['Ret. Vol.'] = s1;
  out['Exp. Vol'] = s2;
  out['Correlation'] = rho.toFixed(2);
  out['Spread'] = g(mean(spread)*10000, 1);
  // Optional outputs, currently disabled. (The previous HTML-style
  // `<!-- -->` comments are a syntax error inside an ES module.)
  // out['Long_Spread'] = g(mean(longSpread)*10000);
  // out['Short_Spread'] = g(mean(shortSpread)*10000);
  out['Pct_Positive'] = g((sumPositives(spread) / trials), 2);
  out['IC'] = g(mean(icPearson));
  out['R_sq'] = g(mean(rsq));
  out['R'] = g(mean(r));
  out['Coef'] = g(mean(coef));
  out['Scaled_IC'] = g(mean(coefTest));
  return out;
}
toColumns = rawdata => {
// Initialize columns
const columns = {};
// Get keys from first row (assumes all rows have the same keys)
const keys = Object.keys(rawdata[0]);
// Initialize empty arrays for each key
keys.forEach(key => {
columns[key] = [];
});
// Populate columns
rawdata.forEach(row => {
keys.forEach(key => {
columns[key].push(row[key]);
});
});
return columns;
}
Information Coefficients & Linear Regression
Motivation
In this paper we use Monte Carlo simulation to show the relationship between the Information Coefficient (IC), correlation, decile returns, and linear regression.1 We can also gain insights into investment related questions, such as
- What level of IC is considered good?
- What effect does volatility have on the spread?
- What effect does universe size have on the hit rate?
We will start off by defining some terms to make sure that we are all on the same page.
- The Spread is the difference between the average return of the top decile and the average return of the bottom decile.
- The IC is the correlation between two series, here the return series and the exposure series. The Pearson correlation is used in this JavaScript implementation, but most times Spearman’s Rank correlation is used because it is less affected by outliers.
- A linear model is the regression of one series versus another, resulting in an intercept and a coefficient that describe the relationship between the two variables.
- R-squared measures the goodness of fit of the linear model. It is also referred to as the coefficient of determination.
Simulation Process
We proceed by assuming asset returns are normally distributed and we generate two random series from a bivariate normal distribution with given mean and correlation. We will call the first random series the returns and call the second random series the exposures. The exposures represent the factor, which could be momentum, book to price, or any other factor.
As defined above, the correlation between the two random series is the IC. Here is an outline of how each simulation is performed:
- Divide into deciles based on exposures
- Calculate spread between top decile and bottom decile
- Calculate the IC
- Run a linear regression
Base case
To make this presentation interactive we use a JavaScript implementation of a multivariate random number generator to produce two series for each simulation, one we will call the returns and the other the exposures. Both series have zero mean. The only thing they have in common is a correlation, which we vary from zero to twenty percent. Before going further into the details, let’s have a look at some simulations so we can describe how everything relates.
You can see from the table above that we run a number of simulations, each with a given number of assets. By assets we mean how long each random series is and by simulations we mean how many random samples we draw. The values shown in the table are averages of all the simulations. The first column shows the correlation, which represents the information. The next two columns show the volatility of the returns series and the exposures series, which we have set to 8% for now, but will vary later.
The Spread shows one measure of the performance of the factor. The exposures series is our factor and we create deciles based on that series. Once we have the deciles, we average the returns for each decile. The spread is the difference between the top decile average return and the bottom decile average return. The percent positive column shows what proportion of the spreads are positive. Each simulation produces one spread, so we end up with as many spreads as simulations. The average of these spreads is the spread column and the proportion positive is the percent positive column. The IC is the average correlation between the exposures and returns series and is another standard measure of the performance of a factor.
Measuring the correlation between two factors is a quick and easy way to see how closely they are related, and how powerful the exposures may be in predicting returns. Another way to do so is to run a regression of the exposures on the returns. The regression function provides three main outputs, the y-intercept, the coefficient, and the R-squared measure. The R-squared is a measure of the goodness of fit of the regression equation. The square root of the R-squared statistic returns the IC (to a close approximation). In this simple one-variable linear regression framework the coefficient is equivalent to the correlation that was introduced to the two random series. Lastly, the coefficient can be approximated by the scaled IC as defined in the formula below:
\[ Scaled.IC = \frac{\sigma_{returns}}{\sigma_{exposures}} * IC \]
In this table both the returns and exposures have the same standard deviation, so the volatility ratio is one, so it doesn’t seem very informative, but in the next table we will run simulations with a higher returns volatility and you will see the formula holds.
Remember that these are random series with zero mean, so the only information content is the correlation. By running enough simulations we can reliably approximate the true correlation by both the IC and the R (the square root of the regression R-squared statistic). We can also approximate the coefficient by calculating the scaled IC.
Correlation, IC, and hit rate
As you can see from the base case simulation run, the correlation and the IC are closely linked. In these simulations the correlation is the given relationship between returns and exposures and the IC is a measurement of how well our signal (or rank in this case) works.2
The percent positive (Pct.Positive) column shows the proportion of simulation runs (out of all trials) for which the spread was positive. This number is often called the hit rate. If the factor has a zero correlation we would expect a hit rate of 0.5 (or 50%). A correlation of one percent bumps the hit rate up to almost 60% and two percent gets us to 68%. Usually ICs in the range of 5% to 10% are considered very good. The hit rate in that case would be between 89% and 99%, which is very good indeed. Note that this is on a large universe of 1000 assets. We will see later that the hit rate declines rapidly as the number of assets falls.
Higher Return Volatility
In the next set of simulations we increase the return volatility from 8% to 16%.
Doubling the returns volatility basically doubles the spread. The IC and the regression R (the square root of R-squared) still match the actual correlation between the two random series, but the coefficient is twice as large as in the previous table. This is because the returns volatility is now twice as high as the exposures volatility. The regression coefficient is accurately estimated by the scaled IC measure.
Lower Volatility
In the next table we simulate both series with only 2% volatility (for both returns and exposures). For a given correlation, the percent positive, IC, and coefficients are all comparable to the base case, but the spread is much lower. The returns volatility is what creates the opportunity to profit and if the correlation is high enough the investor can capitalize on the hit rate (percent positive).
All the usual relationships hold. The only difference is that the spread is much lower due to the lower volatility of the returns series.
Higher Exposure Volatility
When the exposures volatility is twice as high as the returns volatility the spread remains comparable to the base case, but the scaled IC drops in half, as does the regression coefficient. Clearly, we want the volatility to be on the returns and not on the exposures.
Non-zero Mean
While generally a higher return is better, in this case we are measuring the spread, which is the difference between the average return in the top decile minus the average return in the bottom decile. So a higher return just increases the average return, but does not (necessarily) benefit the top decile more than the bottom decile, leaving the spread pretty much the same. As you can see, all the other metrics are comparable as well. The correlation and volatility are the two driving forces of factor performance.
Small Number of Assets
As mentioned earlier, when we lower the number of assets, the hit rate (Pct.Positive) falls substantially. The rest of the metrics are similar to the base case. Quantitative investing is a numbers game, where it pays to have as much breadth as possible. The hit rates decline with a smaller number of assets. Although the hit rates are still okay, you have to make numerous “bets” to get those advertised numbers. If you are only investing in a small number of assets, you could get a resulting hit rate that is much worse (or better) than the advertised hit rate.
Conclusions
This interactive presentation explained the relationship between the Information Coefficient (IC) and linear regression model output. These simulations should help you understand the importance of correlation in factor investing and what level of IC and/or correlation will yield acceptable results. We have shown that the larger the universe the better the expected hit rate (for a given correlation). We have also shown that returns volatility is good and the exposures volatility is (relatively) bad. Finally, average returns are not as important as the spread between the top and bottom deciles.
Footnotes
This paper was motivated by a short paper and slides by Oliver Buckley at Invesco which can be found here.↩︎
Since this is a simple one-factor (exposure) model, the IC matches the correlation very closely (we are able to capture the full signal). In a multi-factor context the IC will usually be lower than the correlation because the factors will not be perfectly uncorrelated.↩︎