Source code for icarus.tools.stats
"""Provides statistical utilities functions used by the simulator
"""
from __future__ import division
import math
import random
import collections
import numpy as np
import scipy.stats as ss
__all__ = [
'DiscreteDist',
'TruncatedZipfDist',
'means_confidence_interval',
'proportions_confidence_interval',
'cdf',
'pdf',
]
[docs]class DiscreteDist(object):
"""Implements a discrete distribution with finite population.
The support must be a finite discrete set of contiguous integers
{1, ..., N}. This definition of discrete distribution.
"""
def __init__(self, pdf, seed=None):
"""
Constructor
Parameters
----------
pdf : array-like
The probability density function
seed : any hashable type (optional)
The seed to be used for random number generation
"""
if np.abs(sum(pdf) - 1.0) > 0.001:
raise ValueError('The sum of pdf values must be equal to 1')
random.seed(seed)
self._pdf = np.asarray(pdf)
self._cdf = np.cumsum(self._pdf)
# set last element of the CDF to 1.0 to avoid rounding errors
self._cdf[-1] = 1.0
def __len__(self):
"""Return the cardinality of the support
Returns
-------
len : int
The cardinality of the support
"""
return len(self._pdf)
@property
def pdf(self):
"""
Return the Probability Density Function (PDF)
Returns
-------
pdf : Numpy array
Array representing the probability density function of the
distribution
"""
return self._pdf
@property
def cdf(self):
"""
Return the Cumulative Density Function (CDF)
Returns
-------
cdf : Numpy array
Array representing cdf
"""
return self._cdf
[docs] def rv(self):
"""Get rand value from the distribution
"""
rv = random.random()
# This operation performs binary search over the CDF to return the
# random value. Worst case time complexity is O(log2(n))
return int(np.searchsorted(self._cdf, rv) + 1)
[docs]class TruncatedZipfDist(DiscreteDist):
"""Implements a truncated Zipf distribution, i.e. a Zipf distribution with
a finite population, which can hence take values of alpha > 0.
"""
def __init__(self, alpha=1.0, n=1000, seed=None):
"""Constructor
Parameters
----------
alpha : float
The value of the alpha parameter (it must be positive)
n : int
The size of population
seed : any hashable type, optional
The seed to be used for random number generation
"""
# Validate parameters
if alpha <= 0:
raise ValueError('alpha must be positive')
if n < 0:
raise ValueError('n must be positive')
# This is the PDF i. e. the array that contains the probability that
# content i + 1 is picked
pdf = np.arange(1.0, n + 1.0) ** -alpha
pdf /= np.sum(pdf)
self._alpha = alpha
super(TruncatedZipfDist, self).__init__(pdf, seed)
@property
def alpha(self):
return self._alpha
[docs]def means_confidence_interval(data, confidence=0.95):
"""Computes the confidence interval for a given set of means.
Parameters
----------
data : array-like
The set of samples whose confidence interval is calculated
confidence : float, optional
The confidence level. It must be a value in the interval (0, 1)
Returns
-------
mean : float
The mean of the sample
err : float
The standard error of the sample
References
----------
[1] N. Matloff, From Algorithms to Z-Scores: Probabilistic and Statistical
Modeling in Computer Science.
Available: http://heather.cs.ucdavis.edu/probstatbook
"""
if confidence <= 0 or confidence >= 1:
raise ValueError('The confidence parameter must be greater than 0 and '
'smaller than 1')
n = len(data)
w = np.mean(data)
s = np.std(data)
err = ss.norm.interval(confidence)[1]
return w, err * s / math.sqrt(n)
[docs]def proportions_confidence_interval(data, confidence):
"""Computes the confidence interval of a proportion.
Parameters
----------
data : array-like of bool
The sample of data whose proportion of True values needs to be
estimated
confidence : float, optional
The confidence level. It must be a value in the interval (0, 1)
References
----------
[1] N. Matloff, From Algorithms to Z-Scores: Probabilistic and Statistical
Modeling in Computer Science.
Available: http://heather.cs.ucdavis.edu/probstatbook
"""
if confidence <= 0 or confidence >= 1:
raise ValueError('The confidence parameter must be greater than 0 and '
'smaller than 1')
n = float(len(data))
m = len((i for i in data if i is True))
p = m / n
err = ss.norm.interval(confidence)[1]
return p, err * math.sqrt(p * (1 - p) / n)
[docs]def cdf(data):
"""Return the empirical CDF of a set of 1D data
Parameters
----------
data : array-like
Array of data
Returns
-------
x : array
All occurrences of data sorted
cdf : array
The CDF of data.
More specifically cdf[i] is the probability that x < x[i]
"""
if len(data) < 1:
raise TypeError("data must have at least one element")
freq_dict = collections.Counter(data)
sorted_unique_data = np.sort(list(freq_dict.keys()))
freqs = np.zeros(len(sorted_unique_data))
for i in range(len(freqs)):
freqs[i] = freq_dict[sorted_unique_data[i]]
# freqs = np.array([freq_dict[sorted_unique_data[i]]
# for i in range(len(sorted_unique_data))])
cdf = np.array(np.cumsum(freqs))
norm = cdf[-1]
cdf = cdf / norm # normalize
cdf[-1] = 1.0 # Prevent rounding errors
return sorted_unique_data, cdf
[docs]def pdf(data, n_bins):
"""Return the empirical PDF of a set of 1D data
Parameters
----------
data : array-like
Array of data
n_bins : int
The number of bins
Returns
x : array
The center point of all bins
pdf : array
The PDF of data.
"""
# Validate input parameters
if len(data) < 1:
raise TypeError("data must have at least one element")
if not isinstance(n_bins, int):
raise TypeError("intervals parameter must be an integer")
if n_bins < 1:
raise TypeError("Intervals must be >= 1")
# Sort data and divide it in sections
data = np.sort(data)
data_min = data[0]
data_max = data[-1]
boundaries = np.linspace(data_min, data_max, n_bins + 1)
x = boundaries[:-1] + ((boundaries[1] - boundaries[0]) / 2.0)
# Count number of samples in each section
pdf = np.zeros(n_bins)
section = 0
for entry in data:
if entry <= boundaries[section + 1]:
pdf[section] += 1
else:
section += 1
while entry > boundaries[section + 1]:
section += 1
pdf[section] += 1
# Normalize pdf
pdf = (pdf * n_bins) / (np.sum(pdf) * (data_max - data_min))
return x, pdf