OpenMPCD
OnTheFlyStatistics.py
1 # coding=utf-8
2 
3 import math
4 
5 class OnTheFlyStatistics:
6  """
7  Computes the sample mean and variance of provided data incrementally.
8 
9  The algorithm used is described in the "updating formulas" (1.3) in the paper
10  "Algorithms for Computing the Sample Variance: Analysis and Recommendations"
11  by Tony F. Chan, Gene H. Golub, and Randall J. LeVeque,
12  The American Statistician, August 1983, Vol. 37, No. 3, pp. 242-247
13  DOI: 10.2307/2683386
14 
15  See also "Formulas for Robust, One-Pass Parallel Computation of Covariances
16  and Arbitrary-Order Statistical Moments" by Philippe Pébay, Sandia Report
17  SAND2008-6212, 2008.
18  """
19 
20  def __init__(self, mean = None, sampleVariance = None, sampleSize = None):
21  """
22  The constructor.
23 
24  The parameters `mean`, `sampleVariance`, and `sampleSize` can either be
25  `None`, in which case an instance without any data points is
26  constructed, or all three parameters can be specified, in which case
27  the instance is constructed as if data were given that result in the
28  specified parameters.
29  """
30 
31  parameters = [mean, sampleVariance, sampleSize]
32 
33  if mean is None:
34  for parameter in parameters:
35  if parameter is not None:
36  raise Exception()
37 
38  self.n = 0
39  self.M = 0.0
40  self.S = 0.0
41 
42  return
43 
44  for parameter in parameters:
45  if parameter is None:
46  raise Exception()
47 
48  self.n = sampleSize
49  self.M = mean
50  self.S = sampleVariance * (self.n - 1)
51 
52 
53  def addDatum(self, datum):
54  """
55  Adds a datum to the sample.
56  """
57 
58  delta = datum - self.M
59 
60  self.n = self.n + 1
61  self.M = self.M + delta / self.n
62  self.S = self.S + delta * (datum - self.M)
63 
64 
65  def mergeSample(self, sample):
66  """
67  Merges the given sample with this one.
68 
69  This assumes that both samples are drawn from the same population.
70  """
71 
72  if self.n == 0:
73  self.n = sample.n
74  self.M = sample.M
75  self.S = sample.S
76  return
77 
78  if self.n == 1:
79  myValue = self.M
80 
81  self.n = sample.n
82  self.M = sample.M
83  self.S = sample.S
84 
85  self.addDatum(myValue)
86  return
87 
88  if sample.n == 0:
89  return
90 
91  if sample.n == 1:
92  self.addDatum(sample.getSampleMean())
93  return
94 
95  mergedSampleSize = 0
96  for s in [self, sample]:
97  mergedSampleSize += s.getSampleSize()
98 
99  mergedMean = 0
100  for s in [self, sample]:
101  mergedMean += s.getSampleSize() * s.getSampleMean()
102  mergedMean /= mergedSampleSize
103 
104  mergedS = 0
105  for s in [self, sample]:
106  mergedS += (s.getSampleSize() - 1) * s.getSampleVariance()
107 
108  n = self.getSampleSize()
109  m = sample.getSampleSize()
110  c = (self.getSampleMean() - sample.getSampleMean())
111  c /= n + m
112  c **= 2
113  c *= n * m * (n + m)
114  mergedS += c
115 
116  self.n = mergedSampleSize
117  self.M = mergedMean
118  self.S = mergedS
119 
120 
121  def mergeSamples(self, samples):
122  """
123  Merges the given samples with this one.
124 
125  This assumes that all samples are drawn from the same population.
126  """
127 
128  for sample in samples:
129  self.mergeSample(sample)
130 
131 
132  def getSampleSize(self):
133  """
134  Returns the number of data points added so far.
135  """
136 
137  return self.n
138 
139  def getSampleMean(self):
140  """
141  Returns the mean of all the values added so far.
142 
143  If no values have been added so far, an exception is thrown.
144  """
145 
146  if self.n == 0:
147  raise ValueError("Tried to get mean without having supplied data.")
148 
149  return self.M
150 
151  def getSampleVariance(self):
152  """
153  Returns the unbiased sample variance of all the values added so far.
154 
155  The returned value contains Bessel's correction, i.e. the sum of
156  squares of differences is divided by \f$ n - 1 \f$ rather than
157  \f$ n \f$, where \f$ n \f$ is the sample size.
158 
159  If fewer than two values have been added so far, an exception is thrown.
160  """
161 
162  if self.n <= 1:
163  raise ValueError("Tried to get variance without having supplied enough data.")
164 
165  return self.S / (self.n - 1)
166 
167  def getSampleStandardDeviation(self):
168  """
169  Returns the unbiased sample standard deviation of all the values added
170  so far.
171 
172  The returned value contains Bessel's correction, i.e. the sum of
173  squares of differences is divided by \f$ n - 1 \f$ rather than
174  \f$ n \f$, where \f$ n \f$ is the sample size.
175 
176  If fewer than two values have been added so far, an exception is thrown.
177  """
178 
179  return math.sqrt(self.getSampleVariance())
180 
181 
182  def getStandardErrorOfTheMean(self):
183  """
184  Returns the standard error of the mean, i.e. the unbiased sample
185  standard deviation divided by the square root of the sample size.
186  """
187 
188  return \
189  self.getSampleStandardDeviation() / math.sqrt(self.getSampleSize())
190 
191 
192  def serializeToString(self):
193  """
194  Returns a `str` that contains the state of this instance.
195 
196  @see unserializeFromString
197  """
198 
199  ret = ""
200 
201  ret += "1;" #format version
202  ret += str(self.n) + ";"
203  ret += str(self.M) + ";"
204  ret += str(self.S)
205 
206  return ret
207 
208 
209  def unserializeFromString(self, state):
210  """
211  Discards the current state, and loads the state specified in the given
212  string instead.
213 
214  @throw TypeError
215  Throws if `state` is of the wrong type.
216  @throw ValueError
217  Throws if `state` does not encode a valid state.
218 
219  @param[in] state
220  The state to load. Must be a `str` created by
221  `serializeToString`.
222  """
223 
224  if not isinstance(state, str):
225  raise TypeError()
226 
227  parts = state.split(";")
228  if int(parts[0]) != 1:
229  raise ValueError()
230 
231  if len(parts) != 4:
232  raise ValueError()
233 
234  n = int(parts[1])
235  M = float(parts[2])
236  S = float(parts[3])
237 
238  if n < 0:
239  raise ValueError()
240  if n == 0:
241  if M != 0 or S != 0:
242  raise ValueError()
243  if S < 0:
244  raise ValueError()
245 
246  self.n = n
247  self.M = M
248  self.S = S
249 
250 
251  def __eq__(self, rhs):
252  """
253  The equality operator.
254 
255  Returns whether the state of this object is the same as the state of the
256  `rhs` object; if `rhs` is not of this instance's type, `NotImplemented`
257  is returned.
258 
259  @param[in] rhs The right-hand-side instance to compare to.
260  """
261 
262  if not isinstance(rhs, self.__class__):
263  return NotImplemented
264 
265  return self.__dict__ == rhs.__dict__
266 
267 
268  def __ne__(self, rhs):
269  """
270  The inequality operator.
271 
272  Returns whether the state of this object differs from the state of the
273  `rhs` object; if `rhs` is not of this instance's type, `NotImplemented`
274  is returned.
275 
276  @param[in] rhs The right-hand-side instance to compare to.
277  """
278 
279  if not isinstance(rhs, self.__class__):
280  return NotImplemented
281 
282  return not self == rhs
283 
284 
285  def __hash__(self):
286  """
287  Returns a hash of this object that depends on its state, and nothing
288  more.
289  """
290 
291  return hash(tuple(sorted(self.__dict__.items())))
292 
293 
294  def __repr__(self):
295  """
296  Returns a `str` describing the data of this instance.
297  """
298 
299  ret = "OnTheFlyStatistics: {"
300  ret += "Size: " + str(self.getSampleSize())
301  if self.getSampleSize() >= 1:
302  ret += ", Mean: " + str(self.getSampleMean())
303  if self.getSampleSize() >= 2:
304  ret += ", Standard Deviation: "
305  ret += str(self.getSampleStandardDeviation())
306  ret += ", Variance: " + str(self.getSampleVariance())
307  ret += "}\n"
308  return ret
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.unserializeFromString
def unserializeFromString(self, state)
Definition: OnTheFlyStatistics.py:234
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.getSampleMean
def getSampleMean(self)
Definition: OnTheFlyStatistics.py:151
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.__ne__
def __ne__(self, rhs)
Definition: OnTheFlyStatistics.py:291
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.getSampleVariance
def getSampleVariance(self)
Definition: OnTheFlyStatistics.py:168
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.M
M
Definition: OnTheFlyStatistics.py:41
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.mergeSample
def mergeSample(self, sample)
Definition: OnTheFlyStatistics.py:74
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.getSampleStandardDeviation
def getSampleStandardDeviation(self)
Definition: OnTheFlyStatistics.py:186
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.getStandardErrorOfTheMean
def getStandardErrorOfTheMean(self)
Definition: OnTheFlyStatistics.py:196
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.n
n
Definition: OnTheFlyStatistics.py:40
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.S
S
Definition: OnTheFlyStatistics.py:42
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.mergeSamples
def mergeSamples(self, samples)
Definition: OnTheFlyStatistics.py:131
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.serializeToString
def serializeToString(self)
Definition: OnTheFlyStatistics.py:208
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.__dict__
__dict__
Definition: OnTheFlyStatistics.py:278
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.__init__
def __init__(self, mean=None, sampleVariance=None, sampleSize=None)
Definition: OnTheFlyStatistics.py:31
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.__eq__
def __eq__(self, rhs)
Definition: OnTheFlyStatistics.py:273
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.__hash__
def __hash__(self)
Definition: OnTheFlyStatistics.py:304
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.__repr__
def __repr__(self)
Definition: OnTheFlyStatistics.py:313
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.getSampleSize
def getSampleSize(self)
Definition: OnTheFlyStatistics.py:141
MPCDAnalysis.OnTheFlyStatistics.OnTheFlyStatistics.addDatum
def addDatum(self, datum)
Definition: OnTheFlyStatistics.py:59