OpenMPCD
OnTheFlyStatisticsDDDA.py
1 # coding=utf-8
2 
3 """
4 @package MPCDAnalysis.OnTheFlyStatisticsDDDA
5 Defines the `OnTheFlyStatisticsDDDA` class.
6 """
7 
9  """
10  Computes sample means and their errors for (possibly) serially correlated
11  data.
12 
13  The algorithm used is called "Dynamic Distributable Decorrelation
14  Algorithm" (DDDA), and is described in
15  "Efficient algorithm for “on-the-fly” error analysis of local or distributed
16  serially correlated data"
17  by David R. Kent IV, Richard P. Muller, Amos G. Anderson,
18  William A. Goddard III, and Michael T. Feldmann,
19  Journal of Computational Chemistry,
20  November 2007, Vol. 28, No. 14, pp. 2309-2316,
21  DOI: 10.1002/jcc.20746,
22  which in turn is partly based on the article
23  "Error estimates on averages of correlated data"
24  by H. Flyvbjerg and H. G. Petersen,
25  The Journal of Chemical Physics, July 1989, Vol. 91, No. 1, pp. 461-466
26  DOI: 10.1063/1.457480
27  """
28 
29  def __init__(self):
30  """
31  The constructor.
32  """
33 
34  from .OnTheFlyStatistics import OnTheFlyStatistics
35 
36  self._blocks = [OnTheFlyStatistics()]
37  self._waiting = [None]
38 
39 
40  def addDatum(self, datum):
41  """
42  Adds a datum to the sample.
43 
44  It is assumed that the "time" intervals between subsequently added data
45  are constant; here, "time" may, for example, refer to Molecular Dynamics
46  or Monte Carlo steps.
47  """
48 
49  self._addDatum(datum, 0)
50 
51 
52  def merge(self, rhs):
53  """
54  Merges the information in `rhs` with this instance's.
55 
56  Let \f$ x_1, x_2, \ldots, x_{N_1} \f$ be the data that have been
57  supplied to this instance prior to calling this function, with
58  \f$ N_1 \f$ being the sample size of this instance prior to calling this
59  function. Similarly, let \f$ y_1, y_2, \ldots, y_{N_2} \f$ be the data
60  supplied to `rhs` previously.
61  Then, after this function successfully returns, this instance's state is
62  approximately as if it had started empty, and then the data
63  \f$ x_1, x_2, \ldots, x_{N_1}, y_1, y_2, \ldots, y_{N_2} \f$
64  had been supplied. This is true to a high degree for the state of block
65  `0`; other blocks may have significantly different state due to the way
66  blocks are populated with data points. The implementation is equivalent
67  to the one named `Decorrelation.addition` in Appendix B of
68  @cite Kent2007.
69 
70  The `rhs` instance is left unchanged (unless, of course, it is the same
71  object as this instance, which is allowed).
72 
73  @throw TypeError
74  Throws if `rhs` is not of type `OnTheFlyStatisticsDDDA`.
75 
76  @param[in] rhs
77  The instance to merge into this one. Must be an instance of
78  `OnTheFlyStatisticsDDDA`.
79  """
80 
81  if not isinstance(rhs, OnTheFlyStatisticsDDDA):
82  raise TypeError()
83 
84 
85  from .OnTheFlyStatistics import OnTheFlyStatistics
86  while len(rhs._blocks) > len(self._blocks):
87  self._blocks.append(OnTheFlyStatistics())
88  self._waiting.append(None)
89 
90  for i in range(0, len(self._blocks)):
91  if i >= len(rhs._blocks):
92  break
93  self._blocks[i].mergeSample(rhs._blocks[i])
94 
95  for i in range(0, len(self._blocks)):
96  if i >= len(rhs._blocks):
97  break
98 
99  if rhs._waiting[i] is None:
100  continue
101 
102  if self._waiting[i] is None:
103  import copy
104  self._waiting[i] = copy.deepcopy(rhs._waiting[i])
105  else:
106  mean = (self._waiting[i] + rhs._waiting[i]) / 2.0
107  self._waiting[i] = None
108 
109  if i + 1 == len(self._blocks):
110  self._blocks.append(OnTheFlyStatistics())
111  self._waiting.append(None)
112 
113  self._addDatum(mean, i + 1)
114 
115 
116  def getSampleSize(self):
117  """
118  Returns the number of data points added so far.
119  """
120 
121  return self._blocks[0].getSampleSize()
122 
123 
124  def getSampleMean(self):
125  """
126  Returns the mean of all the values added so far.
127 
128  Since the mean of all values added is returned, and the sample size may
129  not be a power of 2, statistics with different blocking length may
130  not incorporate the same amount of information. This may lead to
131  difficulties when using the error estimates of statistics of different
132  block lengths to estimate the error in the entire, possibly correlated
133  data set, since the statistics of different blocking lengths do not
134  necessarily incorporate the same measurements.
135 
136  If no values have been added so far, `Exception` is raised.
137  """
138 
139  if self.getSampleSize() == 0:
140  raise Exception("Tried to get mean without having supplied data.")
141 
142  return self._blocks[0].getSampleMean()
143 
144 
145  def getMaximumBlockSize(self):
146  """
147  Returns the largest block size for which there is at least one data
148  point.
149 
150  If no values have been added so far, `Exception` is raised.
151  """
152 
153  if self.getSampleSize() == 0:
154  raise Exception()
155 
156  return 2 ** self.getMaximumBlockID()
157 
158 
159  def getMaximumBlockID(self):
160  """
161  Returns the ID of the largest block size created so far.
162  """
163 
164  return len(self._blocks) - 1
165 
166 
167  def hasBlockVariance(self, blockID):
168  """
169  Returns whether the block with the given `blockID` has enough data to
170  compute a sample variance.
171 
172  @param[in] blockID
173  The block ID, which must be an integer in the range
174  [0, `getMaximumBlockID()`].
175  """
176 
177  if not isinstance(blockID, int):
178  raise TypeError()
179 
180  if blockID < 0 or blockID > self.getMaximumBlockID():
181  raise ValueError()
182 
183  return self._blocks[blockID].getSampleSize() >= 2
184 
185 
186  def getBlockVariance(self, blockID):
187  """
188  Returns the sample variance in the block with the given `blockID`.
189 
190  @throw RuntimeError
191  Throws if `not self.hasBlockVariance(blockID)`.
192 
193  @param[in] blockID
194  The block ID, which must be an integer in the range
195  [0, `getMaximumBlockID()`].
196  """
197 
198  if not isinstance(blockID, int):
199  raise TypeError()
200 
201  if blockID < 0 or blockID > self.getMaximumBlockID():
202  raise ValueError()
203 
204  if not self.hasBlockVariance(blockID):
205  raise RuntimeError()
206 
207  return self._blocks[blockID].getSampleVariance()
208 
209 
210  def getBlockStandardDeviation(self, blockID):
211  """
212  Returns the sample standard deviation in the block with the given
213  `blockID`.
214 
215  @throw TypeError
216  Throws if `blockID` is not of type `int`.
217  @throw ValueError
218  Throws if `blockID` is out of range.
219  @throw RuntimeError
220  Throws if `not self.hasBlockVariance(blockID)`.
221 
222  @param[in] blockID
223  The block ID, which must be an integer in the range
224  [0, `getMaximumBlockID()`].
225  """
226 
227  import math
228  return math.sqrt(self.getBlockVariance(blockID))
229 
230 
231  def getSampleStandardDeviation(self):
232  """
233  Returns the raw sample standard deviation, i.e. the sample standard
234  deviation in block `0`.
235 
236  @throw RuntimeError
237  Throws if `not self.hasBlockVariance(0)`.
238  """
239 
240  return self.getBlockStandardDeviation(0)
241 
242 
243  def getBlockStandardErrorOfTheMean(self, blockID):
244  """
245  Returns an estimate for the standard deviation of the standard error of
246  the mean for a given `blockID`.
247 
248  @throw RuntimeError
249  Throws if `not self.hasBlockVariance(blockID)`.
250 
251  @param[in] blockID
252  The block ID, which must be an integer in the range
253  [0, `getMaximumBlockID()`].
254  """
255 
256  if not isinstance(blockID, int):
257  raise TypeError()
258 
259  if blockID < 0 or blockID > self.getMaximumBlockID():
260  raise ValueError()
261 
262  if not self.hasBlockVariance(blockID):
263  raise RuntimeError()
264 
265  return self._blocks[blockID].getStandardErrorOfTheMean()
266 
267 
268 
270  self, blockID):
271  """
272  Returns an estimate for the standard deviation of the standard error of
273  the mean for a given `blockID`.
274 
275  The returned estimate corresponds to Eq. (28) in
276  "Error estimates on averages of correlated data"
277  by H. Flyvbjerg and H. G. Petersen,
278  The Journal of Chemical Physics, July 1989, Vol. 91, No. 1, pp. 461-466
279  DOI: 10.1063/1.457480
280 
281  @throw RuntimeError
282  Throws if `not self.hasBlockVariance(blockID)`.
283 
284  @param[in] blockID
285  The block ID, which must be an integer in the range
286  [0, `getMaximumBlockID()`].
287  """
288 
289  if not isinstance(blockID, int):
290  raise TypeError()
291 
292  if blockID < 0 or blockID > self.getMaximumBlockID():
293  raise ValueError()
294 
295  if not self.hasBlockVariance(blockID):
296  raise RuntimeError()
297 
298  se = self.getBlockStandardErrorOfTheMean(blockID)
299  reducedSampleSize = self._blocks[blockID].getSampleSize()
300 
301  import math
302  return se / math.sqrt(2 * reducedSampleSize)
303 
304 
306  """
307  Returns the block ID corresponding to the optimal block size, in the
308  sense that the corresponding block provides the most accurate estimate
309  for the standard error of the mean.
310 
311  If there is no variance in the data, `0` is returned.
312 
313  The algorithm used is described in Section IV of the article
314  "Strategies for improving the efficiency of quantum Monte Carlo
315  calculations"
316  by R. M. Lee, G. J. Conduit, N. Nemec, P. López Ríos, and
317  N. D. Drummond,
318  Physical Review E, June 2011, Vol. 83, No. 6, pp. 066706
319  DOI: 10.1103/PhysRevE.83.066706
320 
321  @throw RuntimeError
322  Throws if fewer than two data points have been supplied.
323  """
324 
325  if self.getSampleSize() < 2:
326  raise RuntimeError()
327 
328  rawStandardError = \
329  self._blocks[0].getStandardErrorOfTheMean()
330 
331  if rawStandardError == 0:
332  return 0
333 
334  optimalBlockID = self.getMaximumBlockID()
335 
336  for blockID in range(self.getMaximumBlockID(), -1, -1):
337  if not self.hasBlockVariance(blockID):
338  assert blockID == self.getMaximumBlockID()
339 
340  optimalBlockID -= 1
341  continue
342 
343  blockSize = 2 ** blockID
344  blockedStandardError = \
345  self._blocks[blockID].getStandardErrorOfTheMean()
346  quotient = blockedStandardError / rawStandardError
347  threshold = 2 * self.getSampleSize() * quotient ** 4
348  if blockSize ** 3 > threshold:
349  optimalBlockID = blockID
350 
351  return optimalBlockID
352 
353 
355  """
356  Returns whether the sample is large enough for the estimate of the
357  standard error of the mean, as provided by the block indicated by
358  `getOptimalBlockIDForStandardErrorOfTheMean`, to be reliable.
359 
360  The algorithm used is described in Section IV of the article
361  "Strategies for improving the efficiency of quantum Monte Carlo
362  calculations"
363  by R. M. Lee, G. J. Conduit, N. Nemec, P. López Ríos, and
364  N. D. Drummond,
365  Physical Review E, June 2011, Vol. 83, No. 6, pp. 066706
366  DOI: 10.1103/PhysRevE.83.066706
367 
368  @throw RuntimeError
369  Throws if fewer than two data points have been supplied.
370  """
371 
373  blockSize = 2 ** blockID
374 
375  return 50 * blockSize < self.getSampleSize()
376 
377 
379  """
380  Returns the best estimation of the true standard error of the mean of
381  the data, after decorrelation.
382 
383  @see optimalStandardErrorOfTheMeanEstimateIsReliable
384 
385  The algorithm used is described in Section IV of the article
386  "Strategies for improving the efficiency of quantum Monte Carlo
387  calculations"
388  by R. M. Lee, G. J. Conduit, N. Nemec, P. López Ríos, and
389  N. D. Drummond,
390  Physical Review E, June 2011, Vol. 83, No. 6, pp. 066706
391  DOI: 10.1103/PhysRevE.83.066706
392 
393  @throw RuntimeError
394  Throws if fewer than two data points have been supplied.
395  """
396 
398 
399  return self.getBlockStandardErrorOfTheMean(blockID)
400 
401 
402  def serializeToString(self):
403  """
404  Returns a `str` that contains the state of this instance.
405 
406  @see unserializeFromString
407  """
408 
409  ret = ""
410 
411  ret += "1|" #format version
412  ret += str(len(self._blocks))
413  for block in self._blocks:
414  ret += "|" + block.serializeToString()
415  for waiting in self._waiting:
416  ret += "|"
417  if waiting is not None:
418  ret += str(waiting)
419 
420  return ret
421 
422 
423  def unserializeFromString(self, state):
424  """
425  Discards the current state, and loads the state specified in the given
426  string instead.
427 
428  @throw TypeError
429  Throws if `state` is of the wrong type.
430  @throw ValueError
431  Throws if `state` does not encode a valid state.
432 
433  @param[in] state
434  The state to load. Must be a `str` created by
435  `serializeToString`.
436  """
437 
438  if not isinstance(state, str):
439  raise TypeError()
440 
441  parts = state.split("|")
442  if int(parts[0]) != 1:
443  raise ValueError()
444 
445  blockCount = int(parts[1])
446  if blockCount < 0:
447  raise ValueError()
448 
449  if len(parts) != 2 + 2 * blockCount:
450  raise ValueError()
451 
452  from .OnTheFlyStatistics import OnTheFlyStatistics
453  blocks = []
454  for i in range(0, blockCount):
455  block = OnTheFlyStatistics()
456  block.unserializeFromString(parts[2 + i])
457  blocks.append(block)
458 
459  waiting = []
460  for i in range(0, blockCount):
461  part = parts[2 + blockCount + i]
462  if len(part) == 0:
463  waiting.append(None)
464  else:
465  waiting.append(float(part))
466 
467 
468  if blockCount == 0:
469  blocks = [OnTheFlyStatistics()]
470  waiting = [None]
471 
472  self._blocks = blocks
473  self._waiting = waiting
474 
475 
476  def getMPLAxes(self, showEstimatedStandardDeviation = True):
477  """
478  Returns an `matplotlib.axes.Axes` object that plots the estimated
479  standard error of the mean, and optionally its estimated standard
480  deviation, as a function of the base-2 logarithm of the block size.
481 
482  @throw TypeError
483  Throws if any of the arguments have invalid types.
484 
485  @param[in] showEstimatedStandardDeviation
486  Whether to show, for each data point, the estimated standard
487  deviation of the estimated standard error of the mean.
488  Must be a `bool` value.
489  """
490 
491  if not isinstance(showEstimatedStandardDeviation, bool):
492  raise TypeError
493 
494 
495  import matplotlib.figure
496 
497  figure = matplotlib.figure.Figure()
498  axes = figure.add_subplot(1, 1, 1)
499 
500  blockSizes = []
501  values = []
502  for blockID in range(0, self.getMaximumBlockID() + 1):
503  if not self.hasBlockVariance(blockID):
504  continue
505 
506  blockSizes.append(blockID)
507  values.append(self.getBlockStandardErrorOfTheMean(blockID))
508 
509  errorbars = None
510  if showEstimatedStandardDeviation:
511  getSD = \
513  errorbars = []
514  for blockID in range(0, self.getMaximumBlockID() + 1):
515  if not self.hasBlockVariance(blockID):
516  continue
517  errorbars.append(getSD(blockID))
518 
519  axes.errorbar(blockSizes, values, yerr = errorbars)
520 
521  axes.set_xlabel(r'$ \log_2 \left( BlockSize \right) $')
522  axes.set_ylabel(r'Estimated Standard Error of the Mean')
523 
524  return axes
525 
526 
527  def __eq__(self, rhs):
528  """
529  Returns whether this instance and `rhs` are equivalent, i.e. whether the
530  two instances behave as if they had been supplied the same data.
531 
532  @throw TypeError
533  Throws if `rhs` is not of type `OnTheFlyStatisticsDDDA`.
534 
535  @param[in] rhs
536  The `OnTheFlyStatisticsDDDA` instance to compare to.
537  """
538 
539  if not isinstance(rhs, self.__class__):
540  raise TypeError()
541 
542  return self.__dict__ == rhs.__dict__
543 
544 
545  def __ne__(self, rhs):
546  """
547  Returns the negation of `self == rhs`.
548 
549  @throw TypeError
550  Throws if `rhs` is not of type `OnTheFlyStatisticsDDDA`.
551 
552  @param[in] rhs
553  The `OnTheFlyStatisticsDDDA` instance to compare to.
554  """
555 
556  return not self == rhs
557 
558 
559  def __repr__(self):
560  """
561  Returns a string (type `str`) representation of this instance.
562  """
563 
564  import math
565 
566  ret = "OnTheFlyStatisticsDDDA: {"
567  ret += "Size: " + str(self.getSampleSize())
568  if self.getSampleSize() >= 1:
569  ret += ", Mean: " + str(self.getSampleMean())
570  if self.getSampleSize() >= 2:
571  ret += ", Standard Deviation [blockID=0]: "
572  ret += str(math.sqrt(self.getBlockVariance(0)))
573  ret += ", Variance [blockID=0]: "
574  ret += str(self.getBlockVariance(0))
575  ret += "}\n"
576  return ret
577 
578 
579  def _addDatum(self, datum, blockID):
580  self._blocks[blockID].addDatum(datum)
581 
582  if self._waiting[blockID] is None:
583  self._waiting[blockID] = datum
584  return
585 
586  mean = (self._waiting[blockID] + datum) / 2.0
587  self._waiting[blockID] = None
588  if blockID + 1 == len(self._blocks):
589  from .OnTheFlyStatistics import OnTheFlyStatistics
590  self._blocks.append(OnTheFlyStatistics())
591  self._waiting.append(None)
592 
593  self._addDatum(mean, blockID + 1)
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getEstimatedStandardDeviationOfBlockStandardErrorOfTheMean
def getEstimatedStandardDeviationOfBlockStandardErrorOfTheMean(self, blockID)
Definition: OnTheFlyStatisticsDDDA.py:301
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getSampleStandardDeviation
def getSampleStandardDeviation(self)
Definition: OnTheFlyStatisticsDDDA.py:251
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA._blocks
_blocks
Definition: OnTheFlyStatisticsDDDA.py:39
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getBlockVariance
def getBlockVariance(self, blockID)
Definition: OnTheFlyStatisticsDDDA.py:207
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getMPLAxes
def getMPLAxes(self, showEstimatedStandardDeviation=True)
Definition: OnTheFlyStatisticsDDDA.py:510
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.unserializeFromString
def unserializeFromString(self, state)
Definition: OnTheFlyStatisticsDDDA.py:456
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA
Definition: OnTheFlyStatisticsDDDA.py:29
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getBlockStandardErrorOfTheMean
def getBlockStandardErrorOfTheMean(self, blockID)
Definition: OnTheFlyStatisticsDDDA.py:268
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.__init__
def __init__(self)
Definition: OnTheFlyStatisticsDDDA.py:35
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.optimalStandardErrorOfTheMeanEstimateIsReliable
def optimalStandardErrorOfTheMeanEstimateIsReliable(self)
Definition: OnTheFlyStatisticsDDDA.py:387
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getSampleMean
def getSampleMean(self)
Definition: OnTheFlyStatisticsDDDA.py:144
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA._addDatum
def _addDatum(self, datum, blockID)
Definition: OnTheFlyStatisticsDDDA.py:603
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getBlockStandardDeviation
def getBlockStandardDeviation(self, blockID)
Definition: OnTheFlyStatisticsDDDA.py:237
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.serializeToString
def serializeToString(self)
Definition: OnTheFlyStatisticsDDDA.py:426
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.__dict__
__dict__
Definition: OnTheFlyStatisticsDDDA.py:564
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.merge
def merge(self, rhs)
Definition: OnTheFlyStatisticsDDDA.py:84
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.hasBlockVariance
def hasBlockVariance(self, blockID)
Definition: OnTheFlyStatisticsDDDA.py:185
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.__eq__
def __eq__(self, rhs)
Definition: OnTheFlyStatisticsDDDA.py:559
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.__repr__
def __repr__(self)
Definition: OnTheFlyStatisticsDDDA.py:586
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getOptimalBlockIDForStandardErrorOfTheMean
def getOptimalBlockIDForStandardErrorOfTheMean(self)
Definition: OnTheFlyStatisticsDDDA.py:339
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getMaximumBlockSize
def getMaximumBlockSize(self)
Definition: OnTheFlyStatisticsDDDA.py:159
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.addDatum
def addDatum(self, datum)
Definition: OnTheFlyStatisticsDDDA.py:51
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA._waiting
_waiting
Definition: OnTheFlyStatisticsDDDA.py:40
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.__ne__
def __ne__(self, rhs)
Definition: OnTheFlyStatisticsDDDA.py:577
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getOptimalStandardErrorOfTheMean
def getOptimalStandardErrorOfTheMean(self)
Definition: OnTheFlyStatisticsDDDA.py:413
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getMaximumBlockID
def getMaximumBlockID(self)
Definition: OnTheFlyStatisticsDDDA.py:171
MPCDAnalysis.OnTheFlyStatisticsDDDA.OnTheFlyStatisticsDDDA.getSampleSize
def getSampleSize(self)
Definition: OnTheFlyStatisticsDDDA.py:125