StatisticsAlgorithm.h

Go to the documentation of this file.
00001 //# Copyright (C) 2000,2001
00002 //# Associated Universities, Inc. Washington DC, USA.
00003 //#
00004 //# This library is free software; you can redistribute it and/or modify it
00005 //# under the terms of the GNU Library General Public License as published by
00006 //# the Free Software Foundation; either version 2 of the License, or (at your
00007 //# option) any later version.
00008 //#
00009 //# This library is distributed in the hope that it will be useful, but WITHOUT
00010 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
00011 //# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
00012 //# License for more details.
00013 //#
00014 //# You should have received a copy of the GNU Library General Public License
00015 //# along with this library; if not, write to the Free Software Foundation,
00016 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
00017 //#
00018 //# Correspondence concerning AIPS++ should be addressed as follows:
00019 //#        Internet email: aips2-request@nrao.edu.
00020 //#        Postal address: AIPS++ Project Office
00021 //#                        National Radio Astronomy Observatory
00022 //#                        520 Edgemont Road
00023 //#                        Charlottesville, VA 22903-2475 USA
00024 //#
00025 //# $Id: Array.h 21545 2015-01-22 19:36:35Z gervandiepen $
00026 
00027 #ifndef SCIMATH_STATSALGORITHM_H
00028 #define SCIMATH_STATSALGORITHM_H
00029 
00030 #include <casacore/casa/aips.h>
00031 #include <casacore/casa/Exceptions/Error.h>
00032 #include <casacore/casa/Utilities/CountedPtr.h>
00033 #include <casacore/scimath/Mathematics/StatsDataProvider.h>
00034 #include <casacore/scimath/Mathematics/StatisticsData.h>
00035 #include <casacore/scimath/Mathematics/StatisticsTypes.h>
00036 
00037 #include <map>
00038 #include <set>
00039 #include <vector>
00040 
00041 // because the template signature has become unwieldy
00042 #define CASA_STATD template <class AccumType, class DataIterator, class MaskIterator, class WeightsIterator>
00043 #define CASA_STATP AccumType, DataIterator, MaskIterator, WeightsIterator
00044 
00045 namespace casacore {
00046 
00047 // Base class of statistics algorithm class hierarchy.
00048 
00049 // The default implementation is such that statistics are only calculated when
00050 // getStatistic() or getStatistics() is called. Until then, the iterators which
00051 // point to the beginning of data sets, masks, etc. are held in memory. Thus,
00052 // the caller must keep all data sets available for the statistics object until
00053 // these methods are called, and of course, if the actual data values are changed
00054 // between adding data and calculating statistics, the updated values are used when
00055 // calculating statistics. Derived classes may override this behavior.
00056 //
00057 // PRECISION CONSIDERATIONS
00058 // Many statistics are computed via accumulators. This can lead to precision issues,
00059 // especially for large datasets. For this reason, it is highly recommended that the
00060 // data type one uses as the AccumType be of higher precision, if possible, than the
00061 // data type pointed to by input iterator. So for example, if one has a data set of
00062 // Float values (to which the InputIterator type points to), then one should use type
00063 // Double for the AccumType. In this case, the Float data values will be converted to
00064 // Doubles before they are accumulated.
00065 //
00066 // METHODS OF PROVIDING DATA
00067 // Data may be provided in one of two mutually exclusive ways. The first way is
00068 // simpler, and that is to use the setData()/addData() methods. Calling setData() will
00069 // clear any previous data that was added via these methods or via a data provider (see
00070 // below). Calling addData() subsequently to setData() will add a data set to the set
00071 // of data sets on which statistics will be calculated. In order for this to work
00072 // correctly, the iterators which are passed into these methods must still be valid when
00073 // statistics are calculated (although note that some derived classes allow certain
00074 // statistics to be updated as data sets are added via these methods. See specific
00075 // classes for details).
00076 //
00077 // The second way to provide data is via a data provider. This takes the form of
00078 // a derived class of StatsDataProvider, in which various methods are implemented for
00079 // retrieving various information about the data sets. Such an interface is necessary for
00080 // data which does not easily lend itself to be provided via the setData()/addData()
00081 // methods. For example, in the case of iterating through a lattice, a lattice iterator
00082 // will overwrite the memory location of the previous chunk of data with the current chunk
00083 // of data. Therefore, if one does not wish to load data from the entire lattice into
00084 // memory (which is why LatticeIterator was designed in this way), one must the
00085 // LatticeStatsDataProvider class, which the statistics framework will use to iteratate
00086 // through the lattice, only keeping one chunk of the data of the lattice in memory at one
00087 // time.
00088 //
00089 // QUANTILES
00090 // A quantile is a value contained in a data set, such that, it has a zero-based
00091 // index of ceil(q*n)-1 in the equivalent ordered dataset, where 0 < q < 1 specifies
00092 // the fractional location within the ordered dataset and n is the total number of elements.
00093 // Note that, for a dataset with an odd number of elements, the median is the same as
00094 // the quantile value when q = 0.5. However, there is no such correspondance between the
00095 // median in a dataset with an even number of elements, since the median in that case is
00096 // given by the mean of the elements of zero-based indeces n/2-1 and n/2 in the equivalent
00097 // ordered dataset. Thus, in the case of a dataset with an even number of values, the
00098 // median may not even exist in the dataset, while a quantile value must exist in the
00099 // dataset.  Note when calculating quantile values, a dataset that does not fall in
00100 // specified dataset ranges, is not included via a stride specification, is masked, or
00101 // has a weight of zero is not considered a member of the dataset for the pruposes of
00102 // quantile calculations.
00103 
00104 template <class AccumType, class DataIterator, class MaskIterator=const Bool *, class WeightsIterator=DataIterator>
00105 class StatisticsAlgorithm {
00106 
00107 public:
00108 
00109     virtual ~StatisticsAlgorithm();
00110 
00111     // <group>
00112     // Add a dataset to an existing set of datasets on which statistics are
00113     // to be calculated. nr is the number of points to be considered.
00114     // If <src>dataStride</src> is greater than 1, when <src>nrAccountsForStride</src>=True indicates
00115     // that the stride has been taken into account in the value of <src>nr</src>. Otherwise, it has
00116     // not so that the actual number of points to include is nr/dataStride if nr % dataStride == 0 or
00117     // (int)(nr/dataStride) + 1 otherwise.
00118     // If one calls this method after a data provider has been set, an exception will be
00119     // thrown. In this case, one should call setData(), rather than addData(), to indicate
00120     // that the underlying data provider should be removed.
00121     // <src>dataRanges</src> provide the ranges of data to include if <src>isInclude</src> is True,
00122     // or ranges of data to exclude if <src>isInclude</src> is False. If a datum equals the end point
00123     // of a data range, it is considered good (included) if <src>isInclude</src> is True, and it is
00124     // considered bad (excluded) if <src>isInclude</src> is False.
00125 
00126     virtual void addData(
00127         const DataIterator& first, uInt nr, uInt dataStride=1,
00128         Bool nrAccountsForStride=False
00129     );
00130 
00131     virtual void addData(
00132         const DataIterator& first, uInt nr,
00133         const DataRanges& dataRanges, Bool isInclude=True, uInt dataStride=1,
00134         Bool nrAccountsForStride=False
00135     );
00136 
00137     virtual void addData(
00138         const DataIterator& first, const MaskIterator& maskFirst,
00139         uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False, uInt maskStride=1
00140     );
00141 
00142     virtual void addData(
00143         const DataIterator& first, const MaskIterator& maskFirst,
00144         uInt nr, const DataRanges& dataRanges,
00145         Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False,
00146         uInt maskStride=1
00147     );
00148 
00149     virtual void addData(
00150         const DataIterator& first, const WeightsIterator& weightFirst,
00151         uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False
00152     );
00153 
00154     virtual void addData(
00155         const DataIterator& first, const WeightsIterator& weightFirst,
00156         uInt nr, const DataRanges& dataRanges,
00157         Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False
00158     );
00159 
00160     virtual void addData(
00161         const DataIterator& first, const WeightsIterator& weightFirst,
00162         const MaskIterator& maskFirst, uInt nr, uInt dataStride=1,
00163         Bool nrAccountsForStride=False,
00164         uInt maskStride=1
00165     );
00166 
00167     virtual void addData(
00168         const DataIterator& first, const WeightsIterator& weightFirst,
00169         const MaskIterator& maskFirst, uInt nr, const DataRanges& dataRanges,
00170         Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False,
00171         uInt maskStride=1
00172     );
00173     // </group>
00174 
00175     // get the algorithm that this object uses for computing stats
00176     virtual StatisticsData::ALGORITHM algorithm() const = 0;
00177 
00178     // delete any (partially) sorted array
00179     void deleteSortedArray();
00180 
00181     virtual AccumType getMedian(
00182         CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
00183         CountedPtr<AccumType> knownMax=NULL, uInt binningThreshholdSizeBytes=4096*4096,
00184         Bool persistSortedArray=False, uInt64 nBins=10000
00185     ) = 0;
00186 
00187     // The return value is the median; the quantiles are returned in the <src>quantileToValue</src> map.
00188     virtual AccumType getMedianAndQuantiles(
00189         std::map<Double, AccumType>& quantileToValue, const std::set<Double>& quantiles,
00190         CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
00191         CountedPtr<AccumType> knownMax=NULL,
00192         uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
00193         uInt64 nBins=10000
00194     ) = 0;
00195 
00196     // get the median of the absolute deviation about the median of the data.
00197     virtual AccumType getMedianAbsDevMed(
00198         CountedPtr<uInt64> knownNpts=NULL,
00199         CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
00200         uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
00201         uInt64 nBins=10000
00202     ) = 0;
00203 
00204     AccumType getQuantile(
00205         Double quantile, CountedPtr<uInt64> knownNpts=NULL,
00206         CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
00207         uInt binningThreshholdSizeBytes=4096*4096,
00208         Bool persistSortedArray=False, uInt64 nBins=10000
00209     );
00210 
00211     // get a map of quantiles to values.
00212     virtual std::map<Double, AccumType> getQuantiles(
00213         const std::set<Double>& quantiles, CountedPtr<uInt64> npts=NULL,
00214         CountedPtr<AccumType> min=NULL, CountedPtr<AccumType> max=NULL,
00215         uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
00216         uInt64 nBins=10000
00217     ) = 0;
00218 
00219     // get the value of the specified statistic
00220     virtual AccumType getStatistic(StatisticsData::STATS stat);
00221 
00222     // certain statistics such as max and min have locations in the dataset
00223     // associated with them. This method gets those locations. The first value
00224     // in the returned pair is the zero-based dataset number that was set or
00225     // added. The second value is the zero-based index in that dataset. A data stride
00226     // of greater than one is not accounted for, so the index represents the actual
00227     // location in the data set, independent of the dataStride value.
00228     virtual std::pair<Int64, Int64> getStatisticIndex(StatisticsData::STATS stat) = 0;
00229 
00230     virtual StatsData<AccumType> getStatistics();
00231 
00232     // <group>
00233     // setdata() clears any current datasets or data provider and then adds the specified data set as
00234     // the first dataset in the (possibly new) set of data sets for which statistics are
00235     // to be calculated. See addData() for parameter meanings.
00236     virtual void setData(const DataIterator& first, uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False);
00237 
00238     virtual void setData(
00239         const DataIterator& first, uInt nr,
00240         const DataRanges& dataRanges, Bool isInclude=True, uInt dataStride=1,
00241         Bool nrAccountsForStride=False
00242     );
00243 
00244     virtual void setData(
00245         const DataIterator& first, const MaskIterator& maskFirst,
00246         uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False,
00247         uInt maskStride=1
00248     );
00249 
00250     virtual void setData(
00251         const DataIterator& first, const MaskIterator& maskFirst,
00252         uInt nr, const DataRanges& dataRanges,
00253         Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False,
00254         uInt maskStride=1
00255     );
00256 
00257     virtual void setData(
00258         const DataIterator& first, const WeightsIterator& weightFirst,
00259         uInt nr, uInt dataStride=1,
00260         Bool nrAccountsForStride=False
00261     );
00262 
00263     virtual void setData(
00264         const DataIterator& first, const WeightsIterator& weightFirst,
00265         uInt nr, const DataRanges& dataRanges,
00266         Bool isInclude=True, uInt dataStride=1,
00267         Bool nrAccountsForStride=False
00268     );
00269 
00270     virtual void setData(
00271         const DataIterator& first, const WeightsIterator& weightFirst,
00272         const MaskIterator& maskFirst, uInt nr, uInt dataStride=1,
00273         Bool nrAccountsForStride=False,
00274         uInt maskStride=1
00275     );
00276 
00277     virtual void setData(
00278         const DataIterator& first, const WeightsIterator& weightFirst,
00279         const MaskIterator& maskFirst, uInt nr, const DataRanges& dataRanges,
00280         Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False,
00281         uInt maskStride=1
00282     );
00283     // </group>
00284 
00285     // instead of settng and adding data "by hand", set the data provider that will provide
00286     // all the data sets. Calling this method will clear any other data sets that have
00287     // previously been set or added.
00288     virtual void setDataProvider(StatsDataProvider<CASA_STATP> *dataProvider) {
00289         ThrowIf(! dataProvider, "Logic Error: data provider cannot be NULL");
00290         _clearData();
00291         _dataProvider = dataProvider;
00292     }
00293 
00294     // Provide guidance to algorithms by specifying a priori which statistics the
00295     // caller would like calculated.
00296     virtual void setStatsToCalculate(std::set<StatisticsData::STATS>& stats);
00297 
00298 protected:
00299     StatisticsAlgorithm();
00300 
00301     // use copy semantics
00302     StatisticsAlgorithm<CASA_STATP>& operator=(
00303         const StatisticsAlgorithm<CASA_STATP>& other
00304     );
00305 
00306     // Allows derived classes to do things after data is set or added.
00307     // Default implementation does nothing.
00308     virtual void _addData() {}
00309 
00310     virtual void _clearData();
00311 
00312     const vector<Int64>& _getCounts() const { return _counts; }
00313 
00314     const vector<DataIterator>& _getData() const { return _data; }
00315 
00316     StatsDataProvider<CASA_STATP>* _getDataProvider() {
00317         return _dataProvider;
00318     }
00319 
00320     const vector<uInt>& _getDataStrides() const { return _dataStrides; }
00321 
00322     const std::map<uInt, Bool>& _getIsIncludeRanges() const { return _isIncludeRanges; }
00323 
00324     const std::map<uInt, MaskIterator> _getMasks() const { return _masks; }
00325 
00326     const std::map<uInt, uInt>& _getMaskStrides() const { return _maskStrides; }
00327 
00328     const std::map<uInt, DataRanges>& _getRanges() const { return _dataRanges; }
00329 
00330     virtual AccumType _getStatistic(StatisticsData::STATS stat) = 0;
00331 
00332     virtual StatsData<AccumType> _getStatistics() = 0;
00333 
00334     const std::set<StatisticsData::STATS> _getStatsToCalculate() const {
00335         return _statsToCalculate;
00336     }
00337 
00338     std::vector<AccumType>& _getSortedArray() { return _sortedArray; }
00339 
00340     virtual const std::set<StatisticsData::STATS>& _getUnsupportedStatistics() const {
00341         return _unsupportedStats;
00342     }
00343 
00344     const std::map<uInt, WeightsIterator>& _getWeights() const {
00345         return _weights;
00346     }
00347 
00348     /*
00349     // get the zero-based indices of the specified quantiles in sorted dataset with npts
00350     // number of good points. The returned map maps quantiles to indices.
00351     static std::map<Double, uInt64> _indicesFromQuantiles(
00352         uInt64 npts, const std::set<Double>& quantiles
00353     );
00354     */
00355 
00356     // The array can be changed by paritally sorting it up to the largest index. Return
00357     // a map of index to value in the sorted array.
00358     static std::map<uInt64, AccumType> _valuesFromArray(
00359         vector<AccumType>& myArray, const std::set<uInt64>& indices
00360     );
00361 
00362     void _setSortedArray(const vector<AccumType>& v) { _sortedArray = v; }
00363 
00364 private:
00365     vector<DataIterator> _data;
00366     // maps data to weights
00367     std::map<uInt, WeightsIterator> _weights;
00368     // maps data to masks
00369     std::map<uInt, MaskIterator> _masks;
00370     vector<Int64> _counts;
00371     vector<uInt> _dataStrides;
00372     std::map<uInt, uInt> _maskStrides;
00373     std::map<uInt, Bool> _isIncludeRanges;
00374     std::map<uInt, DataRanges> _dataRanges;
00375     vector<AccumType> _sortedArray;
00376     std::set<StatisticsData::STATS> _statsToCalculate, _unsupportedStats;
00377     StatsDataProvider<CASA_STATP> *_dataProvider;
00378 
00379     void _throwIfDataProviderDefined() const;
00380 };
00381 
00382 }
00383 
00384 #ifndef CASACORE_NO_AUTO_TEMPLATES
00385 #include <casacore/scimath/Mathematics/StatisticsAlgorithm.tcc>
00386 #endif
00387 
00388 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

Generated on 31 Aug 2016 for casa by  doxygen 1.6.1