StringDistance.h

Go to the documentation of this file.
00001 //# StringDistance.h: Class to deal with Levensthein distance of strings
00002 //# Copyright (C) 2009
00003 //# Associated Universities, Inc. Washington DC, USA.
00004 //#
00005 //# This library is free software; you can redistribute it and/or modify it
00006 //# under the terms of the GNU Library General Public License as published by
00007 //# the Free Software Foundation; either version 2 of the License, or (at your
00008 //# option) any later version.
00009 //#
00010 //# This library is distributed in the hope that it will be useful, but WITHOUT
00011 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
00012 //# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
00013 //# License for more details.
00014 //#
00015 //# You should have received a copy of the GNU Library General Public License
00016 //# along with this library; if not, write to the Free Software Foundation,
00017 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
00018 //#
00019 //# Correspondence concerning AIPS++ should be addressed as follows:
00020 //#        Internet email: aips2-request@nrao.edu.
00021 //#        Postal address: AIPS++ Project Office
00022 //#                        National Radio Astronomy Observatory
00023 //#                        520 Edgemont Road
00024 //#                        Charlottesville, VA 22903-2475 USA
00025 //#
00026 //# $Id$
00027 
00028 #ifndef CASA_STRINGDISTANCE_H
00029 #define CASA_STRINGDISTANCE_H
00030 
00031 //# Includes
00032 #include <casacore/casa/aips.h>
00033 #include <casacore/casa/BasicSL/String.h>
00034 #include <casacore/casa/Arrays/Matrix.h>
00035 
00036 namespace casacore {
00037 
00038 // <summary>
00039 // Class to deal with Levensthein distance of strings.
00040 // </summary>
00041 
00042 // <synopsis>
00043 // The Levenshtein Distance is a metric telling how similar strings are.
00044 // It is also known as the Edit Distance.
00045 //
00046 // The distance tells how many operations (i.e., character substitutions,
00047 // insertions, and deletions are needed to transform one string into another.
00048 // <br>There are several extensions to the basic definition:
00049 // <ul>
00050 //  <li> Treat a swap of adjacent characters as one operation.
00051 //  <li> Give a weight to operations (e.g., insertions and deletions have
00052 //       half the weight of the other operations).
00053 //  <li> Take locality into account. For example, successive substitutions
00054 //       weigh more than non-successive.
00055 //  <li> Extend to wildcarded strings.
00056 // </ul> 
00057 // This class optionally uses the swap extension. Furthermore one can
00058 // optionally ignore blanks. By default both options are used.
00059 //
00060 // The code is based on code written by Anders Sewerin Johansen.
00061 // Calculating the distance is an expensive O(N^2) operation, thus
00062 // should be used with care.
00063 //
00064 // The class is constructed with the source string to compare against.
00065 // Thereafter its <code>match</code> or <code>distance</code>
00066 // function can be used for each target string.
00067 // </synopsis>
00068 
00069 class StringDistance
00070 {
00071 public:
00072   // Default constructor sets maxDistance to 0.
00073   StringDistance();
00074 
00075   // Construct from the source string and maximum distance.
00076   // If the maximum distance is negative, it defaults to 1+strlength/3.
00077   // Note that maximum distance 0 means that the strings must match exactly.
00078   explicit StringDistance (const String& source, Int maxDistance=-1,
00079                            Bool countSwaps=True, Bool ignoreBlanks=True,
00080                            Bool caseInsensitive=False);
00081 
00082   // Get data members.
00083   // <group>
00084   const string& source() const
00085     { return itsSource; }
00086   Int maxDistance() const
00087     { return itsMaxDistance; }
00088   const Matrix<Int>& matrix() const
00089     { return itsMatrix; }
00090   // </group>
00091   
00092   // Test if the given target string is within the maximum distance.
00093   Bool match (const String& target) const;
00094 
00095   // Calculate the distance from the string to the string given in the constructor.
00096   // If the length of target exceeds source length + maxDistance,
00097   // the difference in lengths is returned.
00098   Int distance (const String& target) const;
00099 
00100   // Calculate the distance between the two strings.
00101   // This is slower than the <src>distance</src> member function, because
00102   // it has to allocate the underlying Matrix for each invocation.
00103   static Int distance (const String& source, const String& target,
00104                        Bool countSwaps=True);
00105 
00106   // Remove blanks from the given string.
00107   static String removeBlanks (const String& source);
00108 
00109 private:
00110   // Calculate the distance.
00111   static Int doDistance (const String& source, const String& target,
00112                          Bool countSwaps, Matrix<Int>& matrix);
00113 
00114 
00115 private:
00116   String              itsSource;
00117   mutable Matrix<Int> itsMatrix;
00118   Int                 itsMaxDistance;
00119   Bool                itsCountSwaps;
00120   Bool                itsIgnoreBlanks;
00121   Bool                itsCaseInsensitive;
00122 };
00123 
00124 } //# end namespace
00125 
00126 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

Generated on 31 Aug 2016 for casa by  doxygen 1.6.1