00001 //# StringDistance.h: Class to deal with Levensthein distance of strings 00002 //# Copyright (C) 2009 00003 //# Associated Universities, Inc. Washington DC, USA. 00004 //# 00005 //# This library is free software; you can redistribute it and/or modify it 00006 //# under the terms of the GNU Library General Public License as published by 00007 //# the Free Software Foundation; either version 2 of the License, or (at your 00008 //# option) any later version. 00009 //# 00010 //# This library is distributed in the hope that it will be useful, but WITHOUT 00011 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 00012 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public 00013 //# License for more details. 00014 //# 00015 //# You should have received a copy of the GNU Library General Public License 00016 //# along with this library; if not, write to the Free Software Foundation, 00017 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA. 00018 //# 00019 //# Correspondence concerning AIPS++ should be addressed as follows: 00020 //# Internet email: aips2-request@nrao.edu. 00021 //# Postal address: AIPS++ Project Office 00022 //# National Radio Astronomy Observatory 00023 //# 520 Edgemont Road 00024 //# Charlottesville, VA 22903-2475 USA 00025 //# 00026 //# $Id$ 00027 00028 #ifndef CASA_STRINGDISTANCE_H 00029 #define CASA_STRINGDISTANCE_H 00030 00031 //# Includes 00032 #include <casacore/casa/aips.h> 00033 #include <casacore/casa/BasicSL/String.h> 00034 #include <casacore/casa/Arrays/Matrix.h> 00035 00036 namespace casacore { 00037 00038 // <summary> 00039 // Class to deal with Levensthein distance of strings. 00040 // </summary> 00041 00042 // <synopsis> 00043 // The Levenshtein Distance is a metric telling how similar strings are. 00044 // It is also known as the Edit Distance. 00045 // 00046 // The distance tells how many operations (i.e., character substitutions, 00047 // insertions, and deletions are needed to transform one string into another. 00048 // <br>There are several extensions to the basic definition: 00049 // <ul> 00050 // <li> Treat a swap of adjacent characters as one operation. 00051 // <li> Give a weight to operations (e.g., insertions and deletions have 00052 // half the weight of the other operations). 00053 // <li> Take locality into account. For example, successive substitutions 00054 // weigh more than non-successive. 00055 // <li> Extend to wildcarded strings. 00056 // </ul> 00057 // This class optionally uses the swap extension. Furthermore one can 00058 // optionally ignore blanks. By default both options are used. 00059 // 00060 // The code is based on code written by Anders Sewerin Johansen. 00061 // Calculating the distance is an expensive O(N^2) operation, thus 00062 // should be used with care. 00063 // 00064 // The class is constructed with the source string to compare against. 00065 // Thereafter its <code>match</code> or <code>distance</code> 00066 // function can be used for each target string. 00067 // </synopsis> 00068 00069 class StringDistance 00070 { 00071 public: 00072 // Default constructor sets maxDistance to 0. 00073 StringDistance(); 00074 00075 // Construct from the source string and maximum distance. 00076 // If the maximum distance is negative, it defaults to 1+strlength/3. 00077 // Note that maximum distance 0 means that the strings must match exactly. 00078 explicit StringDistance (const String& source, Int maxDistance=-1, 00079 Bool countSwaps=True, Bool ignoreBlanks=True, 00080 Bool caseInsensitive=False); 00081 00082 // Get data members. 00083 // <group> 00084 const string& source() const 00085 { return itsSource; } 00086 Int maxDistance() const 00087 { return itsMaxDistance; } 00088 const Matrix<Int>& matrix() const 00089 { return itsMatrix; } 00090 // </group> 00091 00092 // Test if the given target string is within the maximum distance. 00093 Bool match (const String& target) const; 00094 00095 // Calculate the distance from the string to the string given in the constructor. 00096 // If the length of target exceeds source length + maxDistance, 00097 // the difference in lengths is returned. 00098 Int distance (const String& target) const; 00099 00100 // Calculate the distance between the two strings. 00101 // This is slower than the <src>distance</src> member function, because 00102 // it has to allocate the underlying Matrix for each invocation. 00103 static Int distance (const String& source, const String& target, 00104 Bool countSwaps=True); 00105 00106 // Remove blanks from the given string. 00107 static String removeBlanks (const String& source); 00108 00109 private: 00110 // Calculate the distance. 00111 static Int doDistance (const String& source, const String& target, 00112 Bool countSwaps, Matrix<Int>& matrix); 00113 00114 00115 private: 00116 String itsSource; 00117 mutable Matrix<Int> itsMatrix; 00118 Int itsMaxDistance; 00119 Bool itsCountSwaps; 00120 Bool itsIgnoreBlanks; 00121 Bool itsCaseInsensitive; 00122 }; 00123 00124 } //# end namespace 00125 00126 #endif