############################################################################### ## ## ## ALEXANDRIA DIGITAL LIBRARY ## ## University of California at Santa Barbara ## ## ## ## ------------------------------------------------------------------------- ## ## ## ## Copyright (c) 2003 by the Regents of the University of California ## ## All rights reserved ## ## ## ## Redistribution and use in source and binary forms, with or without ## ## modification, are permitted provided that the following conditions are ## ## met: ## ## ## ## 1. Redistributions of source code must retain the above copyright ## ## notice, this list of conditions, and the following disclaimer. ## ## ## ## 2. Redistributions in binary form must reproduce the above copyright ## ## notice, this list of conditions, and the following disclaimer in ## ## the documentation and/or other materials provided with the ## ## distribution. ## ## ## ## 3. All advertising materials mentioning features or use of this ## ## software must display the following acknowledgement: This product ## ## includes software developed by the Alexandria Digital Library, ## ## University of California at Santa Barbara, and its contributors. ## ## ## ## 4. Neither the name of the University nor the names of its ## ## contributors may be used to endorse or promote products derived ## ## from this software without specific prior written permission. ## ## ## ## THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS" AND ANY ## ## EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ## ## WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE ## ## DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ## ## ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ## ## DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ## ## OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ## ## HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, ## ## STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ## ## ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ## ## POSSIBILITY OF SUCH DAMAGE. ## ## ## ############################################################################### # $Header: /export/home/gjanee/bucket99/paradigms/RCS/Textual_LikeDelimitedSubstring.py,v 1.4 2003/10/21 20:34:37 gjanee Exp $ # SYNOPSIS # # Textual_LikeDelimitedSubstring (table, idColumn, textColumn, # delimiter, cardinality, # mapping=TextUtils.mappings.uppercaseAlphanumericOthersToWhitespace, # deleteList=TextUtils.deleteLists.keepAll, function=None) # # table # A table to query, e.g., "holding". # # idColumn # The table's object identifier column (i.e., the column # to be selected), e.g., "holding_id". # # textColumn # The table column containing the text to search over # (i.e., the column against which the constraint is to be # placed), e.g., "subject_text". # # delimiter # A single character that serves to delimit words in # 'textColumn', e.g., "^". # # cardinality # A Cardinality object representing the cardinality of # 'table' with respect to 'textColumn'. # # mapping # A Python character mapping table (i.e., a string of # length 256, indexed by ASCII character code) to process # constraint text with. Defaults to # 'uppercaseAlphanumericOthersToWhitespace', which maps # alphanumeric characters to their uppercase equivalents # and all other characters to whitespace (i.e., to word # separators). # # deleteList # A string of zero or more characters to delete from # constraint text. 
The default is the empty string, which # keeps all characters. # # function # A function to apply to 'textColumn' (e.g., "UPPER"), or # None. Defaults to None. # # DESCRIPTION # # Translates a textual constraint to a boolean combination of one # or more substring matches using SQL LIKE operators. # # This paradigm assumes that the text in 'textColumn' has been # encoded such that words are delimited by a common delimiter # character and phrases are separated by two or more delimiter # characters. For example, assuming the delimiter character is # "^", a column value containing the two phrases "I am Sam" and # "Sam I am" would be encoded as: # # ^I^am^Sam^^Sam^I^am^ # # Given a textual constraint (B, O, T) where B is a textual # bucket, O is one of the standard textual operators, and T is a # text string, this paradigm parses T into a sequence of one or # more words (W1, W2, W3, ...) by: 1) deleting from T any # characters that appear in 'deleteList'; 2) mapping the remaining # characters using 'mapping'; and 3) treating sequences of # whitespace characters as word separators. The paradigm then # returns one of the following queries (we use "^" here to # represent the delimiter character). If O is # "contains-all-words": # # SELECT idColumn FROM table # WHERE textColumn LIKE '%^W1^%' AND # textColumn LIKE '%^W2^%' AND # textColumn LIKE '%^W3^%' ... # # If O is "contains-any-words": # # SELECT idColumn FROM table # WHERE textColumn LIKE '%^W1^%' OR # textColumn LIKE '%^W2^%' OR # textColumn LIKE '%^W3^%' ... # # If O is "contains-phrase": # # SELECT idColumn FROM table # WHERE textColumn LIKE '%^W1^W2^W3^...^%' # # If a text column function is specified (e.g., "UPPER"), the # returned query will have the form: # # SELECT idColumn FROM table # WHERE UPPER(textColumn) LIKE ... # # Under certain circumstances the query # # SELECT idColumn FROM table # WHERE 1 = 0 # # may be returned. 
#
#     The semantics of the "contains-all-words" operator will
#     generally be correct only if the cardinality is "1" or "1?". If
#     the cardinality is "0+" or "1+", wrap this paradigm in an
#     Adaptor_IndivisibleConcatenation paradigm.
#
#     Exceptions thrown:
#
#         no query words specified
#
# AUTHOR
#
#     Greg Janee
#     gjanee@alexandria.ucsb.edu
#
# HISTORY
#
# $Log: Textual_LikeDelimitedSubstring.py,v $
# Revision 1.4 2003/10/21 20:34:37 gjanee
# Minor (but critical) documentation change.
#
# Revision 1.3 2003/01/29 21:13:50 gjanee
# Recoded slightly to take advantage of new paradigm convenience
# functions.
#
# Revision 1.2 2003/01/24 04:14:12 gjanee
# Minor update to conform to the "transparent immutable objects"
# programming model. Fixed an obscure bug.
#
# Revision 1.1 2002/10/31 22:32:13 gjanee
# Initial revision
#

import string
import types

import edu.ucsb.adl.middleware
M = edu.ucsb.adl.middleware
import UniversalTranslator
UT = UniversalTranslator
import paradigms
P = paradigms

class Textual_LikeDelimitedSubstring (UT.Paradigm):
    """Translate a textual constraint into SQL LIKE substring matches.

    The text column is expected to hold words separated by a single
    delimiter character (see the DESCRIPTION section of the file
    header for the encoding and the shape of the generated queries).
    """

    def __init__ (self, table, idColumn, textColumn, delimiter, cardinality,
                  mapping=P.TextUtils.mappings.uppercaseAlphanumericOthersToWhitespace,
                  deleteList=P.TextUtils.deleteLists.keepAll,
                  function=None):
        """Validate and record the paradigm's configuration.

        table       -- name of the table to query
        idColumn    -- column to be selected
        textColumn  -- column the LIKE constraint is placed on
        delimiter   -- single character delimiting words in textColumn
        cardinality -- UT.Cardinality of table with respect to textColumn
        mapping     -- 256-character translation table applied to the
                       constraint text
        deleteList  -- characters deleted from the constraint text
        function    -- name of a function to wrap textColumn in, or None

        Argument checks are performed in declaration order; the length
        checks are plain assertions.
        """
        # The four leading arguments are all required to be strings.
        for value in (table, idColumn, textColumn, delimiter):
            UT.assertType(value, types.StringType)
        delimiterLength = len(delimiter)
        assert delimiterLength == 1, "delimiter has length " +\
            str(delimiterLength) + ", should be 1"
        UT.assertType(cardinality, UT.Cardinality)
        UT.assertType(mapping, types.StringType)
        mappingLength = len(mapping)
        assert mappingLength == 256, "character mapping table has length " +\
            str(mappingLength) + ", should be 256"
        UT.assertType(deleteList, types.StringType)
        UT.assertPolytype(function, [types.StringType, types.NoneType])

        self.table = table
        self.idColumn = idColumn
        self.textColumn = textColumn
        self.delimiter = delimiter
        self.cardinality = cardinality
        self.mapping = mapping
        self.deleteList = deleteList
        self.function = function

    def _likeExpression (self, prefix, pattern):
        """Return the expression 'prefix LIKE pattern'.

        'pattern' is an already-quoted SQL literal.  An ESCAPE clause
        is appended whenever the pattern contains a backslash.
        """
        if string.find(pattern, "\\") >= 0:
            pattern = pattern + " ESCAPE '\\'"
        return UT.Expression(prefix + [" LIKE " + pattern])

    def translateBucketAtomic (self, constraint, vocabularies):
        """Translate one textual constraint into a UT.Select query.

        The constraint text is run through the configured delete list
        and character mapping, then split on whitespace into words.
        Raises UT.QueryError if that leaves no words at all.  Returns
        the constant-false query when words containing the delimiter
        character make the constraint unsatisfiable.
        """
        UT.assertType(constraint, M.Query.TextualConstraint)
        operator = constraint.getOperator()
        assert operator in UT.standardTextualOperators,\
            "unsupported operator: " + operator

        cleaned = string.translate(constraint.getText(), self.mapping,
                                   self.deleteList)
        queryWords = string.split(cleaned)
        if not queryWords:
            raise UT.QueryError("no query words specified in constraint " +
                                "on bucket '" + constraint.getBucket() + "'")

        table = UT.TableRef(self.table)

        # A word that itself contains the delimiter is dropped here;
        # how many words were dropped decides below whether the query
        # can match anything at all.
        usableWords = [w for w in queryWords
                       if string.find(w, self.delimiter) < 0]
        if operator == "contains-any-words":
            # Unsatisfiable only if every word had to be dropped.
            unsatisfiable = not usableWords
        elif operator in ("contains-all-words", "contains-phrase"):
            # Unsatisfiable as soon as a single word had to be dropped.
            unsatisfiable = len(usableWords) < len(queryWords)
        else:
            unsatisfiable = 0
        if unsatisfiable:
            return UT.constantFalseQuery(self.table, self.idColumn,
                                         self.cardinality)

        # Left-hand side of every LIKE: the (optionally wrapped) column.
        column = "." + self.textColumn
        if self.function is None:
            prefix = [table, column]
        else:
            prefix = [self.function + "(", table, column + ")"]

        # NOTE(review): words are spliced into a quoted SQL literal; the
        # only escaping visible here is UT.protectLikeWildcards -- confirm
        # that a custom 'mapping' cannot let a single quote through.
        delimiter = UT.protectLikeWildcards(self.delimiter)
        if operator == "contains-phrase":
            # One pattern holding all words in order: '%^W1^W2^...^%'
            pieces = [UT.protectLikeWildcards(w) + delimiter
                      for w in usableWords]
            pattern = "'%" + delimiter + string.join(pieces, "") + "%'"
            expression = self._likeExpression(prefix, pattern)
        else:
            # One '%^W^%' pattern per word, combined with AND or OR.
            connectives = {"contains-all-words": "AND",
                           "contains-any-words": "OR"}
            connective = connectives.get(operator)
            expression = UT.Expression([])
            for w in usableWords:
                pattern = "'%" + delimiter + UT.protectLikeWildcards(w) +\
                    delimiter + "%'"
                term = self._likeExpression(prefix, pattern)
                if connective is None:
                    UT.unhandledCase()
                else:
                    expression = expression.combine(connective, term)

        return UT.Select(
            [UT.MainFrom(table, self.idColumn, self.cardinality)],
            expression)