###############################################################################
##                                                                           ##
## ALEXANDRIA DIGITAL LIBRARY                                                ##
## University of California at Santa Barbara                                 ##
##                                                                           ##
## ------------------------------------------------------------------------- ##
##                                                                           ##
## Copyright (c) 2003 by the Regents of the University of California         ##
## All rights reserved                                                       ##
##                                                                           ##
## Redistribution and use in source and binary forms, with or without        ##
## modification, are permitted provided that the following conditions are    ##
## met:                                                                      ##
##                                                                           ##
## 1. Redistributions of source code must retain the above copyright         ##
##    notice, this list of conditions, and the following disclaimer.         ##
##                                                                           ##
## 2. Redistributions in binary form must reproduce the above copyright      ##
##    notice, this list of conditions, and the following disclaimer in       ##
##    the documentation and/or other materials provided with the             ##
##    distribution.                                                          ##
##                                                                           ##
## 3. All advertising materials mentioning features or use of this           ##
##    software must display the following acknowledgement: This product      ##
##    includes software developed by the Alexandria Digital Library,         ##
##    University of California at Santa Barbara, and its contributors.       ##
##                                                                           ##
## 4. Neither the name of the University nor the names of its                ##
##    contributors may be used to endorse or promote products derived        ##
##    from this software without specific prior written permission.          ##
##                                                                           ##
## THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS" AND ANY ##
## EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ##
## WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE   ##
## DISCLAIMED.                                                               ##
## IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR               ##
## ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL    ##
## DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS   ##
## OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)     ##
## HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,       ##
## STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN  ##
## ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           ##
## POSSIBILITY OF SUCH DAMAGE.                                               ##
##                                                                           ##
###############################################################################

# $Header: /export/home/gjanee/bucket99/paradigms/RCS/Textual_MySQLFulltext.py,v 1.1 2003/11/06 04:41:41 gjanee Exp $

# SYNOPSIS
#
#     Textual_MySQLFulltext (table, idColumn, textColumns, cardinality,
#         regexpPhraseFilter=1,
#         mapping=TextUtils.mappings.nonAlphanumericToWhitespace,
#         deleteList=TextUtils.deleteLists.keepAll)
#
#     table
#         A table to query, e.g., "holding".
#
#     idColumn
#         The table's identifier column (i.e., the column to be
#         selected), e.g., "holding_id".
#
#     textColumns
#         A single table column (e.g., "subject_text") or a list
#         of one or more table columns (e.g., ["subject_text",
#         "assigned_terms"]) containing the text to search over.
#
#     cardinality
#         A Cardinality object representing the cardinality of
#         'table' with respect to the column or columns listed in
#         'textColumns'.
#
#     regexpPhraseFilter
#         A boolean that indicates if "contains-phrase"
#         constraints are to be translated as "contains-all-words"
#         constraints conjoined with REGEXP conditions. Defaults
#         to true. See below.
#
#     mapping
#         A Python character mapping table (i.e., a string of
#         length 256, indexed by ASCII character code) to process
#         constraint text with. Defaults to
#         'nonAlphanumericToWhitespace', which maps
#         non-alphanumeric characters to whitespace (i.e., to word
#         separators).
#
#     deleteList
#         A string of zero or more characters to delete from
#         constraint text. The default is the empty string, which
#         keeps all characters.
#
# DESCRIPTION
#
#     Translates a textual constraint to a MySQL fulltext index
#     search. The returned query has the general form
#
#         SELECT idColumn FROM table
#             WHERE MATCH (textColumns, ...)
#             AGAINST ('expression' IN BOOLEAN MODE)
#
#     where 'expression' is a string expression whose form depends on
#     the constraint operator. In the following, let W1, W2, W3, ...,
#     Wn be the words formed from the constraint text T by 1) deleting
#     from T any characters that appear in 'deleteList'; 2) mapping
#     the remaining characters using 'mapping'; and 3) treating
#     sequences of whitespace characters as word separators. Then the
#     query expression is:
#
#         contains-any-words
#             W1 W2 W3 ... Wn
#
#         contains-all-words
#             +W1 +W2 +W3 ... +Wn
#
#         contains-phrase
#             "W1 W2 W3 ... Wn"
#
#     Note that MySQL's phrase matching (as of version 4.1.0alpha) is
#     essentially simple substring matching, and thus will have poor
#     recall performance unless the text in the table has been
#     appropriately processed beforehand (namely, adjacent words
#     within a phrase must be separated by exactly one space). But if
#     the 'regexpPhraseFilter' argument is true, and if the query
#     phrase contains more than one word, then the returned query has
#     the alternate form
#
#         SELECT idColumn FROM table
#             WHERE MATCH (textColumns, ...)
#             AGAINST ('+W1 +W2 +W3 ... +Wn' IN BOOLEAN MODE) AND
#             (textColumn1 REGEXP
#             '[[:<:]]W1[[:space:]]+W2[[:space:]]+...Wn[[:>:]]'
#             OR textColumn2 REGEXP
#             '[[:<:]]W1[[:space:]]+W2[[:space:]]+...Wn[[:>:]]'
#             OR ...)
#
#     I.e., the REGEXP filter more forgivingly allows adjacent words
#     within a phrase to be separated by one or more whitespace
#     characters.
#
#     The semantics of the "contains-all-words" operator will
#     generally be correct only if the cardinality is "1" or "1?".
#     If the cardinality is "0+" or "1+", wrap this paradigm in an
#     Adaptor_IndivisibleConcatenation paradigm.
#
#     This paradigm assumes that the text processing specified by
#     'mapping' and 'deleteList' is compatible with MySQL's notion of
#     words, which it is, by default.
#
#     Exceptions thrown:
#
#         no query words specified
#
# AUTHOR
#
#     Greg Janee
#     gjanee@alexandria.ucsb.edu
#
# HISTORY
#
#     $Log: Textual_MySQLFulltext.py,v $
#     Revision 1.1 2003/11/06 04:41:41 gjanee
#     Initial revision
#

import string
import types

import edu.ucsb.adl.middleware
M = edu.ucsb.adl.middleware
import UniversalTranslator
UT = UniversalTranslator
import paradigms
P = paradigms

# Characters that _protectRegexpSpecials prefixes with backslashes.
_REGEXP_SPECIALS = "^$.*+?|(){}[]\\"

def _formAnyExpression (wordList):
    """Return the words separated by single spaces ("W1 W2 ... Wn").

    'wordList' must be a list of strings.  The words are expected to
    be nonempty, as produced by whitespace splitting in
    Textual_MySQLFulltext._parseConstraintText.
    """
    UT.assertType(wordList, types.ListType)
    UT.assertListElementType(wordList, types.StringType)
    return " ".join(wordList)

def _formAllExpression (wordList):
    """Return the words, each prefixed with "+", separated by single
    spaces ("+W1 +W2 ... +Wn").

    'wordList' must be a list of strings.
    """
    UT.assertType(wordList, types.ListType)
    UT.assertListElementType(wordList, types.StringType)
    return " ".join(["+" + word for word in wordList])

def _formPhraseExpression (wordList):
    """Return the space-separated words enclosed in double quotes
    ('"W1 W2 ... Wn"').

    'wordList' must be a list of strings.
    """
    UT.assertType(wordList, types.ListType)
    UT.assertListElementType(wordList, types.StringType)
    return '"' + _formAnyExpression(wordList) + '"'

def _protectRegexpSpecials (word):
    """Return 'word' with every regular expression special character
    preceded by backslashes.

    A backslash is emitted as four backslashes; any other character in
    _REGEXP_SPECIALS is emitted as two backslashes followed by the
    character.  Presumably the doubling exists because the result is
    embedded in a quoted SQL string literal, which consumes one level
    of backslash escaping before the regular expression is evaluated
    -- TODO confirm against UT.protectQuotes and the MySQL server
    configuration in use.
    """
    UT.assertType(word, types.StringType)
    pieces = []
    for char in word:
        if char == "\\":
            pieces.append("\\\\\\")
        elif char in _REGEXP_SPECIALS:
            pieces.append("\\\\")
        pieces.append(char)
    return "".join(pieces)

def _formRegexp (wordList):
    """Return a regular expression matching the words as a phrase.

    The escaped words are joined with "[[:space:]]+" and the whole is
    wrapped in the word-boundary markers "[[:<:]]" and "[[:>:]]".
    'wordList' must be a list of (nonempty) strings.
    """
    UT.assertType(wordList, types.ListType)
    UT.assertListElementType(wordList, types.StringType)
    escaped = [_protectRegexpSpecials(word) for word in wordList]
    return "[[:<:]]" + "[[:space:]]+".join(escaped) + "[[:>:]]"

class Textual_MySQLFulltext (UT.Paradigm):
    """Paradigm that translates textual constraints into MySQL
    boolean-mode fulltext searches, i.e., queries whose condition is

        MATCH (columns) AGAINST ('expression' IN BOOLEAN MODE)

    optionally conjoined with REGEXP conditions that filter phrase
    matches.
    """

    def __init__ (self, table, idColumn, textColumns, cardinality,
        regexpPhraseFilter=1,
        mapping=P.TextUtils.mappings.nonAlphanumericToWhitespace,
        deleteList=P.TextUtils.deleteLists.keepAll):
        """Validate and record the paradigm's configuration.

        table              -- name of the table to query (string)
        idColumn           -- name of the column to select (string)
        textColumns        -- one column name, or a nonempty,
                              duplicate-free list of column names
        cardinality        -- a UT.Cardinality object
        regexpPhraseFilter -- integer flag; if true, phrases are
                              translated as all-words searches plus
                              REGEXP filters
        mapping            -- 256-character translation table applied
                              to constraint text
        deleteList         -- characters deleted from constraint text
        """
        UT.assertType(table, types.StringType)
        self.table = table
        UT.assertType(idColumn, types.StringType)
        self.idColumn = idColumn
        UT.assertPolytype(textColumns, [types.StringType, types.ListType])
        if isinstance(textColumns, types.StringType):
            self.textColumns = [textColumns]
        else:
            UT.assertListElementType(textColumns, types.StringType)
            UT.assertListNonempty(textColumns)
            UT.assertListNoDuplicates(textColumns)
            # Keep a private copy so that later changes to the
            # caller's list cannot invalidate the checks just made.
            self.textColumns = textColumns[:]
        UT.assertType(cardinality, UT.Cardinality)
        self.cardinality = cardinality
        UT.assertType(regexpPhraseFilter, types.IntType)
        self.regexpPhraseFilter = regexpPhraseFilter
        UT.assertType(mapping, types.StringType)
        assert len(mapping) == 256, "character mapping table has length " +\
            str(len(mapping)) + ", should be 256"
        self.mapping = mapping
        UT.assertType(deleteList, types.StringType)
        self.deleteList = deleteList

    def translateBucketAtomic (self, constraint, vocabularies):
        """Translate a single textual constraint.

        Returns the result of self._process().  A one-word constraint
        is always translated as a plain word, whatever its operator.
        'vocabularies' is not used.  Raises UT.QueryError if the
        constraint text yields no words.
        """
        UT.assertType(constraint, M.Query.TextualConstraint)
        operator = constraint.getOperator()
        assert operator in UT.standardTextualOperators,\
            "unsupported operator: " + operator
        wordList = self._parseConstraintText(constraint)
        regexpList = []
        if len(wordList) == 1 or operator == "contains-any-words":
            expression = _formAnyExpression(wordList)
        elif operator == "contains-all-words":
            expression = _formAllExpression(wordList)
        elif operator == "contains-phrase":
            if self.regexpPhraseFilter:
                # Require all the words, then let REGEXP check that
                # they are adjacent.
                expression = _formAllExpression(wordList)
                regexpList = [_formRegexp(wordList)]
            else:
                expression = _formPhraseExpression(wordList)
        else:
            UT.unhandledCase()
        return self._process(expression, regexpList)

    def translateBucketBoolean (self, operator, constraints, vocabularies):
        """Translate a boolean combination of textual constraints on
        one bucket into a single fulltext search.

        'operator' is "AND", "OR", or "AND NOT"; 'constraints' is a
        list of M.Query.TextualConstraint objects that share a bucket.
        Returns the result of self._process(), or None when the
        combination cannot be expressed as one search (presumably the
        caller then falls back to another strategy -- verify against
        the caller).  'vocabularies' is not used.  Raises
        UT.QueryError if any processed constraint yields no words.
        """
        UT.assertBooleanOperator(operator)
        UT.assertType(constraints, types.ListType)
        UT.assertListElementType(constraints, M.Query.TextualConstraint)
        UT.assertBooleanOperatorOperandConsistency(operator, constraints)
        UT.assertListElementCommonValue(constraints, lambda c: c.getBucket())
        UT.assertListElementPredicateAll(constraints,
            lambda c: c.getOperator() in UT.standardTextualOperators)
        if operator == "AND":
            return self._translateAnd(constraints)
        elif operator == "OR":
            return self._translateOr(constraints)
        elif operator == "AND NOT":
            return self._translateAndNot(constraints)
        else:
            UT.unhandledCase()

    def _translateAnd (self, constraints):
        """Translate a conjunction: every constraint becomes a
        required term of one boolean-mode expression."""
        terms = []
        regexpList = []
        for constraint in constraints:
            wordList = self._parseConstraintText(constraint)
            operator = constraint.getOperator()
            if operator == "contains-any-words":
                terms.append("+(" + _formAnyExpression(wordList) + ")")
            elif operator == "contains-all-words":
                terms.append(_formAllExpression(wordList))
            elif operator == "contains-phrase":
                if self.regexpPhraseFilter:
                    terms.append(_formAllExpression(wordList))
                    regexpList.append(_formRegexp(wordList))
                else:
                    terms.append("+" + _formPhraseExpression(wordList))
            else:
                UT.unhandledCase()
        return self._process(" ".join(terms), regexpList)

    def _translateOr (self, constraints):
        """Translate a disjunction: every constraint becomes an
        optional term of one boolean-mode expression.  Returns None if
        a phrase constraint is present and REGEXP phrase filtering is
        enabled."""
        terms = []
        for constraint in constraints:
            wordList = self._parseConstraintText(constraint)
            operator = constraint.getOperator()
            if operator == "contains-any-words":
                terms.append(_formAnyExpression(wordList))
            elif operator == "contains-all-words":
                terms.append("(" + _formAllExpression(wordList) + ")")
            elif operator == "contains-phrase":
                if self.regexpPhraseFilter:
                    # There's no way to add a conjunctive REGEXP
                    # condition that applies only when the primary
                    # condition succeeds because the phrase words
                    # are found, ergo:
                    return None
                terms.append(_formPhraseExpression(wordList))
            else:
                UT.unhandledCase()
        return self._process(" ".join(terms), [])

    def _translateAndNot (self, constraints):
        """Translate "first AND NOT second": the first constraint is
        translated as in the atomic multi-word case and the second is
        appended as an excluded group.  Returns None if the second
        constraint is a phrase."""
        regexpList = []
        wordList = self._parseConstraintText(constraints[0])
        operator = constraints[0].getOperator()
        if operator == "contains-any-words":
            expression = _formAnyExpression(wordList)
        elif operator == "contains-all-words":
            expression = _formAllExpression(wordList)
        elif operator == "contains-phrase":
            if self.regexpPhraseFilter:
                expression = _formAllExpression(wordList)
                regexpList.append(_formRegexp(wordList))
            else:
                expression = _formPhraseExpression(wordList)
        else:
            UT.unhandledCase()
        wordList = self._parseConstraintText(constraints[1])
        operator = constraints[1].getOperator()
        if operator == "contains-any-words":
            expression += " -(" + _formAnyExpression(wordList) + ")"
        elif operator == "contains-all-words":
            # Surprisingly, the following seems to work:
            expression += " -(" + _formAllExpression(wordList) + ")"
        elif operator == "contains-phrase":
            # In MySQL 4.1.0alpha, negation of phrases doesn't
            # work, ergo...
            return None
        else:
            UT.unhandledCase()
        return self._process(expression, regexpList)

    def _parseConstraintText (self, constraint):
        """Return the list of words in the constraint's text.

        Characters in self.deleteList are deleted, the remainder is
        mapped through self.mapping, and the result is split on
        whitespace.  Raises UT.QueryError if no words remain.
        """
        UT.assertType(constraint, M.Query.TextualConstraint)
        text = string.translate(constraint.getText(), self.mapping,
            self.deleteList)
        wordList = string.split(text)
        if len(wordList) == 0:
            # Call form of raise; equivalent to the two-expression
            # form provided UT.QueryError is an exception class.
            raise UT.QueryError("no query words specified in constraint " +\
                "on bucket '" + constraint.getBucket() + "'")
        return wordList

    def _process (self, expression, regexpList):
        """Build the query for a boolean-mode search expression.

        The condition is MATCH (columns) AGAINST ('expression' IN
        BOOLEAN MODE); for each regular expression in 'regexpList' it
        is further ANDed with the OR, over all text columns, of
        "column REGEXP 'regexp'".  Returns a UT.Select on the
        identifier column.
        """
        UT.assertType(expression, types.StringType)
        UT.assertType(regexpList, types.ListType)
        UT.assertListElementType(regexpList, types.StringType)
        table = UT.TableRef(self.table)
        clause = ["MATCH ("]
        for column in self.textColumns:
            if len(clause) > 1:
                clause.append(", ")
            clause += [table, "." + column]
        clause.append(") AGAINST ('" + UT.protectQuotes(expression) +\
            "' IN BOOLEAN MODE)")
        clause = UT.Expression(clause)
        for regexp in regexpList:
            # The quoted condition is the same for every column.
            condition = " REGEXP '" + UT.protectQuotes(regexp) + "'"
            regexpClause = UT.Expression([])
            for column in self.textColumns:
                regexpClause = regexpClause.combine("OR",
                    UT.Expression([table, "." + column + condition]))
            clause = clause.combine("AND", regexpClause)
        return UT.Select(
            [UT.MainFrom(table, self.idColumn, self.cardinality)],
            clause)