############################################################################### ## ## ## ALEXANDRIA DIGITAL LIBRARY ## ## University of California at Santa Barbara ## ## ## ## ------------------------------------------------------------------------- ## ## ## ## Copyright (c) 2008 by the Regents of the University of California ## ## All rights reserved ## ## ## ## Redistribution and use in source and binary forms, with or without ## ## modification, are permitted provided that the following conditions are ## ## met: ## ## ## ## 1. Redistributions of source code must retain the above copyright ## ## notice, this list of conditions, and the following disclaimer. ## ## ## ## 2. Redistributions in binary form must reproduce the above copyright ## ## notice, this list of conditions, and the following disclaimer in ## ## the documentation and/or other materials provided with the ## ## distribution. ## ## ## ## 3. All advertising materials mentioning features or use of this ## ## software must display the following acknowledgement: This product ## ## includes software developed by the Alexandria Digital Library, ## ## University of California at Santa Barbara, and its contributors. ## ## ## ## THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS" AND ANY ## ## EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ## ## WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE ## ## DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ## ## ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ## ## DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ## ## OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ## ## HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, ## ## STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ## ## ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ## ## POSSIBILITY OF SUCH DAMAGE. ## ## ## ############################################################################### # $Header: /export/home/gjanee/bucket99/paradigms/RCS/Textual_OracleText.py,v 1.1 2008/05/19 20:11:57 gjanee Exp $ # SYNOPSIS # # Textual_OracleText (table, idColumn, textColumn, indexType, # cardinality, # mapping=TextUtils.mappings.nonAlphanumericToWhitespace, # deleteList=TextUtils.deleteLists.keepAll) # # table # A table to query, e.g., "holding". # # idColumn # The table's identifier column (i.e., the column to be # selected), e.g., "holding_id". # # textColumn # The table column to query, e.g., "subject_text". # # indexType # The type of index on 'textColumn'. Must be either # "CONTEXT" or "CTXCAT". # # cardinality # A Cardinality object representing the cardinality of # 'table' with respect to 'textColumn'. # # mapping # A Python character mapping table (i.e., a string of # length 256, indexed by ASCII character code) to process # constraint text with. Defaults to # 'nonAlphanumericToWhitespace', which maps # non-alphanumeric characters to whitespace (i.e., to word # separators). # # deleteList # A string of zero or more characters to delete from # constraint text. The default is the empty string, which # keeps all characters. 
# # DESCRIPTION # # Translates a textual constraint to an Oracle Text constraint of # the form (for a CONTEXT index) # # SELECT idColumn FROM table # WHERE CONTAINS(textColumn, 'expression', 1) > 0 # # or (for a CTXCAT index) # # SELECT idColumn FROM table # WHERE CATSEARCH(textColumn, 'expression', NULL) > 0 # # where 'expression' is the constraint text processed as follows: # 1) any characters that appear in 'deleteList' are deleted; 2) # the remaining characters are mapped using 'mapping'; 3) words # within the resulting constraint text are extracted using maximal # sequences of whitespace characters as word separators; 4) words # are escaped appropriately and recombined according to the index # type and constraint operator. For example, for index type # CONTEXT, constraint # # # ... # contains-all-words # Elmer J. Fudd # # # results in expression # # '{Elmer} & {J} & {Fudd}' # # For more information, see: Oracle Text Application Developer's # Guide, version 10g Release 2 (June 2005), # , and Oracle Text Reference # . # # This paradigm assumes that the text processing specified by # 'mapping' and 'deleteList' is compatible with Oracle's notion of # words (specifically, with the lexer used by the Oracle text # index), which it is, by default. # # The semantics of the "contains-all-words" operator will # generally be correct only if the cardinality is "1" or "1?". If # the cardinality is "0+" or "1+", wrap this paradigm in an # Adaptor_IndivisibleConcatenation paradigm. 
#
#     Exceptions thrown:
#
#         no query words specified
#
# AUTHOR
#
#     Greg Janee
#     gjanee@alexandria.ucsb.edu
#
# HISTORY
#
#     $Log: Textual_OracleText.py,v $
#     Revision 1.1  2008/05/19 20:11:57  gjanee
#     Initial revision
#

import string
import types

import edu.ucsb.adl.middleware
M = edu.ucsb.adl.middleware
import UniversalTranslator
UT = UniversalTranslator
import paradigms
P = paradigms

# Word separators used to recombine the escaped query words, keyed
# first by Oracle Text index type and then by textual operator.
_separator = {
  "CONTEXT": {
    "contains-any-words": " | ",
    "contains-all-words": " & ",
    "contains-phrase": " "
  },
  "CTXCAT": {
    "contains-any-words": " | ",
    "contains-all-words": " ",
    "contains-phrase": " "
  }
}

def _escapeWord (word):
  """Return 'word' wrapped in braces, with any "}" inside it doubled."""
  return "{%s}" % word.replace("}", "}}")

class Textual_OracleText (UT.Paradigm):
  """Paradigm that turns a textual constraint into a CONTAINS (CONTEXT
  index) or CATSEARCH (CTXCAT index) condition on a single text
  column."""

  def __init__ (self, table, idColumn, textColumn, indexType, cardinality,
                mapping=P.TextUtils.mappings.nonAlphanumericToWhitespace,
                deleteList=P.TextUtils.deleteLists.keepAll):
    """Check the type of every argument and store it on the instance.

    'indexType' must be "CONTEXT" or "CTXCAT" and 'mapping' must be a
    string of length 256; both are enforced with assertions.
    """
    # The three names are handled identically: type-check, then store.
    for attribute, value in [("table", table), ("idColumn", idColumn),
                             ("textColumn", textColumn)]:
      UT.assertType(value, types.StringType)
      setattr(self, attribute, value)
    UT.assertType(indexType, types.StringType)
    assert indexType in ("CONTEXT", "CTXCAT"),\
      "unrecognized Oracle Text index type: " + indexType
    self.indexType = indexType
    UT.assertType(cardinality, UT.Cardinality)
    self.cardinality = cardinality
    UT.assertType(mapping, types.StringType)
    assert len(mapping) == 256,\
      "character mapping table has length %d, should be 256" % len(mapping)
    self.mapping = mapping
    UT.assertType(deleteList, types.StringType)
    self.deleteList = deleteList

  def translateBucketAtomic (self, constraint, vocabularies):
    """Translate a textual constraint into a UT.Select over 'table'.

    The constraint text is passed through string.translate with the
    configured mapping and delete list, split on whitespace, and each
    word is escaped and joined with the separator for the index type
    and operator.  'vocabularies' is not used by this method.

    Raises UT.QueryError if no words remain after text processing.
    """
    UT.assertType(constraint, M.Query.TextualConstraint)
    operator = constraint.getOperator()
    assert operator in UT.standardTextualOperators,\
      "unsupported operator: " + operator
    # string.translate (rather than the str method) is kept on purpose:
    # with an empty delete list it also accepts unicode text.
    processed = string.translate(constraint.getText(), self.mapping,
                                 self.deleteList)
    words = processed.split()
    if not words:
      raise UT.QueryError("no query words specified in constraint on " +
                          "bucket '" + constraint.getBucket() + "'")
    joiner = _separator[self.indexType][operator]
    expression = joiner.join([_escapeWord(w) for w in words])
    if self.indexType == "CTXCAT" and operator == "contains-phrase":
      # A CTXCAT phrase query is enclosed in double quotes.
      expression = '"' + expression + '"'
    # Double single quotes so the expression is a valid SQL string literal.
    literal = "'" + expression.replace("'", "''") + "'"
    if self.indexType == "CONTEXT":
      function, lastArgument = "CONTAINS", "1"
    else:
      function, lastArgument = "CATSEARCH", "NULL"
    table = UT.TableRef(self.table)
    condition = UT.Expression([
      function + "(", table,
      "." + self.textColumn + ", " + literal + ", " + lastArgument + ") > 0"])
    return UT.Select([UT.MainFrom(table, self.idColumn, self.cardinality)],
                     condition)