Software Development: March 2009

Here's a class that was passed to me for escaping keywords and passing user input safely to Oracle's text indexing functions, in particular the CONTAINS operator.

// from oracle web site.
package foo;

import java.util.Vector;

/**
 * Escapes search phrases so they can be safely passed to Oracle's SQL
 * CONTAINS operator.
 * 
 * WARNING: it appears that this class is not thread safe and that
 * it maintains state between calls and thus a new object should be created
 * between each use rather than reusing the same object.
 */
/**
 * Translates a free-form search phrase into a query string for Oracle
 * Text's CONTAINS operator.
 *
 * <p>Input syntax: words are separated by single spaces; a double-quoted
 * run is kept together as one word; a leading {@code +} marks a word as
 * required (the default), a leading {@code -} marks it as excluded, and
 * {@code *} is a wildcard (translated to {@code %}).
 *
 * <p>The parsed words are held in instance fields that are replaced at the
 * start of every {@link #processString(String)} call, so an instance may be
 * reused for successive translations. The fields are shared, unsynchronised
 * state between {@code processString} and {@code getQuery}, so a single
 * instance must not be used by several threads at the same time.
 */
public class QueryTranslator {

  /** Words that must be present, in input order. */
  private Vector<String> reqWords = new Vector<String>();

  /** Words that must not be present, in input order. */
  private Vector<String> notWords = new Vector<String>();

  /**
   * Parses {@code input} and returns the corresponding CONTAINS query.
   *
   * @param input the user's search phrase
   * @return the translated query
   * @throws IllegalArgumentException if {@code input} is {@code null},
   *         contains a brace, has no required words, or yields a query
   *         containing an empty group or an escaped closing brace
   */
  public String translate(String input) {
    if (input == null) {
      throw new IllegalArgumentException("input must not be null");
    }
    if (input.indexOf('{') > -1 || input.indexOf('}') > -1) {
      throw new IllegalArgumentException("'}' and '{' should not appear in input");
    }
    processString(input);
    if (this.reqWords.isEmpty()) {
      throw new IllegalArgumentException("no 'required' words in query: " + input);
    }
    String translatedQuery = getQuery();
    // "()" / "{}" mean an empty group or term; "\}" would escape the brace
    // that is supposed to close a term.
    if (translatedQuery.indexOf("()") > -1
        || translatedQuery.indexOf("{}") > -1
        || translatedQuery.indexOf("\\}") > -1) {
      throw new IllegalArgumentException("can't construct a valid oracle text query from: " + input);
    }
    return translatedQuery;
  }

  /** Files {@code word} under the required or the excluded words. */
  private void addWord(final String word, final boolean isRequired) {
    if (isRequired) {
      this.reqWords.add(word);
    } else {
      this.notWords.add(word);
    }
  }

  /**
   * Splits {@code input} into words and stores them as required or excluded
   * words, discarding whatever a previous call stored.
   *
   * @param input the search phrase; must not be {@code null}
   * @throws NullPointerException if {@code input} is {@code null}
   */
  public void processString(final String input) {
    this.reqWords = new Vector<String>();
    this.notWords = new Vector<String>();

    final int length = input.length();
    int p = 0;
    while (p < length) {
      final int startWord = p;
      p = findWordEnd(input, startWord);
      String theWord = input.substring(startWord, p);

      // Consecutive spaces produce zero-length words; ignore them.
      if (theWord.length() > 0) {
        // Words are required unless explicitly excluded (AND logic).
        boolean isRequired = true;

        // A lone "+" or "-" is kept as a literal word, hence the length test.
        if (theWord.length() > 1) {
          final char flag = theWord.charAt(0);
          if (flag == '+') {
            theWord = theWord.substring(1);
          } else if (flag == '-') {
            isRequired = false;
            theWord = theWord.substring(1);
          }
        }

        // Replace * wild cards with %
        theWord = theWord.replace('*', '%');

        // A bare wildcard carries no search term; drop it.
        if (!"%".equals(theWord)) {
          addWord(theWord, isRequired);
        }
      }
      p++; // step over the separating space
    }
  }

  /**
   * Returns the index just past the word starting at {@code start}: the
   * position of the next space that is not inside a double-quoted run, or
   * the end of the input. An unterminated quote extends to the end.
   */
  private static int findWordEnd(final String input, final int start) {
    final int length = input.length();
    int p = start;
    while (p < length && input.charAt(p) != ' ') {
      if (input.charAt(p) == '"') {
        p++; // skip the opening quote
        while (p < length && input.charAt(p) != '"') {
          p++;
        }
        if (p < length) {
          p++; // skip the closing quote if found
        }
      } else {
        p++;
      }
    }
    return p;
  }

  /**
   * Renders the word at {@code pos} as a query term.
   *
   * <p>A word containing a wildcard is stripped of every character that is
   * neither a word character nor {@code %} and returned bare (the wildcard
   * does not work inside braces); if only {@code %} is left, the empty
   * string is returned. Any other word loses one trailing backslash, if
   * present, and is wrapped as <code>${word}</code> so that reserved words
   * are not interpreted.
   *
   * @return the term, or {@code ""} if the word yields nothing searchable
   */
  private String getWord(final Vector<String> words, final int pos) {
    String word = words.elementAt(pos);
    if (word.indexOf('%') > -1) {
      word = word.replaceAll("[\\W&&[^%]]", "");
      if ("%".equals(word)) {
        return "";
      }
      return word;
    }
    // A trailing backslash would escape the closing brace added below.
    if (word.endsWith("\\")) {
      word = word.substring(0, word.length() - 1);
    }
    return "${" + word + '}';
  }

  /**
   * Builds the query from the words stored by the last
   * {@link #processString(String)} call, in the form
   *
   * <pre>
   * ((req1 &amp; req2 &amp; ... reqN))  NOT not1 NOT not2 ... NOT notN
   * </pre>
   *
   * Words for which {@code getWord} yields an empty term are left out, so
   * no operator is emitted without an operand.
   *
   * @return the query text; the required-words group is omitted when there
   *         are no required words
   */
  public String getQuery() {
    final StringBuilder sb = new StringBuilder();

    if (!this.reqWords.isEmpty()) {
      sb.append("((");
      String boolOp = "";
      for (int i = 0; i < this.reqWords.size(); i++) {
        final String term = getWord(this.reqWords, i);
        if (term.length() == 0) {
          continue;
        }
        sb.append(boolOp).append(term);
        boolOp = " & ";
      }
      sb.append(")) ");
    }

    for (int i = 0; i < this.notWords.size(); i++) {
      final String term = getWord(this.notWords, i);
      if (term.length() == 0) {
        continue;
      }
      sb.append(" NOT ").append(term);
    }
    return sb.toString();
  }

  /**
   * Command-line entry point: translates the single argument and prints the
   * original and translated phrases. Prints usage and exits with status 1
   * when the argument count is not exactly one.
   */
  public static void main(String args[]) {

    if (args.length != 1) {
      System.out.println("java " + QueryTranslator.class.getName()
          + " search_phrase");
      System.exit(1);
    }

    System.out.println("Original Phrase:   " + args[0]);
    System.out.println("Translated Phrase: "
        + new QueryTranslator().translate(args[0]));
  }
}
Software Development

About Me

Tuesday, March 31, 2009

Oracle Text Indexing - escaping user input

Friday, March 27, 2009

Aggregation of Social Content

RequestDispatcher

Blog Archive