-- (C) Copyright International Business Machines Corporation 23 January 
-- 1990.  All Rights Reserved. 
--  
-- See the file USERAGREEMENT distributed with this software for full 
-- terms and conditions of use. 
-- File: tokenize.d
-- Author: Andy Lowry
-- SCCS Info: @(#)tokenize.d	1.2 3/13/90

-- Interfaces and other types for the tokenizer tool

tokenize: using (common)
definitions
  
  -- tokenizeInit is the call message used to create and initialize a
  -- tokenizer process.  The caller supplies four character classes
  -- that determine the way strings are tokenized; on normal return
  -- the message also carries a capability to the new tokenizer.
  --
  -- Typestates declared below and on tokenizeInitQ:
  --   on entry         : the four character-class fields are init
  --   on normal exit   : the whole message is full (tokenize is set)
  --   on badClasses    : only the four character-class fields are init
  --
  -- NOTE(review): the field types here are spelled "charstring" while
  -- the rest of this file spells the type "charString" -- confirm that
  -- both spellings resolve to the same type.
  tokenizeInit: callmessage (
    wordChars: charstring,	-- characters that can appear in word tokens
    whiteChars: charstring,	-- characters that are skipped between tokens
    quoteChars: charstring,	-- characters that can delimit quoted strings
    bracketChars: charstring,	-- character pairs that can delimit
				-- bracketed strings (must be of even
				-- length, since the characters are
				-- taken in pairs)
    tokenize: tokenizeFn	-- capability to the tokenizer (an outport
				-- of tokenizeQ, on which tokenize calls
				-- are made)
  )
  -- the character classes are listed as constant fields of the call
  constant (wordChars, whiteChars, quoteChars, bracketChars)
  -- normal return: every field, including tokenize, is initialized
  exit {full}
  -- badClasses return: tokenize is not listed, so only the four
  -- character-class fields are declared initialized
  exception badClasses {	-- something wrong with given char classes
    init(wordChars), init(whiteChars), init(quoteChars),init(bracketChars)};
  -- tokenizeInitQ is the input port type on which tokenizeInit calls
  -- are received.  Its typestate lists the fields that are initialized
  -- when a call arrives: the four character classes (the tokenize
  -- field is not listed).
  tokenizeInitQ: inport of tokenizeInit {init(wordChars),
    init(whiteChars), init(quoteChars), init(bracketChars)};
  -- tokenizeInitFn is the output port type matching tokenizeInitQ.
  tokenizeInitFn: outport of tokenizeInitQ;
  
  -- tokenize is the call message for the tokenizer itself: it carries
  -- a string in and a list of the tokens parsed from it back out.
  --
  -- Typestates declared below and on tokenizeQ:
  --   on entry         : string is init
  --   on normal exit   : the whole message is full (tokens is set)
  --   on IllFormed     : only string is init (no token list returned)
  tokenize: callmessage (
    string: charString,		-- chars to be tokenized
    tokens: tokenList		-- parsed tokens
  )
  -- the input string is listed as a constant field of the call
  constant (string)
  exit {full}
  -- NOTE(review): the conditions that raise IllFormed are not stated
  -- in this file; presumably input that cannot be tokenized (e.g. an
  -- unterminated quoted or bracketed string) -- confirm against the
  -- tokenizer implementation.
  exception IllFormed {init(string)};
  -- tokenizeQ is the input port type on which tokenize calls are
  -- received; on arrival only the string field is initialized.
  tokenizeQ: inport of tokenize {init(string)};
  -- tokenizeFn is the output port type matching tokenizeQ.  It is the
  -- type of the tokenize field returned in a tokenizeInit call.
  tokenizeFn: outport of tokenizeQ;
  
  -- tokenStrings is the call message for converting a list of tokens
  -- to a list of charstrings.  Each resulting string is the bare text
  -- represented by the corresponding token, with quote and bracket
  -- characters removed.
  --
  -- If you want the quote and/or bracket characters retained as
  -- separate characters, specify them as delimiter characters in the
  -- tokenizeInit call, i.e. leave them out of all of the character
  -- classes given there.
  --
  -- No exception is declared for this call: on entry tokens is init
  -- (see tokenStringsQ), and on exit the whole message is full.
  tokenStrings: callmessage (
    tokens: tokenList,		-- the parsed tokens
    strings: charStringList	-- their string values (charStringList
				-- is not defined in this file;
				-- presumably it comes from "common" --
				-- TODO confirm)
  )
  -- the input token list is listed as a constant field of the call
  constant (tokens)
  exit {full};
  -- tokenStringsQ is the input port type on which tokenStrings calls
  -- are received; on arrival only the tokens field is initialized.
  tokenStringsQ: inport of tokenStrings {init(tokens)};
  -- tokenStringsFn is the output port type matching tokenStringsQ.
  tokenStringsFn: outport of tokenStringsQ;
  
  -- tokenType enumerates the four varieties of token:
  --
  --   'word'            made up of chars in the wordChars class
  --   'delimiter'       a single character not in any of the
  --                     character classes
  --   'quotedString'    arbitrary text beginning and ending with the
  --                     same character from the quoteChars class; the
  --                     quote character can be embedded by doubling it
  --   'bracketedString' arbitrary text beginning with an opening
  --                     bracket and ending with the corresponding
  --                     closing bracket, and balanced within with
  --                     respect to brackets and quotes
  --
  -- These values are the case tags of the token variant.
  tokenType: enumeration (
    'word', 'delimiter', 'quotedString', 'bracketedString');
  -- token is a single parsed token: a variant discriminated by
  -- tokenType, with one component per case.  The typestate in braces
  -- after each component is the typestate declared for that component
  -- when its case is the active one.
  token: variant of tokenType (
    -- the text of the word
    'word' -> word: charString {init},
    -- the single delimiter character
    'delimiter' -> delimiter: char {init},
    -- quote character plus enclosed text (see quotedStringToken)
    'quotedString' -> quoted: quotedStringToken {full},
    -- bracket pair plus enclosed text (see bracketedStringToken)
    'bracketedString' -> bracketed: bracketedStringToken {full}
  );
  -- quotedStringToken is the payload of a 'quotedString' token: the
  -- quote character that delimited the string, and the text between
  -- the delimiters.
  -- NOTE(review): this file does not say whether a doubled (embedded)
  -- quote character appears in string as one character or two --
  -- confirm against the tokenizer implementation.
  quotedStringToken: record (
    quoteChar: char,		-- the delimiting quote character
    string: charString		-- without opening and closing quotes 
  );
  -- bracketedStringToken is the payload of a 'bracketedString' token:
  -- the pair of bracket characters that delimited the string, and the
  -- text between them.
  bracketedStringToken: record (
    openBracket: char,		-- the opening bracket character
    closeBracket: char,		-- the corresponding closing bracket
    string: charString		-- without opening and closing brackets
  );
  -- tokenList is an ordered table of tokens whose element typestate
  -- is full.  It is the type of the tokens field in both the tokenize
  -- and tokenStrings call messages.
  tokenList: ordered table of token {full};
  
end definitions

