import java.util.regex.Pattern;
import java.util.regex.Matcher;

import java.util.List;
import java.util.ArrayList;
import java.util.Set;
import java.util.HashSet;

import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
    
/**
 * A straightforward scanner for CS 212.  Uses regular expressions to define tokens.
 * <p>
 * Methods defineToken() and defineIgnored() are used to determine what the scanner 
 * considers to be a token and what should be ignored (usually you want to ignore 
 * whitespace and comments).  Method defineKeywords() lets you declare some tokens 
 * to be keywords.  
 * <p>
 * Each token is a particular 'kind' of token.  A kind is always a Java character.  
 * Two kinds are predefined: keyword tokens use token-kind 'K'; the EOF token is 
 * reported as null with token-kind '\0'.
 * <p>
 * There are methods to report (1) the current token [token()], (2) the current kind
 * [kind()], and (3) the current line number [lineNumber()].  The result returned by 
 * token() is a String, exactly as the token appears in the input file.  Line numbers
 * are zero-based: the first line of the source is line 0.
 * <p>
 * Looking at a token (via token()) does not change the current token.  Method advance() 
 * is used to pass over the current token and go on to the next one.  An initial call to 
 * advance() is needed to get to the first token.  Once the EOF token has been reached,
 * further calls to advance() keep reporting EOF.
 * <p>
 * The source for tokens is determined by setSource().  It can use either a Reader or
 * a String.  The String source is useful for debugging.  Setting a new source restarts
 * scanning (and line numbering) from the beginning of that source.
 * 
 * @author Paul Chew for CS212 (Sep 2007)
 */
public class Scanner212 {
    
    private final List<Pattern> tokens;      // Token patterns (order is significant)
    private final List<Character> kinds;     // Corresponding kinds (parallel to tokens)
    private final Set<String> keywords;      // Keyword tokens
    private String ignoringString;           // Pattern string of stuff to ignore
    private Pattern ignoring;                // Pattern of stuff to ignore
    private BufferedReader inFile;           // Input file (null until setSource is called)
    private int lineCount;                   // Current line number in file (zero-based)
    private String line;                     // Unscanned rest of current line (null after EOF)
    private String token;                    // Current token (as in input; null at EOF)
    private char kind;                       // Current kind (from defineToken)
    
    /**
     * Create a scanner with no token definitions, no ignored patterns, no keywords,
     * and no source.  Use the define methods and setSource() to configure it.
     */
    public Scanner212 () {
        tokens = new ArrayList<Pattern>();
        kinds = new ArrayList<Character>();
        keywords = new HashSet<String>();
        ignoringString = "";
        ignoring = Pattern.compile(ignoringString);
        resetPosition();
    }
    
    /**
     * Put the scanning position back to its initial state: no line read yet and
     * no current token.  Token, ignore, and keyword definitions are not affected.
     */
    private void resetPosition () {
        lineCount = -1;              // Incremented to 0 when the first line is read
        line = "";
        token = "Use scanner.advance()";
        kind = '\0';
    }
    
    /**
     * Specify the file to be used as token source.  Scanning starts over at the
     * beginning of the new source; any unscanned text of a previous source is dropped.
     * @param file the input file
     */
    public void setSource (Reader file) {
        inFile = new BufferedReader(file);
        resetPosition();
    }
    
    /**
     * Specify a String to use as token source.  Useful for testing and debugging.
     * Scanning starts over at the beginning of the new source; any unscanned text
     * of a previous source is dropped.
     * @param string the input string
     */
    public void setSource (String string) {
        inFile = new BufferedReader(new StringReader(string));
        resetPosition();
    }
    
    /**
     * Define a new pattern to be ignored.
     * Ignored patterns are typically those for white-space and comments.
     * If the pattern string is not a valid regular expression, the exception from
     * Pattern.compile propagates and the previously defined ignored patterns are 
     * left unchanged.
     * @param patternString a pattern string as defined in class Pattern.
     */
    public void defineIgnored (String patternString) {
        // Each pattern is wrapped in its own group so that '|' separates whole patterns
        String candidate = "(?:" + patternString + ")";
        if (ignoringString.length() != 0) {
            candidate = ignoringString + "|" + candidate;
        }
        // Compile before committing, so a bad pattern cannot corrupt the scanner
        Pattern compiled = Pattern.compile(candidate);
        ignoringString = candidate;
        ignoring = compiled;
    }
    
    /**
     * Define a new token (of specified kind) using the regular exp in pattern string.
     * Note that order is important.  For instance, if the pattern for an integer is entered
     * before the pattern for a floating point number then a number such as 15.3 will be
     * interpreted as "15" followed by ".3".  The kinds '\0' and 'K' are used for EOF
     * and keywords, respectively; using one of them here prints a warning on System.err
     * but the token is still defined.
     * @param patternString a pattern string as defined in class Pattern.
     * @param kind a character used to indicate this kind of token.
     */
    public void defineToken (String patternString, char kind) {
        if (kind == '\0' || kind == 'K') {
            System.err.println("Warning: Kinds '\\0' and 'K' are already in use.");
        }
        // Compile first so that tokens and kinds stay the same length on a bad pattern
        Pattern compiled = Pattern.compile(patternString);
        tokens.add(compiled);
        kinds.add(kind);
    }
    
    /**
     * Define keywords.  Keywords use the kind 'K'.
     * @param keywords an arbitrary number of keywords
     */
    public void defineKeywords (String... keywords) {
        for (String k : keywords) {
            this.keywords.add(k);
        }
    }
    
    /**
     * Current line number.  Line numbers are zero-based; the value is -1 until the 
     * first line of the source has been read.
     * @return the current line number
     */
    public int lineNumber () {
        return lineCount;
    }
    
    /**
     * Report the token's string exactly as it appears in the input.
     * @return the current token string (null at EOF)
     */
    public String token () {
        return token;
    }
    
    /**
     * Report the kind of the current token.
     * @return the kind of the current token ('\0' at EOF)
     */
    public char kind () {
        return kind;
    }
    
    /**
     * True iff the current token matches the input string.
     * At EOF there is no current token, so the result is false.
     * @param input the string to compare with the current token
     * @return true iff the current token matches the input string
     */
    public boolean matches (String input) {
        return token != null && token.equals(input);
    }
    
    /**
     * Integer value of the current token.
     * @return the integer value of the current token
     * @throws NumberFormatException if the token is not an integer, or at EOF
     */
    public int intValue () throws NumberFormatException {
        return Integer.parseInt(numericText());
    }
    
    /**
     * Float value of the current token.
     * @return the float value of the current token
     * @throws NumberFormatException if the token is not a float, or at EOF
     */
    public float floatValue () throws NumberFormatException {
        return Float.parseFloat(numericText());
    }
    
    /**
     * The current token, for use by the numeric conversions.
     * @return the current token (never null)
     * @throws NumberFormatException if there is no current token (EOF)
     */
    private String numericText () {
        // Float.parseFloat(null) would throw NullPointerException, not NumberFormatException
        if (token == null) {
            throw new NumberFormatException("No current token (end of input)");
        }
        return token;
    }
    
    /**
     * Advance by one token.  At the end of the source the token becomes null and the
     * kind becomes '\0'; calling advance() again after that keeps reporting EOF.
     * @throws IllegalStateException if no source has been set
     * @throws RuntimeException if the source cannot be read
     * @throws IllegalArgumentException if non-token in source (token def is bad)
     */
    public void advance () throws RuntimeException, IllegalArgumentException {
        if (inFile == null) {
            throw new IllegalStateException("No source set; call setSource() before advance()");
        }
        while (true) {
            // Get a new line of input, if necessary (line is null once EOF was reported)
            if (line == null || line.length() == 0) {
                try {
                    line = inFile.readLine();
                } catch (IOException e) {
                    throw new RuntimeException("Cannot read source file", e);
                }
                if (line == null) {      // EOF reached
                    token = null;
                    kind = '\0';
                    return;
                }
                lineCount++;
                continue;
            }
            // Check for stuff that is supposed to be ignored
            int index = find(ignoring, line);
            if (index > 0) {
                line = line.substring(index);
                continue;
            }
            // Check for a token; the first pattern with a non-empty match wins
            for (int i = 0; i < tokens.size(); i++) {
                index = find(tokens.get(i), line);
                if (index == 0) {
                    continue;
                }
                token = line.substring(0, index);
                line = line.substring(index);
                // A keyword overrides whatever kind the matching pattern has
                kind = keywords.contains(token) ? 'K' : kinds.get(i);
                return;
            }
            // Can only reach this part if the token definitions are bad
            throw new IllegalArgumentException("No token found: " + line + 
                                               "\nConsider adding '.' as a token");
        }
    }
    
    /**
     * Find the given pattern at the front of the text.
     * @param pattern the pattern to look for
     * @param text the text to look in
     * @return the length of the matching portion of the text (0 if no match)
     */
    public static int find (Pattern pattern, String text) {
        Matcher m = pattern.matcher(text);
        // lookingAt() anchors the match at the start of text but not at its end
        return m.lookingAt() ? m.end() : 0;
    }
    
    /**
     * Find the given pattern at the front of the text.
     * This version isn't used as part of the scanner, but it's handy to test
     * how regular expressions work.
     * @param pattern a string built according to the rules in java.util.regex.Pattern
     * @param text the text to look in
     * @return the length of the matching portion of the text (0 if no match)
     */
    public static int find (String pattern, String text) {
        return find(Pattern.compile(pattern), text);
    }
    
    /**
     * Set up a scanner for Bali (CS 212, Fall 2007).
     * @return a new scanner with the Bali tokens, ignored patterns, and keywords defined
     */
    public static Scanner212 baliScanner () {
        Scanner212 s = new Scanner212();
        s.defineIgnored("\\s+");                     // Ignore whitespace
        s.defineIgnored("#.*$");                     // Ignore comments (which start with #)
        s.defineToken("[a-zA-Z][a-zA-Z0-9]*", 'w');  // Word
        String exp = "([eE][-+]?[0-9]+)";            // Exponent part of a float
        s.defineToken("[0-9]+"+exp, 'f');            // Float with exponent and no decimal
        s.defineToken("((\\.[0-9]+)|([0-9]+\\.[0-9]*))"+exp+"?", 'f');
                                                     // Float with decimal and optional exp
        s.defineToken("[0-9]+", 'i');                // Unsigned integer
        s.defineToken("'.'", 'c');                   // Character
        s.defineToken("(\".*?\")+", 's');            // String ("" used for " within string)
        s.defineToken("\".*$", 'X');                 // Badly formed string (no close-quote)
        s.defineToken("(<=|>=|==|!=|.)", 'd');       // Delimiter (. matches anything)
        s.defineKeywords("end", "void",
                         "class", "extends", "endclass", "this", "super", "null",
                         "if", "then", "else", "endif", 
                         "loop", "while", "until", "endloop",
                         "return", "print",
                         "and", "or", "not", "true", "false");
        return s;
    }
    
    /**
     * Test program.  Scans a fixed sample string with the Bali scanner and prints
     * the line number, kind, and text of every token.
     * @param args ignored
     */
    public static void main (String[] args) {
        Scanner212 s = Scanner212.baliScanner();
        s.setSource("   hello 3e-5 .3 19.71 -18 0 \n" +
                    "12345 'x' \"string\", >= < .. if \"another string\"   \n" +
                    "    false \"x\"\"y\"# Comment \n" +
                    "6.02E+23  6x  # Another comment\n" + 
                    "\"string with unpaired quote");
        for (s.advance(); s.kind() != '\0'; s.advance()) {
            System.out.println(s.lineNumber() + ": " + s.kind() + "  " + s.token());
        }
    }
}