Numeric Expression Tokenizer (Python Code Lexical Analyzer)

This code breaks down a numeric expression into individual tokens (grammar words) and stores them in a Python list. The list is input to a program that evaluates the numeric expression and displays the results or an error message.

#!/usr/bin/python3
# ===================================================================
# parse a numeric expression into tokens
#
# Last Modified: 10/15/2020
# ===================================================================

import copy
import sys


# ---- one character operator tokens (arithmetic operators and
# ---- parentheses)

op1 = ['-', '+', '%', '*', '/', '(', ')']

# ---- two character operator tokens

op2 = ['**']


# -------------------------------------------------------------------
# Parse a numeric expression and return a (success, tokens) tuple;
# tokens is the expression as a list of tokens (strings)
# -------------------------------------------------------------------

def parse(text):
    """Split a numeric expression string into a list of token strings.

    Space characters between tokens are skipped.  A number token is
    one or more digits, optionally followed by a '.' and zero or more
    fraction digits, optionally followed by an exponent ('e' or 'E',
    an optional '+' or '-' sign and one or more digits).  Operator
    tokens are taken from the module-level lists op2 (two character
    operators, tried first) and op1 (one character operators).

    text - the expression string to tokenize

    Returns a tuple (success, tokens).  When a character cannot start
    a valid token, a diagnostic is printed, success is False and
    tokens holds the tokens collected before the error.
    """

    idx      = 0               # text string character index
    text_len = len(text)       # text string length
    tokens   = []              # list of tokens


    # ---------------------------------------------------------------
    # ---- local function: test for and return an operator
    # ---- (1 or 2 character operators)
    # ----
    # ---- idx     - current character index in text string
    # ---- txt     - text string
    # ---- txt_len - length of text string
    # ----
    # ---- returns (found, index to continue parsing at, operator)
    # ---------------------------------------------------------------

    def operator_token(idx, txt, txt_len):

        # ---- text index out of bounds?

        if idx >= txt_len:
            return (False, idx, '')

        # ---- test for 2 character operator first, otherwise '**'
        # ---- would be returned as two separate '*' tokens

        for op in op2:
            if txt.startswith(op, idx):
                return (True, idx + len(op), op)

        # ---- test for 1 character operator

        if txt[idx] in op1:
            return (True, idx + 1, txt[idx])

        # ---- it was not an operator

        return (False, idx, '')


    # ---------------------------------------------------------------
    # ---- local function: test for and return a number
    # ----
    # ---- idx     - current character index in text
    # ---- txt     - text string
    # ---- txt_len - length of text string
    # ----
    # ---- returns (found, index to continue parsing at, number)
    # ---- on failure the index returned is the one passed in
    # ---------------------------------------------------------------

    def number_token(idx, txt, txt_len):

        # ---- return the index of the first non-digit character
        # ---- at or after pos (txt_len if the digits run to the
        # ---- end of the text)

        def skip_digits(pos):
            while pos < txt_len and txt[pos].isdigit():
                pos += 1
            return pos

        start = idx

        # ---- integer part (at least one digit is required)

        idx = skip_digits(start)

        if idx == start:
            return (False, start, '')

        # ---- fraction part: a '.' followed by zero or more digits
        # ---- the digit scan is bounds checked, so a number that
        # ---- ends the text with '.' (for example '1.') is accepted

        if idx < txt_len and txt[idx] == '.':
            idx = skip_digits(idx + 1)

        # ---- exponent part: 'e' or 'E', optional sign, digits

        if idx < txt_len and (txt[idx] == 'e' or txt[idx] == 'E'):

            digits_start = idx + 1

            if digits_start < txt_len and txt[digits_start] in ('-', '+'):
                digits_start += 1

            digits_end = skip_digits(digits_start)

            # ---- an exponent marker without digits is not a number

            if digits_end == digits_start:
                return (False, start, '')

            idx = digits_end

        # ---- success

        return (True, idx, txt[start:idx])


    # ---------------------------------------------------------------
    # ---- main parse code ------------------------------------------
    # ---- parse the text string; and generate tokens ---------------
    # ---------------------------------------------------------------

    while idx < text_len:

        # ---- is the current character a space?
        # ---- skip space

        if text[idx] == ' ':
            idx += 1
            continue

        # ---- a digit must start a number token; any other
        # ---- character must start an operator token

        if text[idx].isdigit():
            (success, new_idx, token) = number_token(idx, text, text_len)
        else:
            (success, new_idx, token) = operator_token(idx, text, text_len)

        if success:
            idx = new_idx
            tokens.append(token)
            continue

        # ---- parsing error (idx is reported 1-based)

        print()
        print('Numeric Expression Parsing Error')
        print('Start of token char: {}'.format(text[idx]))
        print('Start of token idx : {}'.format(idx+1))
        print('Text string        : {}'.format(text))
        print('Text string length : {}'.format(text_len))
        return (False,tokens)


    return (True,tokens)


# -------------------------------------------------------------------
# main
# -------------------------------------------------------------------

if __name__ == '__main__':

    # ---------------------------------------------------------------
    # ---- running Python3?
    # ---- returns True when the interpreter major version is 3
    # ---------------------------------------------------------------

    def RunningPython3():
        return sys.version_info[0] == 3

    # ---------------------------------------------------------------
    # ---- prompt the user for input
    # ---- returns the entered line with leading and trailing
    # ---- whitespace removed
    # ----
    # ---- prompt - text displayed to the user
    # ---- py3    - True to read with input(), False to read with
    # ----          raw_input()
    # ---------------------------------------------------------------

    def GetUserInput(prompt,py3):
        if py3:
            return input(prompt).strip()
        else:
            return raw_input(prompt).strip()

    # ---------------------------------------------------------------
    # ---- pause program until the user presses enter
    # ---------------------------------------------------------------

    def Pause(py3):
        print('')
        GetUserInput('Press enter to continue ',py3)

    # ---------------------------------------------------------------
    # ---- process a numeric expression string
    # ---- display information
    # ----
    # ---- exp - numeric expression string
    # ----
    # ---- on a parsing error the partial token list is printed and
    # ---- the program exits, so the value returned is always True
    # ---------------------------------------------------------------

    def process_expression(exp):

        print()
        print('-------------------------------------')
        print('expression: {}'.format(exp))

        (success,tlist) = parse(exp)

        if not success:
            print(tlist)
            sys.exit()

        print('Token list length is {}'.format(len(tlist)))
        for t in tlist:
            print(t)

        return success

    # ---- parse test expressions    
    #
    ##exp = [ '11 + 202 - 3003',
    ##        '1 + 2 * 3',
    ##        '(1 + 2) * 3',
    ##        '-10 + 2 * 3',
    ##        '-10 + (2 * 3)',
    ##        '(-10 + 2) * -3',
    ##        '-((2 * 3) - (4 * 8 -2))',
    ##        '2**3 -4 *2' ]
    ##
    #for e in exp:
    #    if not process_expression(e):
    #        sys.exit()

    # ---- parse user input (numeric expression)
    # ---- an empty line or end of input ends the program

    py3 = RunningPython3()

    while True:

        print()

        try:
            e = GetUserInput('Enter expression: ',py3)
        except EOFError:
            # ---- end of input (Ctrl-D or exhausted piped stdin)
            # ---- is handled like an empty line
            e = ''

        if not e:
            print()
            break

        process_expression(e)