#! /usr/bin/python3
# ===================================================================
# parse a string containing CSVs and return a list
# containing the values
#
# Note: This is based on my Perl CSV parser.
# ===================================================================
# Note:
# 1. Some data may have values that are not surrounded by
# quotes but have imbedded quotes in them. This does not
# follow the rules for normal CSV strings and makes
# the parser more complex. None of the existing parsers
# I found were able to handle this kind of data.
# 2. Imbedded quotes (" or ') in a value are returned as part of
# the value.
# 3. The parser recognizes the following value formats:
# "ab"c" , returns (ab"c)
# "ab'c" <EOS> returns (ab'c)
# 'ab,c' , returns (ab,c)
# 'a bc' <EOS> returns (a bc)
# ab c , returns (ab c)
# ab c <EOS> returns (ab c)
# 4. The comma is a separator, not a terminator. The string "a,"
# returns two values (strings), an "a" and an empty string.
# (EOS terminates the last string.)
# 5. The \n must be removed from a string before parsing it.
# (Lines read from a file include the \n character.)
# 6. Values are returned with leading and trailing spaces
# removed.
# 7. I coded this the hard (long) way. I did not use a
# convoluted, complex regular expression. I think it is
# easier to see what is going on this way.
# ===================================================================
# Something to test: instead of using a string slice, use/add group(2)
# in the regular expressions. Which is more
# efficient (faster)?
# ===================================================================
import re
# ---- compile regexp patterns (non-greedy matching)
# ---- every pattern is anchored at the start of the string it is
# ---- matched against; group(1) of p1-p4, p6 and p7 is the value
# ----   p1 / p2 : "double quoted" value followed by a comma / EOS
# ----   p3 / p4 : 'single quoted' value followed by a comma / EOS
# ----   p5      : text that starts with a quote character
# ----             (parse_csv tests it after p1-p4 have failed)
# ----   p6 / p7 : unquoted value followed by a comma / EOS
p1 = re.compile(r'^\s*"(.*?)"\s*,')
p2 = re.compile(r'^\s*"(.*?)"\s*$')
p3 = re.compile(r"^\s*'(.*?)'\s*,")
p4 = re.compile(r"^\s*'(.*?)'\s*$")
# ---- p5 is a raw string (it was an f-string, which made \s an
# ---- invalid string escape). The compiled pattern is unchanged.
# ---- Note: inside [...] the \1 is NOT a back reference; the re
# ---- module reads it as the character chr(1), so [^\1]* accepts
# ---- any text that does not contain chr(1).
p5 = re.compile(r'^\s*(["\'])[^\1]*$')
p6 = re.compile(r'^\s*(.*?)\s*,')
# ---- (.*?) must be non-greedy here, otherwise the trailing spaces
# ---- end up inside group(1) instead of being eaten by the final \s*
p7 = re.compile(r'^\s*(.*?)\s*$')
# -------------------------------------------------------------------
# ---- parse CSV string, return a list of values
# -------------------------------------------------------------------
def parse_csv(csvstr):
    """Split a CSV string into a list of values.

    The module-level patterns p1..p7 are tried in a fixed order against
    the part of the string that has not been consumed yet. The first
    pattern that matches decides what happens next:

    * p1, p3, p6 (value followed by a comma): group(1) is stored and
      parsing continues with the text after the match.
    * p2, p4, p7 (value followed by end of string): group(1) is stored
      and parsing is finished.
    * p5: the string is rejected.

    Args:
        csvstr: the string to parse.

    Returns:
        The list of values in the order they were found. An empty list
        is returned when csvstr is empty, when p5 matches the remaining
        text, or when no pattern matches at all; values collected up to
        that point are discarded.
    """
    # ---- nothing to parse?
    if len(csvstr) == 0:
        return []

    # ---- what to do when a pattern matches
    MORE, LAST, BAD = 'more', 'last', 'bad'

    # ---- the order of the rules is significant (first match wins)
    rules = (
        (p1, MORE),
        (p2, LAST),
        (p3, MORE),
        (p4, LAST),
        (p5, BAD),
        (p6, MORE),
        (p7, LAST),
    )

    values = []
    remaining = csvstr

    while True:
        # ---- find the first rule that matches the remaining text
        for pattern, action in rules:
            hit = pattern.match(remaining)
            if hit is not None:
                break
        else:
            # ---- no pattern matched at all
            return []

        if action == BAD:
            return []

        values.append(hit.group(1))

        if action == LAST:
            return values

        # ---- drop the matched value (and its comma) and keep going
        remaining = remaining[hit.end():]
# -------------------------------------------------------------------
# ---- main
# -------------------------------------------------------------------
if __name__ == '__main__':
    # ---- interactive test driver: read a CSV string from the user,
    # ---- parse it and display the values; an empty input ends the
    # ---- program
    # ---- (the ui helpers live in the user_interface module, which is
    # ---- not part of this file)
    import user_interface as ui

    if not ui.running_python3():
        print('end program - not running Python3')
        # ---- quit() is added by the site module for interactive use
        # ---- and is not always available; SystemExit always is
        raise SystemExit

    while True:  # loop
        ui.clear_screen()
        print()
        s = ui.get_user_input('Enter CSV string: ')
        if not s:  # empty string?
            break
        if s == 'empty':  # test an empty string
            s = ''
        lst = parse_csv(s)
        print()
        print('---- end of CSV parse ----')
        print(f'CSV string is {s}')
        print(f'CSV list length is {len(lst)}')
        if not lst:
            print('CSV list is empty')
        else:
            print('CSV list:')
            # ---- separate name so the input string s is not clobbered
            for value in lst:
                print(f'-> ({value})')
        ui.pause()