Changed the "labels" property to "tags". Added a method to query (abstract) labels, i.e. one that interprets the tag list to distill the underlying label "meaning".

This commit is contained in:
Aritra Chakraborty 2016-05-16 19:54:00 -04:00
parent 8540748220
commit dc6260be58
1 changed files with 75 additions and 37 deletions

View File

@ -1,6 +1,5 @@
# -*- coding: UTF-8 no BOM -*- # -*- coding: UTF-8 no BOM -*-
import os,sys import os,sys
import numpy as np import numpy as np
@ -26,7 +25,7 @@ class ASCIItable():
self.__IO__ = {'output': [], self.__IO__ = {'output': [],
'buffered': buffered, 'buffered': buffered,
'labeled': labeled, # header contains labels 'labeled': labeled, # header contains labels
'labels': [], # labels according to file info 'tags': [], # labels according to file info
'readBuffer': [], # buffer to hold non-advancing reads 'readBuffer': [], # buffer to hold non-advancing reads
'dataStart': 0, 'dataStart': 0,
} }
@ -50,7 +49,7 @@ class ASCIItable():
self.__IO__['out'] = outname self.__IO__['out'] = outname
self.info = [] self.info = []
self.labels = [] self.tags = []
self.data = [] self.data = []
self.line = '' self.line = ''
@ -160,7 +159,7 @@ class ASCIItable():
if self.__IO__['labeled']: # table features labels if self.__IO__['labeled']: # table features labels
self.info = [self.__IO__['in'].readline().strip() for i in xrange(1,int(m.group(1)))] self.info = [self.__IO__['in'].readline().strip() for i in xrange(1,int(m.group(1)))]
self.labels = shlex.split(self.__IO__['in'].readline()) # store labels found in last line self.tags = shlex.split(self.__IO__['in'].readline()) # store tags found in last line
else: else:
@ -179,11 +178,11 @@ class ASCIItable():
else: break # last line of comments else: break # last line of comments
if self.__IO__['labeled']: # table features labels if self.__IO__['labeled']: # table features labels
self.labels = self.data # get labels from last line in "header"... self.tags = self.data # get tags from last line in "header"...
self.data_read() # ...and remove from buffer self.data_read() # ...and remove from buffer
if self.__IO__['labeled']: # table features labels if self.__IO__['labeled']: # table features tags
self.__IO__['labels'] = list(self.labels) # backup labels (make COPY, not link) self.__IO__['tags'] = list(self.tags) # backup tags (make COPY, not link)
try: try:
self.__IO__['dataStart'] = self.__IO__['in'].tell() # current file position is at start of data self.__IO__['dataStart'] = self.__IO__['in'].tell() # current file position is at start of data
@ -196,7 +195,7 @@ class ASCIItable():
"""write current header information (info + labels)""" """write current header information (info + labels)"""
head = ['{}\theader'.format(len(self.info)+self.__IO__['labeled'])] if header else [] head = ['{}\theader'.format(len(self.info)+self.__IO__['labeled'])] if header else []
head.append(self.info) head.append(self.info)
if self.__IO__['labeled']: head.append('\t'.join(map(self._quote,self.labels))) if self.__IO__['labeled']: head.append('\t'.join(map(self._quote,self.tags)))
return self.output_write(head) return self.output_write(head)
@ -260,19 +259,58 @@ class ASCIItable():
try: try:
for item in what: self.labels_append(item) for item in what: self.labels_append(item)
except: except:
self.labels += [self._removeCRLF(str(what))] self.tags += [self._removeCRLF(str(what))]
else: else:
self.labels += [self._removeCRLF(what)] self.tags += [self._removeCRLF(what)]
self.__IO__['labeled'] = True # switch on processing (in particular writing) of labels self.__IO__['labeled'] = True # switch on processing (in particular writing) of tags
if reset: self.__IO__['labels'] = list(self.labels) # subsequent data_read uses current labels as data size if reset: self.__IO__['tags'] = list(self.tags) # subsequent data_read uses current tags as data size
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def labels_clear(self): def labels_clear(self):
"""delete existing labels and switch to no labeling""" """delete existing labels and switch to no labeling"""
self.labels = [] self.tags = []
self.__IO__['labeled'] = False self.__IO__['labeled'] = False
# ------------------------------------------------------------------
def labels(self,
           tags = None,
           raw = False,
          ):
    """
    Return abstract labels (e.g. "x" for "1_x","2_x",...) unless raw output is requested.

    Operates on the object's own tags or on a given list of tags.

    Parameters:
      tags -- list of tags to interpret; defaults to self.tags when None.
      raw  -- if True, return the tags unchanged instead of collapsing them.

    Returns:
      A new list in which every run "1_<label>","2_<label>",... of consecutive
      tags is replaced by the single entry "<label>", while all other tags are
      kept as they are. If raw is True (or tags is not iterable), the tags
      object itself is returned unmodified.
    """
    try:
        from collections.abc import Iterable                                      # Python 3.3+
    except ImportError:
        from collections import Iterable                                          # Python 2 fallback

    if tags is None: tags = self.tags

    if raw or not isinstance(tags, Iterable):                                     # nothing to interpret
        return tags                                                               # hand back what was asked about

    labelList = []
    pos = 0
    while pos < len(tags):
        if tags[pos].startswith('1_'):                                            # start of (possibly) multidim object
            label = tags[pos][2:]                                                 # strip leading "1_"
            dim = 1
            while pos < len(tags) and tags[pos] == '{}_{}'.format(dim,label):     # consume consecutive components
                pos += 1                                                          # next tag...
                dim += 1                                                          # ...should be one higher dimension
            labelList.append(label)                                               # pos now points past the run
        else:
            labelList.append(tags[pos])                                           # plain tag is its own label
            pos += 1

    return labelList
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def label_index(self, def label_index(self,
labels): labels):
@ -292,10 +330,10 @@ class ASCIItable():
idx.append(int(label)-1) # column given as integer number? idx.append(int(label)-1) # column given as integer number?
except ValueError: except ValueError:
try: try:
idx.append(self.labels.index(label)) # locate string in label list idx.append(self.tags.index(label)) # locate string in label list
except ValueError: except ValueError:
try: try:
idx.append(self.labels.index('1_'+label)) # locate '1_'+string in label list idx.append(self.tags.index('1_'+label)) # locate '1_'+string in label list
except ValueError: except ValueError:
idx.append(-1) # not found... idx.append(-1) # not found...
else: else:
@ -303,10 +341,10 @@ class ASCIItable():
idx = int(labels)-1 # offset for python array indexing idx = int(labels)-1 # offset for python array indexing
except ValueError: except ValueError:
try: try:
idx = self.labels.index(labels) idx = self.tags.index(labels)
except ValueError: except ValueError:
try: try:
idx = self.labels.index('1_'+labels) # locate '1_'+string in label list idx = self.tags.index('1_'+labels) # locate '1_'+string in label list
except ValueError: except ValueError:
idx = None if labels is None else -1 idx = None if labels is None else -1
@ -331,16 +369,16 @@ class ASCIItable():
try: # column given as number? try: # column given as number?
idx = int(label)-1 idx = int(label)-1
myDim = 1 # if found has at least dimension 1 myDim = 1 # if found has at least dimension 1
if self.labels[idx].startswith('1_'): # column has multidim indicator? if self.tags[idx].startswith('1_'): # column has multidim indicator?
while idx+myDim < len(self.labels) and self.labels[idx+myDim].startswith("%i_"%(myDim+1)): while idx+myDim < len(self.tags) and self.tags[idx+myDim].startswith("%i_"%(myDim+1)):
myDim += 1 # add while found myDim += 1 # add while found
except ValueError: # column has string label except ValueError: # column has string label
if label in self.labels: # can be directly found? if label in self.tags: # can be directly found?
myDim = 1 # scalar by definition myDim = 1 # scalar by definition
elif '1_'+label in self.labels: # look for first entry of possible multidim object elif '1_'+label in self.tags: # look for first entry of possible multidim object
idx = self.labels.index('1_'+label) # get starting column idx = self.tags.index('1_'+label) # get starting column
myDim = 1 # (at least) one-dimensional myDim = 1 # (at least) one-dimensional
while idx+myDim < len(self.labels) and self.labels[idx+myDim].startswith("%i_"%(myDim+1)): while idx+myDim < len(self.tags) and self.tags[idx+myDim].startswith("%i_"%(myDim+1)):
myDim += 1 # keep adding while going through object myDim += 1 # keep adding while going through object
dim.append(myDim) dim.append(myDim)
@ -350,16 +388,16 @@ class ASCIItable():
try: # column given as number? try: # column given as number?
idx = int(labels)-1 idx = int(labels)-1
dim = 1 # if found has at least dimension 1 dim = 1 # if found has at least dimension 1
if self.labels[idx].startswith('1_'): # column has multidim indicator? if self.tags[idx].startswith('1_'): # column has multidim indicator?
while idx+dim < len(self.labels) and self.labels[idx+dim].startswith("%i_"%(dim+1)): while idx+dim < len(self.tags) and self.tags[idx+dim].startswith("%i_"%(dim+1)):
dim += 1 # add as long as found dim += 1 # add as long as found
except ValueError: # column has string label except ValueError: # column has string label
if labels in self.labels: # can be directly found? if labels in self.tags: # can be directly found?
dim = 1 # scalar by definition dim = 1 # scalar by definition
elif '1_'+labels in self.labels: # look for first entry of possible multidim object elif '1_'+labels in self.tags: # look for first entry of possible multidim object
idx = self.labels.index('1_'+labels) # get starting column idx = self.tags.index('1_'+labels) # get starting column
dim = 1 # is (at least) one-dimensional dim = 1 # is (at least) one-dimensional
while idx+dim < len(self.labels) and self.labels[idx+dim].startswith("%i_"%(dim+1)): while idx+dim < len(self.tags) and self.tags[idx+dim].startswith("%i_"%(dim+1)):
dim += 1 # keep adding while going through object dim += 1 # keep adding while going through object
return np.array(dim) if isinstance(dim,Iterable) else dim return np.array(dim) if isinstance(dim,Iterable) else dim
@ -403,8 +441,8 @@ class ASCIItable():
def data_rewind(self): def data_rewind(self):
self.__IO__['in'].seek(self.__IO__['dataStart']) # position file to start of data section self.__IO__['in'].seek(self.__IO__['dataStart']) # position file to start of data section
self.__IO__['readBuffer'] = [] # delete any non-advancing data reads self.__IO__['readBuffer'] = [] # delete any non-advancing data reads
self.labels = list(self.__IO__['labels']) # restore label info found in header (as COPY, not link) self.tags = list(self.__IO__['tags']) # restore label info found in header (as COPY, not link)
self.__IO__['labeled'] = len(self.labels) > 0 self.__IO__['labeled'] = len(self.tags) > 0
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def data_skipLines(self, def data_skipLines(self,
@ -431,8 +469,8 @@ class ASCIItable():
self.line = self.line.rstrip('\n') self.line = self.line.rstrip('\n')
if self.__IO__['labeled'] and respectLabels: # if table has labels if self.__IO__['labeled'] and respectLabels: # if table has labels
items = shlex.split(self.line)[:len(self.__IO__['labels'])] # use up to label count (from original file info) items = shlex.split(self.line)[:len(self.__IO__['tags'])] # use up to label count (from original file info)
self.data = items if len(items) == len(self.__IO__['labels']) else [] # take entries if label count matches self.data = items if len(items) == len(self.__IO__['tags']) else [] # take entries if label count matches
else: else:
self.data = shlex.split(self.line) # otherwise take all self.data = shlex.split(self.line) # otherwise take all
@ -469,7 +507,7 @@ class ASCIItable():
1)) 1))
use = np.array(columns) use = np.array(columns)
self.labels = list(np.array(self.labels)[use]) # update labels with valid subset self.tags = list(np.array(self.tags)[use]) # update labels with valid subset
self.data = np.loadtxt(self.__IO__['in'],usecols=use,ndmin=2) self.data = np.loadtxt(self.__IO__['in'],usecols=use,ndmin=2)