Friday, December 28, 2018

Tokenizer and quoted strings

Quoted strings are de-escaped in place. The leading quote is left attached, though this might change; that is currently the default format. All variable names and string constants are null terminated by the tokenizer, in the current address space. I was just going over the quote rules a bit while looking at the code. Default always assumed the leading quote remained. Most applications prefer this: it identifies the current null-terminated string as possibly containing spaces, commas, quotes, whatever, which is a nice warning to have.

The tokenizer remains bloated, though I keep improving it. But here is where we want to invest effort; this is what attracts authors of snippets: the idea that their arguments come neatly arranged in an array of pointers. I would like to hear what the IPython folks think; they deal with exactly the same issue. A solid, controllable tokenizer, up front, is ideal for snippets, a real productivity tool.

I dropped the latest version below the fold.

The Hardly plan remains: a full 64-bit, high-speed stack machine available as a byte code on the bus. It executes stack operations right from the args list; it is built for that. It has full stack rate, stack-based objects, and a built-in, configurable shunt for converting your imperatives to RPN notation. Default is 95% through testing; the syntax is not operational, but the console loop is fine. The plan is to connect up Default and Xchars for final testing of both.



#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

// Table of token pointers; argtoks stores one pointer per token it finds.
typedef struct {
char * Data[100];
} Base;
typedef Base * PBase;
Base DefaultBase; // The one and only args list
int ArgcBase=0; // Current argc
int TokeBase=0; // The original tokens identified earlier, on the bus
int TokeEnd=0; // End of the token list: argtoks writes the next token pointer at this index
int StartCommand=0; // Tracks the start of the bus pass-through command (presumably; the original note was cut short)
int DepthBase=0; // Recursion depth
enum{NOTFOUND,SUCCESS}; // Status codes; argtoks and SetConsole return SUCCESS

// Punctuation classes. Each enumerator is an index into puncts[], and the
// entry at that index is the set of characters that belong to the class.
// PunctsCount is not a class: it is the number of classes, so that every
// enumerator (QuotePuncts included) is a valid index into puncts[].
enum {
    FilePuncts = 0,
    FileNamePuncts = FilePuncts,   // older spelling, kept as an alias
    DefNamePuncts,
    DefNamePunct = DefNamePuncts,  // older spelling, kept as an alias
    NamePuncts,
    EolPuncts,
    ExprPuncts,
    OpPuncts,
    SeparPuncts,
    PairPuncts,                    // the class the is_pair() macro looks up
    QuotePuncts,
    PunctsCount
};
char * puncts[PunctsCount];

// Defaults
// Load every punctuation class with its default character set.
void set_default_puncts(void) {
    puncts[FilePuncts]    = "/_.-";
    puncts[DefNamePuncts] = "-_.";
    puncts[NamePuncts]    = "_-.";
    puncts[EolPuncts]     = ";\n";
    puncts[ExprPuncts]    = "(=";
    puncts[OpPuncts]      = ";{}()*/=+-^&";
    puncts[SeparPuncts]   = "X";
    // NOTE(review): PairPuncts had no default, which left is_pair() reading
    // an unset pointer. It mirrors the quote set because the only is_pair()
    // call site handles quotes -- confirm the intended characters.
    puncts[PairPuncts]    = "\'\"`";
    puncts[QuotePuncts]   = "\'\"`";
}
// NOTE(review): despite its name this aliases the FilePuncts entry, not
// OpPuncts; nothing in this file uses it, so it is left as written.
#define op_puncts puncts[FilePuncts]

// using # for comments and semi colons
// fill in the bottom of args with token pointers


// Character-class predicates: each is non-zero / non-null when character
// `a` is a member of the corresponding puncts[] set.
//
// The bare strchr() forms (is_op, is_pair, is_eol, is_expr, is_quote) also
// match a == '\0', because strchr() finds the set's own terminator; callers
// must test for end of string themselves. is_separ and is_name reject '\0'
// explicitly so that scanning loops built on them stop at the terminator.
// is_separ and is_name evaluate `a` more than once: pass side-effect-free
// expressions only.
#define is_op(a) strchr(puncts[OpPuncts],(a))
#define is_pair(a) strchr(puncts[PairPuncts],(a))
#define is_eol(a) strchr(puncts[EolPuncts],(a))
#define is_expr(a) strchr(puncts[ExprPuncts],(a))
#define is_separ(a) ((a) && puncts[SeparPuncts] && strchr(puncts[SeparPuncts],(a)))
// The cast keeps isalnum() defined for negative char values.
#define is_name(a) (isalnum((unsigned char)(a)) || ((a) && strchr(puncts[NamePuncts],(a))) )
#define is_quote(a) strchr(puncts[QuotePuncts],(a))
// Tokenizer syntax modes: file syntax or expression syntax.
enum{ start_m, expr_m,eol_m,pass_m};
int mode; // Current syntax mode; argtoks resets it to start_m on entry
char comment_char ='#'; // When argtoks meets this character it hands the rest of the line to flush_line
// Discard the remainder of a line.
// Overwrites the character at `a` with a terminator, then returns a pointer
// to the first newline or end-of-string found after it.
char *  flush_line(char * a) {
    char * rest = a + 1;

    *a = '\0';
    return rest + strcspn(rest, "\n");
}
// Skip over a name: advance past every character accepted by is_name() and
// return a pointer to the first character that is not part of the name
// (which may be the string terminator).
char * flush_name(char * a) {
    while (is_name(*a)) {
        a++;
    }
    return(a);
}

// De-escape the body of a quoted string.
//
//   dest  where the de-escaped characters are written
//   src   first character after the opening quote
//
// dest and src may be the same pointer (in-place use): every escape consumes
// at least as many characters as it produces, so writes never overtake reads.
// Copying stops at the first unescaped quote character (any member of the
// quote set, see is_quote) or at the end of the string. The output is then
// null terminated; in place this overwrites the closing quote or lands before
// it.
//
// Returns a pointer just past the closing quote, or to the string terminator
// when the quote was never closed.
//
// Only the single-character escapes \a \b \f \n \r \t \v \\ \' \" \? are
// translated. A backslash before any other quote character yields that quote
// literally. Any other backslash is copied through unchanged; octal, hex and
// Unicode escapes are not decoded.
char * flush_quote(char * dest,char * src)  {
    static const char esc_codes[]  = "abfnrtv\\\'\"?";
    static const char esc_values[] = "\a\b\f\n\r\t\v\\\'\"?";
    const char * esc;

    while (*src && !is_quote(*src)) {
        if (*src == '\\' && src[1]) {
            esc = strchr(esc_codes, src[1]);
            if (esc) {
                *dest++ = esc_values[esc - esc_codes];
                src += 2;
                continue;
            }
            if (is_quote(src[1])) {
                // Escaped quote that is not in the table (e.g. a backtick).
                *dest++ = src[1];
                src += 2;
                continue;
            }
        }
        *dest++ = *src++;
    }
    if (*src) src++;   // step over the closing quote before it is overwritten
    *dest = 0;
    return(src);
}
// This will not handle ++ correctly, yet
// eol_m and expr_m are enumerators of the mode enum and must not be
// re-#defined here: doing so made expr_m equal to start_m, so the tokenizer
// could never switch into expression mode.

// Skip leading white space and separator characters; returns a pointer to the
// first character that is neither, or to the string terminator.
char * flush_separs(char * a){
    // The explicit *a test stops the scan at the terminator even if the
    // separator predicate accepts '\0'; the cast keeps isspace() defined for
    // negative char values.
    while (*a && (isspace((unsigned char)*a) || is_separ(*a))) a++;
    return(a);
}
int argtoks(char * src) {
PBase p = &DefaultBase;
char * op=0;
char * end=0;
char  ch=0;
mode = start_m;
while(*src) {
src=flush_separs(src);  // find next token start
if(!*src) { if(ch) *end=0; break;} // Null terminate last token
// look for mode change, like a flip flop
if(!(mode==expr_m) && is_expr(*src)) {
puncts[OpPuncts] = puncts[ExprPuncts];
puncts[NamePuncts] = puncts[DefNamePuncts];
mode=expr_m;
}
else if(!(mode = eol_m) &&  is_eol(*src)) {
puncts[OpPuncts] = puncts[FilePuncts];
puncts[NamePuncts] = puncts[DefNamePuncts];
mode=eol_m;
}
if(*src == comment_char) src = flush_line(src);
else {
op = strchr(puncts[OpPuncts], *src); // Is it a special?
if(op) { // operator to be tokenized
p->Data[TokeEnd]=op;  // point to stored copy, not source
if(ch && !is_op(ch) ) *end=0;
if( is_pair(*src) ) flush_quote(src); // quotes, likely needs escapin
ch=*op; //
src++;
} else { // a name like a variable
p->Data[TokeEnd]= src;
if(ch && !is_op(ch)) *end=0;
src = flush_name(src);
ch= *(src-1); // save previous ending char
}
end=src;
TokeEnd++;
}
}
return(SUCCESS);
}

int SetConsole(int * argc,void* args[]){
int i =*argc +1;
int j =0;
int mask=atoi((char *) args[i]);
while(mask) {
if(1 & (mask >> 1)) {
puncts[j]= args[i];
i++;
}
j++;
}
return(SUCCESS);
}
// Smoke run: load the default punctuation sets and tokenize one sample
// command line held in a zero-padded 30-byte buffer.
int main(void) {
    char line[30] = "Load /home/so/xchars";

    set_default_puncts();
    argtoks(line);
    return 0;
}
/*
\a 07 Alert (Beep, Bell) (added in C89)[1]
\b 08 Backspace
\f 0C Formfeed Page Break
\n 0A Newline (Line Feed); see notes below
\r 0D Carriage Return
\t 09 Horizontal Tab
\v 0B Vertical Tab
\\ 5C Backslash
\' 27 Single quotation mark
\" 22 Double quotation mark
\? 3F Question mark (used to avoid trigraphs)
\nnn (note 1) any The byte whose numerical value is given by nnn interpreted as an octal number
\xhh… any The byte whose numerical value is given by hh… interpreted as a hexadecimal number
\e (note 2) 1B Escape character (some character sets)
\Uhhhhhhhh (note 3) none Unicode code point where h is a hexadecimal digit
\uhhhh (note 4) none Unicode code point below 10000 hexadecimal
*/

No comments: