Web text scraper, dll option with improved io, relative indexing

Complete web page text scraper.  Add this link for Tags.c.



 /////////////////////





/////////////////////////////////////////////////////////////////////
///  Version 1.0, production alpha
// This code will not change much, but some thinning is still required.
// This code creates pointers back to the original html source,
// called the tag index stack.
// As an index, the stack allows join to treat the original html
// as a graph database with skip and step functionality on plain text
//
// use gcc scrape.c // standalone, need to enable the main
//
// gcc -g -o scrape.dll -s -shared scrape.c -Wl,--subsystem,windows // dll
// Name a simple html file test.html; it produces test.hdx,
//   an index on the source.
// 
//  Contact Matt Young youngsanger@gmail.com for information
/////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "tags.h"
#define BUFSIZE 10// Egad! Need dynamic allocation


// A structure for quick and easy buffering 
// of input from large files
/*
 * Read window over an input file, refilled one chunk at a time by More().
 * Field order is part of the layout and must not change.
 */
typedef struct {
 char *buf;    /* window storage; More() stores a NUL at buf[count] */
 FILE *fin;    /* stream the window is filled from */
 int bufsize;  /* bytes requested on every refill */
 int offset;   /* source position that buf[0] corresponds to */
 int count;    /* bytes delivered by the most recent refill */
} Into;
typedef Into *PInto;
/*
 * Shared failure exit for the buffered reader.
 * Writes msg plus a newline to stdout when msg is non-NULL, marks the
 * window as empty (count = 0) and hands back -1 for the caller to return.
 */
int msg_exit(PInto io,char * msg) {
 if (msg != NULL) {
  puts(msg);
 }
 io->count = 0;
 return -1;
}
 // Like the more function on a blog skip break
/*
 * Refill the window with the next chunk of the file.
 * A previous refill that delivered fewer than bufsize bytes is taken to mean
 * the input is exhausted, so nothing more is read in that case.
 * On success the window offset advances by bufsize, the data is
 * NUL-terminated at buf[count], and 0 (the first index of the new window)
 * is returned. On exhaustion or a read that delivers nothing, msg is
 * reported through msg_exit() and -1 is returned.
 */
int  More(PInto io,char * msg) {
 int got;

 if (io->count != io->bufsize) {
  return msg_exit(io, msg);
 }
 got = fread(io->buf, 1, io->bufsize, io->fin);
 io->count = got;
 if (got <= 0) {   /* also covers a negative EOF value */
  return msg_exit(io, msg);
 }
 io->offset += io->bufsize;   /* the window being replaced was a full one */
 io->buf[got] = 0;
 return 0;
}
/*
 * Step one byte forward in the window.
 * Returns the new index, or, when the byte there is the NUL terminator,
 * whatever More() returns (0 after a refill, -1 with message m on failure).
 */
int bump(int p,PInto q,char * m){
 int next = p + 1;
 return q->buf[next] ? next : More(q, m);
}
// All comments dropped
//

// Tag value carrying the TagDone code. It is not referenced anywhere else in
// the visible part of this file.
// NOTE(review): Tag is declared in tags.h; the initializer presumably fills
// {name, code} in that order -- confirm against the header.
Tag TagDoneTag = {"TagsTagDone",TagDone};  // Special end of stack tag
//
// All the spaghetti needed to find the tags, their type and symbol here
// works directly off the source buffer which is never touched
// Get the next tag from a stream of html
//  ptr is an integer array index into the source
// ch(p): the source byte at the current scan index. The macro ignores its
// argument and always expands to q->buf[ptr], so it only works inside a
// function that has locals named q and ptr.
#define ch(p) q->buf[ptr]
// get_tag: scan forward from index ptr to the next '<' and classify the tag.
//   var - receives the tag name (written through var->name) and a code
//   ptr - index into q->buf where scanning starts
//   q   - buffered input; bump() refills it when the window runs out
// Returns -1 as soon as bump() reports an error or end of input. For a "<!"
// tag it returns the index following the closing '>' with code TagSingle and
// an empty name.
// NOTE(review): two lines below (the end of the name loop and the end of the
// skip-to-'>' loop) were damaged when this listing was extracted from HTML:
// attribute-like residue such as acquired="" sits in the middle of the code.
// The end of get_tag and the header of set_tag_ptr are lost, so this span
// does not compile as shown; restore it from the original scrape.c.
int get_tag(Tag * var,int ptr, PInto q) {
 char * text = var->name; 
 *text=0;
 var->code=TagNull;
 // pass over everything up to the next '<'
 while( ch(ptr) != '<') {
  //printf("%c",ch(ptr)); 
  if((ptr =bump(ptr,q,"EOF")) < 0) return(-1);
 }
 if((ptr=bump(ptr,q,"Open error")) < 0) return(-1); 
 if(ch(ptr)== '!') { // first char in side bracket
  var->code= TagSingle;
  var->name[0]=0;  // comments have no name
  while(ch(ptr) != '>') 
   if((ptr=bump(ptr,q,"Comment error")) < 0) return(-1) ;
  if((ptr=bump(ptr,q,"Comment error")) < 0) return(-1) ;
  return(ptr);  // Comments dropped
  }
 // pass over non-letters before the name; a '/' seen here marks a closing tag
 while( (ch(ptr) != '>') && !isalpha(ch(ptr))) {
  if(ch(ptr)=='/') 
   var->code = TagClose;  // TagClose name optional
  if((ptr=bump(ptr,q,"Bracket error"))< 0) return(-1); 
  }

 while(isalpha(ch(ptr))) {// collect any name
  *text=ch(ptr);
  text++; 
  // NOTE(review): the next line is extraction-damaged (see header note)
  if((ptr = bump(ptr,q,"Name error"))<0 acquired="" formal="" if="" name="" nulled="" or="" return="" tag="" text="0;" the="" var-="">name) var->code=TagSingle;
 //printf("\nSymbol %s %d",var->name,strlen(var->name));
 while(ch(ptr) != '>') { // TagSkip to the gt bracket
 // if(ch(ptr) == '/') // we see these at the front and at the end of the <  > pair
 //  var->code=Tagclose;
  // NOTE(review): the next line is extraction-damaged; the remainder of
  // get_tag and the opening of set_tag_ptr were swallowed here
  if((ptr=bump(ptr,q,"Close error")) <0 a="" adjusting="" ag="" against="" big="" bracket="" check="" chunk="" close="" code="" discovered="" here="" i="" if="" int="" just="" m-="" m="" mainly="" of="" p="tags;" pairs="" past="" points="" ptr="" return="" set_tag_ptr="" supress="" table="" tag="" tags="" the="" we="" word="">code == TagClose || m->code == TagSingle) return(m->code); // Nothing to adjust
 // What follows is the surviving tail of set_tag_ptr: p walks a table of
 // tags until it reaches an entry with an empty name or one whose name
 // equals m->name. Presumably p starts at the tags table -- its
 // initialization is in the damaged text above.
 if(m->name) // find the html name in the book
  while(*p->name && strcmp(p->name,m->name) ) p++;
 if(!(*p->name)) {
  printf("\nMissing %s ",m->name); // Not really an error, soem tags just default
  m->code=TagSkip; // Then just dump it, and any enclosed blocks
 }
 else {
  m->code = p->code;// tag modified for text scraping
  //printf(" \nFind %s %d",p->name,p->code);
 }
 return(m->code);
}

// A place to stack up all the pointers marking
// tags in the source
// This is kept static with the source file as
// a step and skip index for the source
/*
 * One entry of the tag index: where a tag sits in the source and how the
 * scraper classified it. Field order is part of the layout.
 */
typedef struct {
 int loc;    /* position in the source this entry points at */
 int code;   /* tag code assigned to the entry */
 int count;  /* count of enclosed tags, >= 1 */
 int link;   /* stack index stored by process_block(); returned as the skip target */
} Stack;
typedef Stack *PStack;
// Fixed-capacity tag index shared by every function in this file.
Stack stack[40000];  // All the tags in one page
int intodex=0;  // next free slot in stack, i.e. how many entries were pushed (not a rolodex)
int outdex=0;  // not referenced in the visible part of this file
// Simple push stack
// Append one entry to the tag stack.
//   loc  - source position to record
//   code - tag code to record
//   link - initial link value (filled in later by process_block)
// Returns the new entry count (intodex). When the stack has no free slot the
// entry is dropped, a message goes to stderr and -1 is returned, instead of
// writing past the end of the array.
int push(int loc,int code,int link) {
 const int capacity = (int)(sizeof stack / sizeof stack[0]);
 PStack s;

 if(intodex < 0 || intodex >= capacity) {
  fprintf(stderr,"Tag stack full, entry at %d dropped\n",loc);
  return(-1);
 }
 s = stack+intodex;
 s->loc=loc;
 s->code=code;
 s->link = link;  // the links used elsewhere
 intodex++;
 return(intodex);
}
// debug utility for debug
// Debug helper: print a label together with the code and source index of one
// stack entry.
//   text - label printed first
//   ptr  - stack entry to describe
// The entry's loc is an integer index into the source (nomain pushes
// src + offset, emit subtracts the window offset again), not a memory
// address. The earlier version cast it to char * and read through it; this
// function has no access to the input window, so no source snippet is shown.
void debug(char * text,PStack ptr) {
 printf(" %s %d %d \n",text,ptr->code,ptr->loc);
}
//
// Block procesing done, there is a valid stack
// TagSkip through the stack in DOM order,
// the pointers back to the source take 
// you through each section, to ignore or collect.
// uses relative pointers
// emit: walk the marked stack in order and copy to stdout the plain text
// that follows every entry not coded TagSkip, up to the next '<'.
//   offset - unused; kept so existing callers keep compiling
//   io     - input window, already rewound and primed with the first chunk
// Returns 0 when a TagDone entry or the end of the stack is reached, -1 when
// the input cannot be advanced to an entry's position or runs out mid-text.
int emit(int offset,PInto io) {
 int Itag;
 int Iptr;
 int Iloc;

 (void)offset;
 printf("Start Emit %d\n",intodex);
 io->offset=0;
 for(Itag=0;Itag < intodex; Itag++) {
  if(stack[Itag].code == TagDone)
   return(0);
  if(stack[Itag].code == TagSkip)
   continue;
  Iloc= stack[Itag].loc;
  // Read forward until the window contains Iloc. A failed refill must stop
  // here: otherwise the index computed below would lie outside the window.
  while(Iloc >= (io->offset+io->bufsize)) {
   if(More(io,"Input flush") < 0) return(-1);
  }
  Iptr= Iloc-io->offset;   // absolute position -> index into the window
  if(Iptr < 0) return(-1); // entry lies before the current window
  while(io->buf[Iptr] != '<')  {
   fputc(io->buf[Iptr],stdout);
   if((Iptr=bump(Iptr,io,"Emit error"))< 0) return(-1);
  }
 }
 printf("Emit done");
 return(0);
}

// Open(code): true when code is none of TagSingle, TagText, TagClose, TagDone.
// NOTE(review): neither the argument nor the whole expansion is
// parenthesized; it is only used as a complete if() condition below.
#define Open(code) (code != TagSingle) && (code != TagText) && (code != TagClose) && (code != TagDone)
// process_block: walk the tag stack from index `in`, recursing one level for
// every entry whose successor is an Open() tag, and re-coding entries that
// are neither TagSkip nor TagSingle as TagEmit.
//   in - stack index to start from
// Returns a stack index: `in` at the point where this level stopped (on a
// TagDone entry or when the next entry is TagClose), or intodex when a
// nested call returned intodex.
int process_block(int in) { 
 int i=0;
 int nextcode,newcode;
 PStack ptr;
 //debug("Enter",ptr);
//
// process all blocks  in nested order until parent close
// We just need to mark the residual text
//
 do {
  ptr = stack+in;
  //debug("Sequence",ptr);
  // NOTE(review): reads stack[in+1] with no check against intodex
  nextcode=(ptr+1)->code;
  if(Open(nextcode))  {// Is following tag an compound  block
   i = process_block(in+1);  // Descend for any nested open
   stack[in].link = i;  // set the skip pointer
   if(i == intodex) {
    //debug("Finished",ptr);
    return(i);
   }
  }
  // Now at this point, pass singletons and skips 
  if(ptr->code != TagSkip && ptr->code != TagSingle) 
   ptr->code = TagEmit;  // This statement is my entire language set, grab plain text
 
  // NOTE(review): the assignment above has already turned a TagDone code
  // into TagEmit (unless TagDone equals TagSkip, TagSingle or TagEmit), so
  // this test may never fire -- confirm the intended order.
  if((ptr->code) == TagDone) {
   //debug("End of blocks",ptr);
   return(in); // done
  }  
  //debug("Peek",ptr);
  if(nextcode == TagClose)   {//Parent is closed
   return(in);  // restart from the open tag
  }
  in++;
 } while(1);
 // The loop above has no break, so the two statements below are unreachable.
 printf("Fall through Error\n");
 return(in);
}


// A debugger utility
// Debug helper: list every stack entry with its position, its code and up to
// nine bytes of source text found at that position.
//   p - input window the positions are resolved against
// Stack positions are absolute, so the window index is loc - p->offset (the
// same conversion emit uses). Entries outside the current window are listed
// with empty text rather than read out of bounds.
void print_stack(PInto p) {
 int i;
 char text[10];
 printf("stack\n");
 for (i=0;i < intodex;i++) {
  int rel = stack[i].loc - p->offset;
  int n = 0;
  if(rel >= 0 && rel < p->count) {
   n = p->count - rel;
   if(n > 9) n = 9;
   memcpy(text,p->buf + rel,n);
  }
  text[n]=0;
  printf("%d %d |%s|\n", stack[i].loc,stack[i].code,text);
 }
}
//
// write should use relative pointers
// then on read, use the end block to recover relative pointers
//
// Dump the tag stack to a text file, one "loc link code" line per entry.
//   filename - path of the file to create or overwrite
// Returns the number of entries written, or 0 when the file cannot be opened.
int write_stack(char * filename) {
  int written = 0;
  FILE* out = fopen(filename,"wt");

  if (out == NULL) {
   return 0;
  }
  while (written < intodex) {
   fprintf(out,"%d %d %d\n",
           stack[written].loc,stack[written].link,stack[written].code);
   written++;
  }
  fclose(out);
  return written;
}
// test utility for standalone use if you remove the "no" in nomain
// p is a null terminated string of html
char * Tagnames(int id); 
// nomain: run the whole scrape over an already opened and primed input.
//   p - unused; kept for existing callers
//   q - input window holding the first chunk of the file
// Steps: push every tag found by get_tag onto the stack (positions stored as
// absolute source offsets), terminate the stack with TagDone, mark it with
// process_block, rewind the input, write the index to test.hdx and emit the
// plain text. Always returns 0.
//
// The former `for(i...NUMTAGS)` debug loop had its body commented out, which
// silently made the scan loop its body; it is removed so the scan runs
// exactly once whatever NUMTAGS is.
int nomain(char * p,PInto q) {
 int src = 0;  // array index into the input buf

 (void)p;
 // find the tags, push them onto the stack
 while(src >= 0){
  Tag m;
  src = get_tag(&m,src,q); // src now points past the close bracket
  //printf("\nTag %s %s src %d off %d", m.name,Tagnames(m.code),src,q->offset);
  if(src>=0) {// window index -> absolute source position
   push((src+q->offset),set_tag_ptr(&m),0);  // pointer to html source and  code
  }
 }
 printf("Blocks\n");
 push(src,TagDone,0);
 process_block(0);
 // Rewind and prime the window again for the emit pass
 if(fseek(q->fin,0, SEEK_SET)) printf("Seek error\n");
 *q->buf = 0;
 q->offset=-q->bufsize;  // More() adds bufsize, so the first window starts at 0
 q->count=q->bufsize;    // lets More() treat the previous window as full
 More(q,"Restart error");
 printf("saught\n");
 write_stack("test.hdx");
 emit(0,q); 
 return(0);
}
#define STANDALONE
#ifdef STANDALONE
// Standalone entry point: scrape one html file.
// The file named by the first command line argument is used when one is
// given; otherwise small.html, as before. Returns 1 when the file cannot be
// opened or the window cannot be allocated, 0 otherwise.
int main(int argc, char * args[]) {
 Into p;
 PInto q = &p;
 const char *path = (argc > 1) ? args[1] : "small.html";

 p.fin = fopen(path,"r");
 if(!p.fin)
  return(1);
 p.buf = malloc(BUFSIZE+1);  // one extra byte for the terminator More() writes
 if(!p.buf) {
  fclose(p.fin);
  return(1);
 }
 p.bufsize=BUFSIZE;
 p.buf[0]=0;
 p.count = p.bufsize;    // lets More() treat the (non-existent) previous window as full
 p.offset=-p.bufsize;    // More() adds bufsize, so the first window starts at 0
 More(q,"Start error");
 nomain(p.buf,q);
 free(p.buf);
 fclose(p.fin);
 return(0);
}
// Map a tag code to a short printable name for debug output.
// Codes are tried in the listed order; anything unknown yields "Dunno".
char * Tagnames(int id) {
 const struct { int code; char *label; } known[] = {
  { TagNull,   "Null"   },
  { TagEmit,   "Emit"   },
  { TagStop,   "Stop"   },
  { TagClose,  "Close"  },
  { TagText,   "Text"   },
  { TagSingle, "Single" },
  { TagSkip,   "Skip"   },
  { TagDone,   "Done"   },
 };
 unsigned k;

 for (k = 0; k < sizeof known / sizeof known[0]; k++) {
  if (known[k].code == id) {
   return known[k].label;
  }
 }
 return "Dunno";
}
#else
#include "cursor.h"
// StepFetchTagSkip: starting at stack index i, find the next entry whose
// code is not TagSkip and act on it according to method.
//   i      - stack index to start from
//   offset - added to each stack loc to form a char pointer
//   method - compared with Fetch, and combined with Skip and Step below
//   key    - destination of the copied text when method == Fetch
// Returns -1 on a TagDone entry; stack[i].link or i+1 for the first non-skip
// entry; 0 (after printing "Emit done") when i runs off the end of the stack.
int StepFetchTagSkip(int i, int offset, int method,char * key) {
 char * p;
 char * q;
 Tag t;  // unused
 while(i < intodex) {
  if(stack[i].code == TagDone)
   return(-1);
  if(stack[i].code!=TagSkip) {   // we have plain tgext
   // NOTE(review): an int sum is assigned to a char * without a cast; this
   // presumably dates from a build where int and pointers have the same
   // size and offset holds a buffer address -- confirm against the caller.
   p= stack[i].loc  + offset;
   q= stack[i+1].loc + offset;
   do {q--; } while(*q && *q != '<');  // q moved to open of post text bracket
   // NOTE(review): strncpy does not NUL-terminate key when q-p bytes are
   // copied without meeting a NUL in the source.
   if(method == Fetch)
    strncpy(key,p,q-p);
   // NOTE(review): `method && Skip` is a logical AND, true whenever both
   // are nonzero, so with a nonzero Skip every nonzero method returns here
   // and the Step test below is never reached. `method == Skip` was
   // probably intended -- confirm against cursor.h.
   if(method  && Skip) 
    return(stack[i].link);//skip forward 
   if(method && Step)
     return(i+1); 
   return(i+1); // error?
  }
  else
   i++;// look at next tag
 }
 printf("Emit done");
 return(0);
}
// This is the join interface, which exports step and skip
// functionality for join. This code remains unused when
// compiled as a standalone
//

// All dll attachments export the same two entry point
// Exec and Eval, all the same format, all the same name, I think.
// Scrape does not participate in eval, as near as I can tell
//
// Eval entry point of the dll interface. The scraper takes no part in
// evaluation: the call is only traced on stdout and Null is returned.
__declspec(dllexport) int __cdecl Eval(PCursor self,PCursor other) {
 (void)self;
 (void)other;
 puts("Eval got called");
 return Null;
}

// 
// The exec uses the stack, already indexed and marked, 
// as a step and skip guide
int InitScraperCursor( PCursor self, void * args[]);
// Exec entry point of the dll interface.
//   self   - cursor; self->current holds the stack index, self->state the offset
//   method - Init builds the index; anything else is passed to StepFetchTagSkip
//   data   - for Init, the argument vector handed to InitScraperCursor
// Returns TagDone when the cursor is outside the stack or sits on the TagDone
// entry, Null otherwise.
// Init is handled before the stack is consulted, because self->current is
// not meaningful until a cursor has been initialised. The index is
// range-checked because StepFetchTagSkip stores -1 in it at the end of the stack.
__declspec(dllexport) int __cdecl Exec(PCursor self, int method,void * data) {
 int i;
 int offset;

 printf("Exec got called\n");
 if(method== Init) {
  printf("Scrape  ");
  InitScraperCursor(self,data);
  printf("Initialize new cursor\n");
  return(Null);
 }
 i= (int)self->current;
 offset=(int)self->state;
 if(i < 0 || i >= intodex || stack[i].code == TagDone)
  return(TagDone);
 self->current= (void *)
  StepFetchTagSkip(i, offset, method,self->element.key);
 return(Null);
}

// InitScraperCursor: open a source file, set up its input window, attach it
// to the cursor and run the scrape.
//   self - cursor to initialise; the window is stored in self->start
//   args - NULL-terminated vector: args[0] file name, optional args[1] buffer
//          size as decimal text (missing, zero or negative selects BUFSIZE)
// Returns 0 on success, 1 when the file name is missing, the file cannot be
// opened or memory cannot be allocated.
int InitScraperCursor( PCursor self,void* args[]) {
 PInto ptr;

 printf("Init  \n");
 if(!args || !*args) return(1);
 ptr= malloc(sizeof *ptr);
 if(!ptr) return(1);
 ptr->fin = fopen(*args,"rb");
 if(!ptr->fin) {
  printf("No file\n");
  free(ptr);
  return(1);
 }
 args++; 
 ptr->bufsize = *args ? atoi(*args) : BUFSIZE;
 if(ptr->bufsize <= 0) ptr->bufsize = BUFSIZE;  // Zero or less not allowed
 ptr->buf=malloc(ptr->bufsize + 1);  // one extra byte for the terminator More() writes
 if(!ptr->buf) {
  fclose(ptr->fin);
  free(ptr);
  return(1);
 }
 *ptr->buf=0;
 ptr->count = ptr->bufsize;    // lets More() treat the previous window as full
 ptr->offset = -ptr->bufsize;  // More() adds bufsize, so the first window starts at 0
 self->start = (void *) ptr;
 self->prevOp = ',';
 More(ptr,"Start error");
 nomain(self->current,ptr);
 return(0);
}
#endif

No comments: