///////////////////// ///////////////////////////////////////////////////////////////////// /// Version 1.0, production alpha // this code will not change much but some thinning is still required. // Tis code creates pointers back to the original html source, // called the tag index stack.f // As an index, the stack allows join to treat the original html // as a graph database with skip and step functionality on plain text // // use gcc scrape.c // standalone, need to enable the main // // gcc -g -o scrape.dll -s -shared scrape.c -Wl,--subsystem,windows // dll // name a simple html file test.html it produces test.hdx, // and index on the source. // // Contact Matt Young youngsanger@gmail.com for information ///////////////////////////////////////////////////////////////////// #include#include #include #include #include "tags.h" #define BUFSIZE 10// Egad! Need dynamic allocation // A structure for quick and easy buffering // of input from large files typedef struct { char* buf; FILE* fin; int bufsize; int offset; // matching buffer to source int count; } Into; typedef Into * PInto; int msg_exit(PInto io,char * msg) { if(msg) printf("%s\n",msg); io->count=0; return(-1); } // Like the more function on a blog skip break int More(PInto io,char * msg) { if(io->count != io->bufsize) return(msg_exit(io,msg));// Still not done io->count = fread(io->buf,1,io->bufsize,io->fin); if(io->count == EOF || (io->count <=0)) return(msg_exit(io,msg)); io->offset = io->offset + io->bufsize; // the previous buffer was full io->buf[io->count]=0; //printf("%d bytes read start \n",io->count); return(0); // start index } int bump(int p,PInto q,char * m){ p++; if(!q->buf[p]) return(More(q,m)); else return(p); } // All comments dropped // Tag TagDoneTag = {"TagsTagDone",TagDone}; // Special end of stack tag // // All the spaghetti needed to find the tags, their type and symbol here // works directly off the source buffer which is never touched // Get the next tag from a stream of html // ptr 
is an integer array index into the source #define ch(p) q->buf[ptr] int get_tag(Tag * var,int ptr, PInto q) { char * text = var->name; *text=0; var->code=TagNull; while( ch(ptr) != '<') { //printf("%c",ch(ptr)); if((ptr =bump(ptr,q,"EOF")) < 0) return(-1); } if((ptr=bump(ptr,q,"Open error")) < 0) return(-1); if(ch(ptr)== '!') { // first char in side bracket var->code= TagSingle; var->name[0]=0; // comments have no name while(ch(ptr) != '>') if((ptr=bump(ptr,q,"Comment error")) < 0) return(-1) ; if((ptr=bump(ptr,q,"Comment error")) < 0) return(-1) ; return(ptr); // Comments dropped } while( (ch(ptr) != '>') && !isalpha(ch(ptr))) { if(ch(ptr)=='/') var->code = TagClose; // TagClose name optional if((ptr=bump(ptr,q,"Bracket error"))< 0) return(-1); } while(isalpha(ch(ptr))) {// collect any name *text=ch(ptr); text++; if((ptr = bump(ptr,q,"Name error"))<0 acquired="" formal="" if="" name="" nulled="" or="" return="" tag="" text="0;" the="" var-="">name) var->code=TagSingle; //printf("\nSymbol %s %d",var->name,strlen(var->name)); while(ch(ptr) != '>') { // TagSkip to the gt bracket // if(ch(ptr) == '/') // we see these at the front and at the end of the < > pair // var->code=Tagclose; if((ptr=bump(ptr,q,"Close error")) <0 a="" adjusting="" ag="" against="" big="" bracket="" check="" chunk="" close="" code="" discovered="" here="" i="" if="" int="" just="" m-="" m="" mainly="" of="" p="tags;" pairs="" past="" points="" ptr="" return="" set_tag_ptr="" supress="" table="" tag="" tags="" the="" we="" word="">code == TagClose || m->code == TagSingle) return(m->code); // Nothing to adjust if(m->name) // find the html name in the book while(*p->name && strcmp(p->name,m->name) ) p++; if(!(*p->name)) { printf("\nMissing %s ",m->name); // Not really an error, soem tags just default m->code=TagSkip; // Then just dump it, and any enclosed blocks } else { m->code = p->code;// tag modified for text scraping //printf(" \nFind %s %d",p->name,p->code); } return(m->code); } // A place to 
stack up all the pointers marking // tags in the source // This is kept static with the source file as // a step and skip index for the source typedef struct { int loc; // index source int code; int count; // count of enclosed tags >=1 int link; // toparent } Stack; typedef Stack* PStack; Stack stack[40000]; // All the tags in one page int intodex=0; // ot a rolodex int outdex=0; // Simple push stack int push(int loc,int code,int link) { PStack s = stack+intodex; s->loc=(int) loc; s->code=code; intodex++; s->link = link; // the links used elsewhere return(intodex); } // debug utility for debug void debug(char * text,PStack ptr) { char str[10]; char * p = (char *) ptr->loc; if(ptr->loc) while(*p != '<') p--; strncpy(str,p,9); str[9]=0; printf(" %s %d %d %s \n",text,ptr->code,ptr->loc,str); } // // Block procesing done, there is a valid stack // TagSkip through the stack in DOM order, // the pointers back to the source take // you through each section, to ignore or collect. // uses relative pointers int emit(int offset,PInto io) { int Itag; int Iptr,Iloc; int q; int output; int k; Tag t; printf("Start Emit %d\n",intodex); Iptr=0; io->offset=0; for(Itag=0;Itag < intodex; Itag++) { if(stack[Itag].code == TagDone) return(0); //printf("Offset %d\n",io->offset); if( stack[Itag].code != TagSkip ) { Iloc= stack[Itag].loc; while(( Iptr >= 0) && (Iloc >= (io->offset+io->bufsize) ) ) Iptr = More(io,"Input flush"); Iptr= Iloc-io->offset; if(Iptr < 0) return(-1); //printf("\nEmit %d %d %d |",io->offset,Iptr,Iloc); while(io->buf[Iptr] != '<') { fputc(io->buf[Iptr],stdout); if((Iptr=bump(Iptr,io,"Emit error"))< 0) return(-1); } } } printf("Emit done"); return(0); } #define Open(code) (code != TagSingle) && (code != TagText) && (code != TagClose) && (code != TagDone) int process_block(int in) { int i=0; int nextcode,newcode; PStack ptr; //debug("Enter",ptr); // // process all blocks in nested order until parent close // We just need to mark the residual text // do { ptr = stack+in; 
//debug("Sequence",ptr); nextcode=(ptr+1)->code; if(Open(nextcode)) {// Is following tag an compound block i = process_block(in+1); // Descend for any nested open stack[in].link = i; // set the skip pointer if(i == intodex) { //debug("Finished",ptr); return(i); } } // Now at this point, pass singletons and skips if(ptr->code != TagSkip && ptr->code != TagSingle) ptr->code = TagEmit; // This statement is my entire language set, grab plain text if((ptr->code) == TagDone) { //debug("End of blocks",ptr); return(in); // done } //debug("Peek",ptr); if(nextcode == TagClose) {//Parent is closed return(in); // restart from the open tag } in++; } while(1); printf("Fall through Error\n"); return(in); } // A debugger utility void print_stack(PInto p) { int i; char text[10]; printf("stack\n"); for (i=0;i buf[stack[i].loc + p->offset],10); text[9]=0; printf("%d %d |%s|\n", stack[i].loc,stack[i].code,text); } } // // write should use relative pointers // then on read, use the end block to recover relative pointers // int write_stack(char * filename) { FILE* fout = fopen(filename,"wt"); int i; if(!fout) return(0); for(i=0; i < intodex;i++) fprintf(fout,"%d %d %d\n", stack[i].loc,stack[i].link,stack[i].code); fclose(fout); return(i); } // test utilit for standalone if you remove the no in nomain // p is a null terminated string of html char * Tagnames(int id); int nomain(char * p,PInto q) { int i; int src; char * start; src = 0; // src is an array index into the inut buf start=p; start=0; //disabled // find the tags, push them onto the stack // then process the stack in DOM order // them go through the stack and sections of the html // that need emitting are marked. 
for(i=0;i < NUMTAGS;i++) //printf("Tag: %s %s\n",tags[i].name,Tagnames(tags[i].code)); while(src >= 0){ Tag m; src = get_tag(&m,src,q); // src now points to close bracket //printf("\nTag %s %s src %d off %d", m.name,Tagnames(m.code),src,q->offset); if(src>=0) {// at this point, pointer into source need conversion to relative indexing push((src+q->offset),set_tag_ptr(&m),0); // pointer to html source and code } } printf("Blocks\n"); push(src,TagDone,0); process_block(0); if(fseek(q->fin,0, SEEK_SET)) printf("Seek error\n"); *q->buf = 0; q->offset=-10; q->count=q->bufsize; src=0; More(q,"Restart error"); printf("saught\n"); write_stack("test.hdx"); emit(0,q); return(0); } #define STANDALONE #ifdef STANDALONE int main(int argc, char * args[]) { int i; Into p; PInto q = &p; p.fin = fopen("small.html",/*args[1]*/"r"); if(!p.fin) return(1); p.buf = malloc(BUFSIZE+1); p.bufsize=BUFSIZE; p.buf[0]=0; p.count = p.bufsize; p.offset=-10; More(q,"Start error"); nomain(p.buf,q); return(0);} char * Tagnames(int id) { if(id == TagNull) return("Null"); if(id == TagEmit) return("Emit"); if(id == TagStop) return("Stop"); if(id == TagClose) return("Close"); if(id == TagText) return("Text"); if(id == TagSingle) return("Single"); if(id == TagSkip) return("Skip"); if(id == TagDone) return("Done"); return("Dunno"); } #else #include "cursor.h" int StepFetchTagSkip(int i, int offset, int method,char * key) { char * p; char * q; Tag t; while(i < intodex) { if(stack[i].code == TagDone) return(-1); if(stack[i].code!=TagSkip) { // we have plain tgext p= stack[i].loc + offset; q= stack[i+1].loc + offset; do {q--; } while(*q && *q != '<'); // q moved to open of post text bracket if(method == Fetch) strncpy(key,p,q-p); if(method && Skip) return(stack[i].link);//skip forward if(method && Step) return(i+1); return(i+1); // error? } else i++;// look at next tag } printf("Emit done"); return(0); } // this is the join interface which exports a step and skip // functionality for join. 
This code remains unused when // compiled as a standalon // // All dll attachments export the same two entry point // Exec and Eval, all the same format, all the same name, I think. // Scrape does not participate in eval, as near as I can tell // __declspec(dllexport) int __cdecl Eval(PCursor self,PCursor other) { printf("Eval got called\n"); return(Null); } // // The exec uses the stack, already indexed and marked, // as a step and skip guide int InitScraperCursor( PCursor self, void * args[]); __declspec(dllexport) int __cdecl Exec(PCursor self, int method,void * data) { int i= (int)self->current; int offset=(int)self->state; printf("Eval got called\n"); if(stack[i].code == TagDone) return(TagDone); if(method== Init) { printf("Scrape "); InitScraperCursor(self,data); printf("Initialize new cursor\n"); } else self->current= (void *) StepFetchTagSkip((int)self->current, offset, method,self->element.key); return(Null); } int InitScraperCursor( PCursor self,void* args[]) { int size; PInto ptr; char * src; printf("Init \n"); if(!*args) return(1); ptr= malloc(sizeof(Into)); ptr->fin = fopen(*args,"rb"); if(!ptr->fin) { printf("No file\n"); free(ptr); return(1); } args++; if(!*args) ptr->bufsize= BUFSIZE; else ptr->bufsize = atoi(*args); args++; if(ptr->bufsize==0) ptr->bufsize = BUFSIZE; // Zero not allowed ptr->buf=malloc(ptr->bufsize); *ptr->buf=0; self->start = (void *) ptr; self->prevOp = ','; More(ptr); nomain(self->current,ptr); return(0); } #endif 0>0>
Web text scraper, dll option with improved io, relative indexing
Complete web page text scraper. Add this link for Tags.c.
Subscribe to:
Posts (Atom)
No comments:
Post a Comment