#include <stdio.h>      /* printf, fprintf, sprintf */
#include <stdlib.h>     /* malloc, realloc, free */
#include <string.h>     /* strstr, strcpy, memset, ... */
#include <unistd.h>     /* read, write, close */

#define MAX_STRING 1024
#define MAX_REF    1024
#define debug      0

typedef struct {
    char absURL[MAX_STRING];
    int  transfer_type;
    int  location;
    int  file_size;
    int  num_out_links;
    int  num_internal_links;
    int  num_in_links;
} URLinfo;

int Spider(char *host, char *startpage);
int readPage(int sock, char **page, int bufsize);
int fillHREF(URLinfo href[], char *tempURL, int hrefNum, char *host,
             int currentHREF);
int openConnection(char *host);     /* stand-in connect helper, sketched at
                                       the end of this file */

/* ------------------------------------------------------------------
   main(): send requests to get pages recursively and display
           information regarding the links on each page.
   arguments: argc - number of arguments on the command line
              argv - the arguments on the command line
   ------------------------------------------------------------------ */
int main(int argc, char *argv[])
{
    char *host = "www2.sis.pitt.edu";   /* default host */
    char *startpage = "/index.html";    /* default starting page */

    switch (argc) {
    case 3:             /* host name and starting page file provided */
        startpage = argv[2];
        /* fall through */
    case 2:             /* only the host name provided */
        host = argv[1];
        break;
    case 1:
    default:            /* use the defaults */
        fprintf(stderr, "usage: spider [webserver [start page]]\n");
        fprintf(stderr, "searching %s starting at %s\n", host, startpage);
    }

    Spider(host, startpage);
    return 0;
}

/* --------------------------------------------------------------------
   Spider(): connect to the host, send requests to get pages
             recursively, and display information regarding the links
             on each page.
   arguments: host      - the web server
              startpage - the first page the spider requests from the host
   return   : number of links found, or -1 if failed
   --------------------------------------------------------------------- */
int Spider(char *host, char *startpage)
{
    /* array of struct URLinfo: holds all hrefs seen so far */
    URLinfo href[MAX_REF];
    char tempURL[MAX_STRING];   /* stores a URL temporarily */
    char *startRef, *stopRef;   /* beginning and end of a URL */
    char *startrefuc;           /* beginning of an upper-case "HREF" */
    char *startreflc;           /* beginning of a lower-case "href" */
    char *page;                 /* the web page: dynamically allocated,
                                   freed at the end */
    int sock;
    int bufsize = 9600;         /* initial size of the buffer for the page */
    int pageSize;               /* size of the page returned by the server */
    int hrefNum = 0;            /* number of links found so far */
    int start_hrefnum;          /* number of links before getting a new page */
    int currentHREF = 0;        /* index of the page currently being scanned */
    int internal;               /* number of '#' fragment links on this page */
    int i;
    char requestBuf[MAX_STRING];

    memset(href, 0, sizeof(href));

    /* construct a request: HERE IS a typical header from a client --
       in this case netscape.  all that is actually needed is the first
       line followed by two CRLF ("\r\n") sequences -- that is

           GET FILENAME HTTP/1.0

           Connection: Keep-Alive
           User-Agent: Mozilla/4.04 [en] (X11; I; SunOS 5.6 sun4u)
           Pragma: no-cache
           Host: icarus.sis.pitt.edu:5127
           Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg
           Accept-Language: en
           Accept-Charset: iso-8859-1,*,utf-8
    */

    /* record the starting page in href[0] */
    sprintf(href[0].absURL, "http://%s%s", host, startpage);
    href[0].location = 0;           /* local */
    href[0].transfer_type = 0;      /* http */
    hrefNum++;

    /* allocate space for the page dynamically */
    page = (char *)malloc(bufsize);
    if (page == NULL)
        return -1;

    /* loop through the href array, fetching and scanning each page */
    while ((hrefNum < MAX_REF) && (currentHREF < hrefNum)) {

        /* only local http pages are fetched; external and non-http
           links are recorded but not followed */
        if ((href[currentHREF].location != 0) ||
            (href[currentHREF].transfer_type != 0)) {
            currentHREF++;
            continue;
        }

        /* the GET line plus a blank line is all the server needs (see
           the example header above); the connection itself is made by
           openConnection(), a stand-in helper sketched at the end of
           this file */
        sock = openConnection(host);
        if (sock < 0) {
            currentHREF++;
            continue;
        }
        sprintf(requestBuf, "GET %s HTTP/1.0\r\n\r\n",
                strstr(href[currentHREF].absURL, host) + strlen(host));
        write(sock, requestBuf, strlen(requestBuf));

        pageSize = readPage(sock, &page, bufsize);
        close(sock);
        if (pageSize < 0) {
            free(page);
            return -1;
        }

        start_hrefnum = hrefNum;
        internal = 0;

        /* scan the page for HREF/href attributes */
        stopRef = page;
        while (hrefNum < MAX_REF) {
            startrefuc = strstr(stopRef, "HREF");
            startreflc = strstr(stopRef, "href");
            if ((startrefuc == NULL) && (startreflc == NULL))
                break;

            /* take whichever spelling occurs first on the page */
            if (startrefuc == NULL)
                startRef = startreflc;
            else if (startreflc == NULL)
                startRef = startrefuc;
            else
                startRef = (startreflc > startrefuc) ? startrefuc : startreflc;
            startRef += 4;      /* move past "HREF" or "href" */

            memset(tempURL, 0, sizeof(tempURL));

            /* move startRef to point to the actual URL */
            while ((*startRef == ' ') || (*startRef == '=') ||
                   (*startRef == '"') || (*startRef == '\t') ||
                   (*startRef == '\n'))
                startRef++;

            /* find the first character following the actual URL, which
               is one of the characters in the 2nd argument */
            stopRef = strpbrk(startRef, " \"\t\n");
            if (stopRef == NULL)                    /* URL ends the page */
                stopRef = startRef + strlen(startRef);
            if (stopRef - startRef >= MAX_STRING)   /* don't overflow tempURL */
                stopRef = startRef + MAX_STRING - 1;
            strncpy(tempURL, startRef, stopRef - startRef);

            if (tempURL[0] == '#') {    /* fragment within this same page */
                printf("Internal link: %s\n", tempURL);
                internal++;
                continue;
            }

            /* fill in the information on this URL in the href struct and
               increment hrefNum if it is a new link */
            hrefNum += fillHREF(href, tempURL, hrefNum, host, currentHREF);
        }   /* end of while (hrefNum < MAX_REF): links on one page */

        /* record what was learned about this page */
        href[currentHREF].file_size = pageSize;
        href[currentHREF].num_out_links = hrefNum - start_hrefnum;
        href[currentHREF].num_internal_links = internal;

        currentHREF++;
    }   /* end of the loop through the href array */

    free(page);
    return hrefNum;
}

/* ------------------------------------------------------------------------
   readPage(): read a whole page from the socket into *pPage, doubling the
               buffer with realloc() whenever it becomes more than half full.
   arguments: sock    - the connected socket to read from
              pPage   - address of the dynamically allocated page buffer
              bufsize - current size of that buffer
   return   : total number of bytes read, or -1 if failed
   ----------------------------------------------------------------------- */
int readPage(int sock, char **pPage, int bufsize)
{
    int totalRead = 0;
    int amt_read;

    memset(*pPage, 0, bufsize);
    while ((amt_read = read(sock, *pPage + totalRead, bufsize - totalRead)) > 0) {
        totalRead += amt_read;
        if (totalRead > (bufsize / 2)) {
            if ((*pPage = (char *)realloc(*pPage, bufsize * 2)) == NULL) {
                printf("Memory allocation failure \n");
                return -1;
            }
            bufsize *= 2;
            memset(*pPage + totalRead, 0, bufsize - totalRead);
        }
    }
    if (debug)
        printf("Total read = %d; Last read = %d \n", totalRead, amt_read);
    if (amt_read == 0)
        return totalRead;
    else
        return -1;
}

/* ------------------------------------------------------------------------
   fillHREF(): fill in the href struct for tempURL if it is a new link.
   arguments: href[]      - array of URLinfo structs
              tempURL     - the URL parsed from the page
              hrefNum     - the number of hrefs currently in the array
              host        - host name
              currentHREF - the array index of the page being scanned
   return   : 1 if tempURL is a new link, else 0.
   ----------------------------------------------------------------------- */
int fillHREF(URLinfo href[], char *tempURL, int hrefNum, char *host,
             int currentHREF)
{
    int transfer_type, location;
    char *startpage;
    char *endDir;
    char URL[MAX_STRING];       /* normalized URL */
    int i;

    memset(URL, 0, sizeof(URL));
    startpage = href[0].absURL;

    if (strstr(tempURL, "://") != NULL) {       /* absolute URL */
        /* determine the transfer type */
        if (strstr(tempURL, "http://") != NULL)         /* http */
            transfer_type = 0;
        else if (strstr(tempURL, "ftp://") != NULL)     /* ftp */
            transfer_type = 1;
        else                                            /* other */
            transfer_type = 2;

        /* determine the location */
        if (strstr(tempURL, host))      /* local */
            location = 0;
        else                            /* external */
            location = 1;

        /* copy tempURL to URL as-is */
        strcpy(URL, tempURL);
    } else {    /* tempURL is a local, relative path */
        location = 0;
        transfer_type = 0;

        if (href[currentHREF].absURL[strlen(href[currentHREF].absURL) - 1] == '/') {
            /* the current page is a directory: append the relative path,
               e.g. http://host/dir/ + "a.html" -> http://host/dir/a.html */
            strcpy(URL, href[currentHREF].absURL);
            /* strcat(URL, "/"); */
            strcat(URL, tempURL);
        } else {
            /* the current page names a file: strip the file name and
               append the relative path, e.g.
               http://host/dir/b.html + "a.html" -> http://host/dir/a.html */
            endDir = strrchr(href[currentHREF].absURL, '/');
            strncpy(URL, href[currentHREF].absURL,
                    endDir - href[currentHREF].absURL);
            if (tempURL[0] != '/')      /* if the URL doesn't start with '/' */
                strcat(URL, "/");
            strcat(URL, tempURL);
        }
    }

    /* has this URL been seen before? */
    for (i = 0; i < hrefNum; i++) {
        if (strcmp(href[i].absURL, URL) == 0) {     /* existing link */
            href[i].num_in_links++;
            break;
        }
    }
    if (i < hrefNum)        /* not a new link */
        return 0;

    /* now URL is a new, absolute URL: copy it into href[] */
    strcpy(href[hrefNum].absURL, URL);
    href[hrefNum].transfer_type = transfer_type;
    href[hrefNum].location = location;
    printf("%4d. %s\n", hrefNum, href[hrefNum].absURL);
    return 1;
}
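/*
 * openConnection() is referenced from Spider() above as a stand-in for the
 * step that actually opens the TCP connection to the web server.  The
 * version below is only a minimal sketch of what such a helper could look
 * like: it assumes the server listens on the standard http port 80 and uses
 * the classic gethostbyname()/socket()/connect() sequence.  The function
 * name, the port number, and the error handling are assumptions rather than
 * details taken from the spider itself.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>

int openConnection(char *host)
{
    struct hostent *hp;             /* resolved host entry */
    struct sockaddr_in server;      /* server address */
    int sock;

    if ((hp = gethostbyname(host)) == NULL)     /* look up the host name */
        return -1;

    if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)   /* TCP socket */
        return -1;

    memset(&server, 0, sizeof(server));
    server.sin_family = AF_INET;
    server.sin_port = htons(80);    /* standard http port (assumed) */
    memcpy(&server.sin_addr, hp->h_addr_list[0], hp->h_length);

    if (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
        close(sock);
        return -1;
    }
    return sock;
}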
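/*
 * Example build and run, assuming the listing above is saved as spider.c
 * (the compiler line is one typical invocation, not a prescribed one):
 *
 *     cc -o spider spider.c
 *     ./spider www2.sis.pitt.edu /index.html
 *
 * Run with no arguments, the program prints its usage line and then
 * searches the default host www2.sis.pitt.edu starting at /index.html.
 */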