/** @file htmlfind.c
 *  @brief searching and replacing HTML code in HTML files.
 *  @copyright (c) Turku PET Centre
 *  @author Vesa Oikonen
 */
/// @cond
/*****************************************************************************/
#include "tpcclibConfig.h"
/*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <dirent.h>
#include <sys/stat.h>
/*****************************************************************************/
#include "libtpcmisc.h"
/*****************************************************************************/
/* Global variables: options are set in main() while parsing the command line,
   and the counters are accumulated in html_recursive_find(). */
/** Replace mode: 0=only search, 1=find-replace */
int html_replace_mode=0;
/** Search is: 0=case-insensitive, 1=case-sensitive */
int html_case_sensitive=0;
/** Nr of HTML files in which at least one match was found */
int html_file_nr=0;
/** Total nr of matches found in all files */
int html_find_nr=0;
/*****************************************************************************/

/*****************************************************************************/
/** User help contents: array of text lines, terminated by a null pointer,
 *  that main() passes to tpcPrintUsage() and tpcHtmlUsage().
 *  NOTE(review): "@P" is presumably substituted with the program name by
 *  those functions; confirm in libtpcmisc.
 */
static char *info[] = {
  "Program for searching HTML and XHTML code in a specified file or in every",
  "file with extensions *.htm, *.html and *.xhtml that are found under the",
  "specified path. Optionally, the (X)HTML code fragment can be changed to",
  "the given text in every place where it is found.",
  " ",
  "Usage: @P [options] path_or_file search_text",
  " ",
  "Options:",
  " -replace <Substitute text>",
  "     Search text is replaced by specified text in every instance;",
  "     substitute must not contain certain special characters like '&'",
  " -replace-from-file <Filename>",
  "     Search text is replaced in every instance by the contents of",
  "     the specified ASCII text file, including new line characters;",
  "     text length must not exceed 2047 characters",  
  " -case[-sensitive]",
  "     Search is case-sensitive. By default, upper-and lowercase letters",
  "     are considered equal",
  " -stdoptions", // List standard options like --help, -v, etc
  " ",
  "Example 1:",
  "Command for finding out if and where the specified HTML file",
  "contains HTML code fragment '<a href=':",
  "     @P webpage.html \"<a href=\"",
  " ",
  "Example 2:",
  "Command for replacing a web address with another in all HTML",
  "files which can be found in the current directory and below it:",
  "     @P -replace www.new.fi . www.old.fi",
  " ",
  "Alternatively, in bash you can search a string in files using grep",
  "from files with certain extension, for example:",
  "     grep \"string_to_search\" . -R --include \"*.c\" ",
  " ",
  "Keywords: HTML, tools",
  0};
/*****************************************************************************/

/*****************************************************************************/
/* Turn on globbing (wildcard expansion) of the command line, since it is
   disabled by default in mingw-w64 (_dowildcard=0); in MinGW32 define
   _CRT_glob instead, if necessary.
   In Unix and Linux, wildcard command line processing is enabled by default. */
/*
#undef _CRT_glob
#define _CRT_glob -1
*/
int _dowildcard = -1;
/*****************************************************************************/

/*****************************************************************************/
/** Read everything that remains in an open stream into a newly allocated
 *  buffer.
\return Returns 0 when successful, -2 if memory could not be allocated, and
  -5 if reading from the stream failed.
 */
static int html_read_stream(
  /** Stream that has been opened for reading */
  FILE *fp,
  /** Receives the pointer to the contents; caller must free() it */
  unsigned char **data,
  /** Receives the number of bytes stored in data */
  size_t *size
) {
  unsigned char *buf=NULL, *tmp;
  size_t cap=0, used=0, got;

  *data=NULL; *size=0;
  do {
    if(used==cap) { /* buffer is full (or not yet allocated): enlarge it */
      size_t ncap=(cap>0) ? 2*cap : 65536;
      if(ncap<=cap) {free(buf); return(-2);} /* size_t overflow */
      tmp=realloc(buf, ncap);
      if(tmp==NULL) {free(buf); return(-2);}
      buf=tmp; cap=ncap;
    }
    got=fread(buf+used, 1, cap-used, fp);
    used+=got;
  } while(got>0);
  /* fread() returns 0 on both EOF and error; tell them apart */
  if(ferror(fp)) {free(buf); return(-5);}
  *data=buf; *size=used;
  return(0);
}

/** Update line and column bookkeeping for one end-of-line character. */
static void html_note_eol(
  /** The end-of-line character, either LF or CR */
  int c,
  /** Line number, incremented when a new line break starts */
  int *line,
  /** Column number, reset to zero */
  int *column,
  /** Previous end-of-line character that still waits for its pair, or 0 */
  int *prev_eol
) {
  *column=0;
  if((c=='\n' && *prev_eol=='\r') || (c=='\r' && *prev_eol=='\n')) {
    /* Second half of a CR+LF or LF+CR pair: already counted; mark the pair
       as consumed so that the next line break is counted again */
    *prev_eol=0;
  } else {
    (*line)++; *prev_eol=c;
  }
}

/** Test whether the search pattern starts at the beginning of given text.
 *  End-of-line characters inside the text are skipped in the comparison.
 *  Line and column bookkeeping is updated only when the match is found,
 *  and then it refers to the last character of the match.
\return Returns the number of text bytes covered by the match, including
  skipped end-of-line characters, or 0 if pattern does not match here.
 */
static size_t html_match_at(
  /** Text to compare; should start with a non-end-of-line character */
  const unsigned char *text,
  /** Number of bytes available in text */
  size_t avail,
  /** Search pattern; already in uppercase if fold is set */
  const char *pattern,
  /** Length of the pattern; must be at least 1 */
  size_t patlen,
  /** Convert text characters to uppercase before comparison (1) or not (0) */
  int fold,
  /** Line number before the text; updated on match */
  int *line,
  /** Column number before the text; updated on match */
  int *column,
  /** Pending end-of-line character before the text; updated on match */
  int *prev_eol
) {
  size_t i, k=0;
  int c, ln=*line, col=*column, eol=*prev_eol;

  for(i=0; i<avail; i++) {
    c=text[i];
    if(c=='\n' || c=='\r') {html_note_eol(c, &ln, &col, &eol); continue;}
    col++; eol=0;
    if(fold) c=toupper(c);
    /* Compare as unsigned char so that bytes above 0x7F can match too */
    if(c!=(unsigned char)pattern[k]) return(0);
    k++;
    if(k==patlen) {
      *line=ln; *column=col; *prev_eol=eol;
      return(i+1);
    }
  }
  return(0); /* text ended before the whole pattern was matched */
}

/** Check if searchstring (uppercase) is found in HTML file.
 *  This depends on global variables html_replace_mode and html_case_sensitive.
 *
 *  End-of-line characters in the file are ignored in the comparison, thus
 *  a match can extend over line breaks. The position of each match (line and
 *  column of its last character) is printed in stdout, unless verbose<0.
 *  In replace mode the edited contents are written in a temporary file
 *  (filename with added extension .bak), which replaces the original file
 *  only if at least one match was found. Line breaks inside a replaced match
 *  are removed with it; all other file contents are preserved.
 *  The whole file is read in memory.
\return Returns the number of found instances, or <0 in case of an error:
  -1 invalid input, -2 out of memory, -3 cannot open file, -4 cannot open
  temporary file, -5 cannot read file, -11 and -12 cannot write temporary
  file, -13 cannot close temporary file, -14 cannot rename temporary file.
 */
int html_find(
  /** Pointer to the filename string */
  char *htmlfile, 
  /** Pointer to the search string; must be in uppercase, if search is
   *  case-insensitive */
  char *searchstring, 
  /** Pointer to the string which replaces the search string; enter
   *  empty string to just delete the search string; not used unless
   *  html_replace_mode is 1 */
  char *replacestring,
  /** Verbose level */
  int verbose
) {
  FILE *fp;
  unsigned char *data=NULL;
  size_t size=0, pos=0, len, replen=0, used;
  int replace, fold, ret, line=1, column=0, prev_eol=0, found_nr=0;
  char tempfile[FILENAME_MAX];


  /* Check the input */
  if(htmlfile==NULL || searchstring==NULL) return(-1);
  if(strlen(htmlfile)<1) return(-1);
  if(verbose>1) fprintf(stdout, "html_find(%s, %s, %s)\n",
     htmlfile, searchstring, replacestring!=NULL ? replacestring : "(null)");

  /* If searchstring was not specified, then filename is shown in any case */
  len=strlen(searchstring);
  if(len<1) {
    if(verbose>=0) fprintf(stdout, "%s (1,1)\n", htmlfile);
    return(0);
  }

  /* Take a snapshot of the global settings */
  replace=(html_replace_mode==1);
  fold=(html_case_sensitive==0);
  if(replace) {
    if(replacestring==NULL) return(-1);
    replen=strlen(replacestring);
  }

  /* Read the HTML file */
  fp=fopen(htmlfile, "r"); if(fp==NULL) return(-3);
  ret=html_read_stream(fp, &data, &size);
  fclose(fp); fp=NULL;
  if(ret!=0) return(ret);

  /* Open the output HTML file */
  if(replace) {
    ret=snprintf(tempfile, sizeof(tempfile), "%s.bak", htmlfile);
    if(ret<0 || (size_t)ret>=sizeof(tempfile)) {free(data); return(-4);}
    fp=fopen(tempfile, "w");
    if(fp==NULL) {free(data); return(-4);}
  }

  /*
   *  Try to find the search string, starting from each file position in turn
   */
  ret=0;
  while(pos<size) {
    int c=data[pos];
    /* a match never starts with an end-of-line character; just copy it */
    if(c=='\n' || c=='\r') {
      html_note_eol(c, &line, &column, &prev_eol);
      if(replace && fputc(c, fp)==EOF) {ret=-12; break;}
      pos++; continue;
    }
    used=html_match_at(data+pos, size-pos, searchstring, len, fold,
                       &line, &column, &prev_eol);
    if(used>0) { /* Match was found */
      if(verbose>=0)
        fprintf(stdout, "%s (%d,%d)\n", htmlfile, line, column);
      found_nr++;
      if(replace && replen>0 && fwrite(replacestring, 1, replen, fp)!=replen) {
        ret=-11; break;
      }
      pos+=used; /* matches do not overlap */
    } else { /* No match; copy this character and retry from the next one */
      column++; prev_eol=0;
      if(replace && fputc(c, fp)==EOF) {ret=-12; break;}
      pos++;
    }
  }
  free(data);
  if(!replace) return(found_nr);

  /* Close the temporary file; buffered data may be written only now */
  if(fclose(fp)!=0 && ret==0) ret=-13;
  if(ret!=0) {remove(tempfile); return(ret);}
  if(found_nr==0) {remove(tempfile); return(0);}

  /* Existing file is removed first, because rename() does not overwrite
     an existing file on every platform */
  remove(htmlfile);
  /* On failure the temporary file is left in place, since it may now be
     the only copy of the contents */
  if(rename(tempfile, htmlfile)!=0) return(-14);
  if(verbose>0) printf("%d '%s'(s) substituted with '%s' in %s\n",
    found_nr, searchstring, replacestring, htmlfile);
  return(found_nr);
}
/*****************************************************************************/

/*****************************************************************************/
/** Check if searchstring (uppercase) is found in (X)HTM(L) file(s) that
 *  are found in the specified path including subdirectories.
 *  If the path is a file, then it is searched, if its extension is
 *  .htm, .html, or .xhtml (case-insensitive). Directory entries with names
 *  starting with '.' are ignored.
 *  This depends on global variables html_replace_mode and html_case_sensitive.
 *  Number of matches is added to global variable html_find_nr, and number
 *  of files with at least one match to html_file_nr.
\return Returns 0 when successful (whether or not anything was found),
  1 if input is invalid, 2 if a directory cannot be opened, 3 if the given
  path cannot be accessed, and 4 if a path name is too long.
 */
int html_recursive_find(
  /** Pointer to the search path */
  char *searchpath, 
  /** Pointer to the search string */
  char *searchstring,
  /** Pointer to the string which replaces the search string; enter
   *  empty string to just delete the search string */
  char *replacestring,
  /** Verbose level */
  int verbose
) {
  DIR *dp;
  struct dirent *de;
  struct stat fst;
  char tempname[FILENAME_MAX];
  int ret;
  size_t n;

  /* Check the input before anything is printed from it */
  if(searchpath==NULL || searchstring==NULL) return(1);
  if(strlen(searchpath)<1) return(1);
  if(verbose>1) printf("html_recursive_find(%s, %s, %s)\n",
    searchpath, searchstring, replacestring!=NULL ? replacestring : "(null)");
  
  /* Check whether searchpath is a directory */
  if(stat(searchpath, &fst)!=0) return(3);
  if(S_ISDIR(fst.st_mode)) { /* it is */
    if(verbose>3) printf("  %s is directory\n", searchpath);
    /* Open the directory */
    dp=opendir(searchpath); if(dp==NULL) return(2);
    /* Go through the directory */
    while((de=readdir(dp))!=NULL) {
      if(verbose>5) printf("d_name='%s'\n", de->d_name);
      if(de->d_name[0]=='.') continue; /* Ignore hidden and 'system' dirs */
      /* Combine path and name */
      ret=snprintf(tempname, sizeof(tempname), "%s/%s", searchpath, de->d_name);
      if(ret<0 || (size_t)ret>=sizeof(tempname)) {closedir(dp); return(4);}
      if(verbose>3) printf("name='%s'\n", tempname);
      /* Go for it (recursively) */
      ret=html_recursive_find(tempname, searchstring, replacestring, verbose);
      if(ret==3) {
        /* An entry that cannot be accessed (for example a broken link)
           must not stop the search in the rest of the directory */
        if(verbose>0)
          fprintf(stderr, "Warning: cannot access '%s'; skipped.\n", tempname);
        continue;
      }
      if(ret) {closedir(dp); return(ret);}
    }
    closedir(dp);
  } else { /* it is a file */
    /* Check if filename extension is .htm, .html, or .xhtml */
    n=strlen(searchpath);
    if((n>=5 && strcasecmp(searchpath+n-4, ".htm")==0) ||
       (n>=6 && strcasecmp(searchpath+n-5, ".html")==0) ||
       (n>=7 && strcasecmp(searchpath+n-6, ".xhtml")==0)) {
      /* it is, so search/replace in it */
      ret=html_find(searchpath, searchstring, replacestring, verbose);
      if(verbose>8) printf("  html_find(%s, %s)=%d\n",
        searchpath, searchstring, ret);
      if(ret>0) {
        html_file_nr++; html_find_nr+=ret;
      } else if(ret<0 && verbose>0) {
        /* Failure in one file does not stop the search, but user needs
           to know that the file was not processed */
        fprintf(stderr, "Warning: cannot process '%s' (%d).\n",
          searchpath, ret);
      }
    }
  }
  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/**
 *  Main: reads options and arguments from the command line, searches the
 *  given file or path for the search text, replacing it if requested, and
 *  prints a summary of the matches in stdout.
\return Returns 0 when successful, 1 if command line is invalid (or usage
  was printed because of missing arguments), and 2 if search failed.
 */
int main(int argc, char *argv[])
{
  int ai, help=0, version=0, verbose=1;
  unsigned int i; 
  int ret;
  char *cptr;
  char searchpath[FILENAME_MAX], searchstring[1024], replacestring[2048];


  /*
   *  Get arguments
   */
  if(argc==1) {tpcPrintUsage(argv[0], info, stderr); return(1);}
  searchpath[0]=searchstring[0]=replacestring[0]=(char)0;
  /* Options */
  for(ai=1; ai<argc; ai++) if(*argv[ai]=='-') {
    if(tpcProcessStdOptions(argv[ai], &help, &version, &verbose)==0) continue;
    cptr=argv[ai]+1;
    if(strcasecmp(cptr, "CASE-INSENSITIVE")==0) {
      html_case_sensitive=0; continue;
    } else if(strncasecmp(cptr, "CASE-SENSITIVE", 4)==0) {
      html_case_sensitive=1; continue;
    } else if(strcasecmp(cptr, "REPLACE")==0) {
      html_replace_mode=1;
      if(ai+1>=argc) {
        fprintf(stderr, "Error: missing text for option '%s'.\n", argv[ai]);
        return(1);
      }
      ai++;
      if(strlen(argv[ai])>=sizeof(replacestring)) {
        fprintf(stderr, "Error: too long substitute text.\n");
        return(1);
      }
      strcpy(replacestring, argv[ai]); continue;
    } else if(strcasecmp(cptr, "REPLACE-FROM-FILE")==0) {
      FILE *fp; int c;
      html_replace_mode=1;
      if(ai+1>=argc) {
        fprintf(stderr, "Error: missing filename for option '%s'.\n", argv[ai]);
        return(1);
      }
      ai++;
      fp=fopen(argv[ai], "r");
      if(fp==NULL) {
        fprintf(stderr, "Error: cannot open file '%s'.\n", argv[ai]);
        return(1);
      }
      if(verbose>1) printf("reading %s\n", argv[ai]);
      i=0; c=fgetc(fp);
      while(c!=EOF && i<sizeof(replacestring)-1) {
        replacestring[i++]=(char)c; c=fgetc(fp);
      }
      replacestring[i]=(char)0;
      /* Reading must have stopped at the end of file, not at the buffer limit */
      if(feof(fp)==0) {
        fprintf(stderr, "Error: too much contents in %s.\n", argv[ai]);
        fclose(fp); return(1);
      }
      fclose(fp);
      if(i==0) {
        fprintf(stderr, "Error: no contents in %s.\n", argv[ai]);
        return(1);
      }
      continue;
    }
    fprintf(stderr, "Error: invalid option '%s'\n", argv[ai]);
    return(1);
  } else break;
  
  /* Process other arguments, starting from the first non-option */
  for(; ai<argc; ai++) {
    if(!searchpath[0]) {
      if(strlen(argv[ai])>=sizeof(searchpath)) {
        fprintf(stderr, "Error: too long path name.\n");
        return(1);
      }
      strcpy(searchpath, argv[ai]); continue;
    } else if(!searchstring[0]) {
      if(strlen(argv[ai])>=sizeof(searchstring)) {
        fprintf(stderr, "Error: too long search text.\n");
        return(1);
      }
      strcpy(searchstring, argv[ai]); continue;
    }
    fprintf(stderr, "Error: invalid argument '%s'.\n", argv[ai]);
    return(1);
  }
  /* Print help or version? */
  if(help==2) {tpcHtmlUsage(argv[0], info, ""); return(0);}
  if(help) {tpcPrintUsage(argv[0], info, stdout); return(0);}
  if(version) {tpcPrintBuild(argv[0], stdout); return(0);}
  /* Is something missing? Path is given before the search text, therefore
     checking the search text covers both */
  if(!searchstring[0]) {
    tpcPrintUsage(argv[0], info, stdout); return(1);}
  if(html_replace_mode && !replacestring[0]) {
    tpcPrintUsage(argv[0], info, stdout); return(1);}

  /* In verbose mode print arguments and options */
  if(verbose>1) {
    printf("searchpath := %s\n", searchpath);
    printf("searchstring := %s\n", searchstring);
    printf("replacestring := %s\n", replacestring);
    printf("html_replace_mode := %d\n", html_replace_mode);
    printf("html_case_sensitive := %d\n", html_case_sensitive);
  }

  
  /*
   *  Convert the search string to uppercase,
   *  if search is case-insensitive     
   */
  if(html_case_sensitive==0)
    for(cptr=searchstring; *cptr!=(char)0; cptr++)
      /* toupper() requires a value in the range of unsigned char */
      *cptr=(char)toupper((unsigned char)*cptr);

  /*
   *  Search the HTML files recursively for the search string
   */
  ret=html_recursive_find(searchpath, searchstring, replacestring, verbose);
  if(ret) {
    fprintf(stderr, "Error in searching HTML files (%d).\n", ret);
    return(2);
  }
  if(html_find_nr>0)
    fprintf(stdout, "Search text was found %d times in %d file(s).\n",
      html_find_nr, html_file_nr);
  else
    fprintf(stdout, "Search text was not found.\n");

  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/// @endcond
